In [None]:
import pandas as pd


In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive so files
# stored there can be opened by path.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import chardet

csv_path = '/content/drive/MyDrive/NLP/small.csv'

# Guess the file's text encoding from its raw bytes. The entire file is read
# into memory for this detection pass.
with open(csv_path, 'rb') as f:
    result = chardet.detect(f.read())
# NOTE(review): the detected encoding is a heuristic guess; mis-decoded
# characters in the loaded text would point to a wrong guess here -- confirm.
encoding = result['encoding']

# Parse the CSV into a DataFrame using the detected encoding.
df = pd.read_csv(csv_path, encoding=encoding)

In [None]:
# Select the 'abstract' column of df (a single column, despite the _df suffix).
abstract_df = df['abstract']


In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string


In [None]:
# Fetch the NLTK stopword lists read by preprocess_text via stopwords.words('english').
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import nltk
# Fetch the 'punkt' tokenizer data -- presumably what nltk.word_tokenize needs; verify.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def preprocess_text(text):
    """Normalize one piece of text into a space-joined string of content words.

    Steps, in order: lowercase, strip ASCII punctuation
    (``string.punctuation``), drop digit characters, tokenize with
    ``nltk.word_tokenize``, remove English stopwords, and join the remaining
    tokens with single spaces.

    Args:
        text: The raw text. Any non-string value (such as ``None`` or a float
            NaN standing in for a missing cell) is treated as empty text.

    Returns:
        The preprocessed text as one string; ``''`` for non-string input.
    """
    # Missing values are not strings and have no .lower(); treat them as empty
    # text instead of raising AttributeError in the middle of a column-wide apply.
    if not isinstance(text, str):
        return ''

    text = text.lower()  # convert text to lowercase

    # remove ASCII punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # remove digit characters
    text = ''.join(char for char in text if not char.isdigit())

    tokens = nltk.word_tokenize(text)  # tokenize the text

    # The stopword set never changes between calls, so build it once and keep
    # it on the function object instead of reloading it for every row.
    stopwords_set = getattr(preprocess_text, '_stopwords_cache', None)
    if stopwords_set is None:
        stopwords_set = set(stopwords.words('english'))
        preprocess_text._stopwords_cache = stopwords_set

    # remove stopwords
    filtered_tokens = [token for token in tokens if token not in stopwords_set]

    # join the tokens back into a string
    return ' '.join(filtered_tokens)


In [None]:
# Run preprocess_text over every value of the 'abstract' column and store the
# cleaned strings in a new 'Preprocessed Abstract' column.
df['Preprocessed Abstract'] = df['abstract'].apply(preprocess_text)


In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenize every preprocessed abstract, then echo each row's token list.
df['Preprocessed Tokens'] = df['Preprocessed Abstract'].apply(word_tokenize)
for index, row_tokens in df['Preprocessed Tokens'].items():
    print(f"Sentence {index+1} Tokens: {row_tokens}")


Sentence 1 Tokens: ['study', 'two', 'dairy', 'compost', 'heaps', 'one', 'vegetable', 'scrap', 'compost', 'heap', 'constructed', 'two', 'research', 'farms', 'clemson', 'sc', 'samples', 'taken', 'fr', 'om', 'ach', 'heap', 'different', 'locations', 'elected', 'intervals', 'analyzed', 'enterococ', 'ci', 'vre', 'counts', 'spread', 'plating', 'bile', 'esculin', 'agar', 'bea', 'bea', 'containing', '¬gml', 'vancomycin', 'respectively', 'initial', 'populations', 'enterococci', 'vre', 'compost', 'range', 'log', 'cfug', 'respectively', 'days', 'active', 'composting', 'average', 'vre', 'populations', 'declined', 'ca', 'logs', 'top', 'center', 'bottom', 'surface', 'locations', 'heaps', 'respectively', 'whereas', 'enterococci', 'populations', 'declined', 'logs', 'respectively', 'two', 'dairy', 'compost', 'trials', 'temperatures', 'top', 'center', 'bottom', 'locations', 'excess', 'øc', 'ƒ', 'days', 'respectively', 'whereas', 'temperatures', 'surface', 'samples', 'never', 'exceeded', 'øc', 'thermophil

In [None]:
import nltk
# NLTK data packages fetched before tagging/chunking -- presumably the
# resources pos_tag and ne_chunk load at call time; verify.
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

from transformers import AutoTokenizer, T5ForConditionalGeneration
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

# Load the pretrained 't5-base' tokenizer and conditional-generation model.
model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Tokenize the preprocessed abstracts into a 'Preprocessed Tokens' column
# (an earlier cell performs the same assignment; this recomputes it).
df['Preprocessed Tokens'] = df['Preprocessed Abstract'].apply(word_tokenize)

def extract_keywords(sentence):
    """Collect keyword candidates from a tokenized sentence.

    The tokens are POS-tagged with ``pos_tag`` and chunked with ``ne_chunk``.
    From the chunked result this keeps, in order of appearance:

    * the first word of every PERSON or ORGANIZATION chunk, and
    * every word tagged as a noun (NN, NNS, NNP, NNPS), including nouns that
      sit inside a chunk of any other entity type.

    Args:
        sentence: A list of token strings.

    Returns:
        A list of keyword strings. Duplicates are kept.
    """
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    entity_labels = ('PERSON', 'ORGANIZATION')

    keywords = []
    for node in ne_chunk(pos_tag(sentence)):
        if hasattr(node, 'label'):
            if node.label() in entity_labels:
                # extracting person / organization names (first word of the chunk)
                keywords.append(node[0][0])
            else:
                # Any other entity chunk is still a subtree, not a
                # (word, tag) pair. Indexing it with [1] either raises
                # IndexError (one-word chunk) or yields a tuple that never
                # matches a tag, so inspect its leaves for nouns instead.
                keywords.extend(
                    word for word, tag in node.leaves() if tag in noun_tags
                )
        elif node[1] in noun_tags:  # extracting nouns
            keywords.append(node[0])

    return keywords

def calculate_score(title, keywords):
    """Return the fraction of distinct keywords that occur as tokens of the title.

    Args:
        title: The title string; it is tokenized with ``word_tokenize``.
        keywords: An iterable of keyword strings. Duplicates count once.

    Returns:
        ``len(title tokens ∩ keywords) / len(distinct keywords)`` as a float,
        or ``0.0`` when there are no keywords.
    """
    keyword_tokens = set(keywords)
    # With no keywords the ratio is undefined; report 0.0 rather than raising
    # ZeroDivisionError and aborting the caller's loop.
    if not keyword_tokens:
        return 0.0

    title_tokens = set(word_tokenize(title))
    return len(title_tokens & keyword_tokens) / len(keyword_tokens)

# Running totals for the summary printed after the loop.
tot_score = 0
num_sen = 0

for index, row in df.iterrows():
    abstract = row['Preprocessed Abstract']
    extracted_keywords = extract_keywords(row['Preprocessed Tokens'])

    # Prompt the model with a "summarize: " prefix. The encoded input is
    # truncated at max_length=512; generation uses 4 beams and max_length=50.
    inputs = tokenizer.encode("summarize: " + abstract, truncation=True, max_length=512, return_tensors="pt")
    outputs = model.generate(inputs, max_length=50, num_beams=4, early_stopping=True)
    generated_title = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keyword coverage of the generated title.
    score = calculate_score(generated_title, extracted_keywords)

    # NOTE(review): index+1 assumes the DataFrame index is numeric -- confirm.
    print(f"Sentence {index+1} Extracted Keywords: {extracted_keywords}")
    print(f"Sentence {index+1} Generated Title: {generated_title}")
    print(f"Sentence {index+1} Score: {score}")
    tot_score += score
    num_sen += 1

# An empty DataFrame leaves num_sen at 0; report 0.0 instead of dividing by zero.
avg_score = tot_score / num_sen if num_sen else 0.0

print(f"Total Score: {tot_score}")
print(f"Average Score: {avg_score}")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Sentence 1 Extracted Keywords: ['study', 'dairy', 'compost', 'scrap', 'compost', 'research', 'farms', 'sc', 'samples', 'ach', 'locations', 'intervals', 'ci', 'vre', 'counts', 'bile', 'esculin', 'agar', 'bea', 'bea', '¬gml', 'vancomycin', 'populations', 'vre', 'compost', 'range', 'log', 'cfug', 'days', 'populations', 'center', 'bottom', 'surface', 'locations', 'enterococci', 'populations', 'logs', 'dairy', 'compost', 'trials', 'temperatures', 'center', 'locations', 'ƒ', 'days', 'temperatures', 'samples', 'øc', 'activity', 'ph', 'production', 'ammonia', 'moisture', 'content', 'heaps', 'composting', 'trials']
Sentence 1 Generated Title: two dairy compost trials temperatures top center bottom locations excess c  days respectively whereas temperatures surface samples never exceeded c thermophilic composting microbial activity slightly reduced ph began rise slightly due production ammonia
Sentence 1 Score: 0.35714285714285715
Sentence 2 Extracted Keywords: ['describe', 'groups', 'terms', 'gr

In [None]:
# One token list per row, in DataFrame order.
keywords_list = df['Preprocessed Tokens'].tolist()

# Echo each row's tokens with a 1-based sentence number.
for index, keywords in enumerate(keywords_list):
    print(f"Sentence {index+1} Keywords: {keywords}")


Sentence 1 Keywords: ['study', 'two', 'dairy', 'compost', 'heaps', 'one', 'vegetable', 'scrap', 'compost', 'heap', 'constructed', 'two', 'research', 'farms', 'clemson', 'sc', 'samples', 'taken', 'fr', 'om', 'ach', 'heap', 'different', 'locations', 'elected', 'intervals', 'analyzed', 'enterococ', 'ci', 'vre', 'counts', 'spread', 'plating', 'bile', 'esculin', 'agar', 'bea', 'bea', 'containing', '¬gml', 'vancomycin', 'respectively', 'initial', 'populations', 'enterococci', 'vre', 'compost', 'range', 'log', 'cfug', 'respectively', 'days', 'active', 'composting', 'average', 'vre', 'populations', 'declined', 'ca', 'logs', 'top', 'center', 'bottom', 'surface', 'locations', 'heaps', 'respectively', 'whereas', 'enterococci', 'populations', 'declined', 'logs', 'respectively', 'two', 'dairy', 'compost', 'trials', 'temperatures', 'top', 'center', 'bottom', 'locations', 'excess', 'øc', 'ƒ', 'days', 'respectively', 'whereas', 'temperatures', 'surface', 'samples', 'never', 'exceeded', 'øc', 'thermoph

In [None]:
# Preview up to the first 28 preprocessed abstracts as plain text.
print(df['Preprocessed Abstract'].head(28))


0     study two dairy compost heaps one vegetable sc...
1     describe geometric noncommutative formal group...
2     numerical solution partial differential equati...
3     qualitative case study hardly research techniq...
4     resumen en la actualidad est­n apareciendo un ...
5     theory functional discourse grammar assumes st...
6     software maintenance reuse require identificat...
7     article provides principles practice interfere...
8     many optimization techniques invented reduce n...
9     introduction problems faced cartilage grafts r...
10    recent paper l bokut v v chaynikov k p shum br...
11    threat soft error induced system failure compu...
12    ac transport losses multifilamentary bscco sil...
13    mg classic queueing system extended many autho...
14    edge systems reveal properties disk galaxies f...
15    analysis given model semimetal isotropic elect...
16    twin soii samples coliected sampling points ei...
17    nous tudions un modle de communication ÿ p

In [None]:
def calculate_sentence_score(sentence, keywords):
    """Score a sentence as (number of keyword hits) + (number of words).

    Args:
        sentence: A string; it is split on whitespace into words.
        keywords: A container tested with ``in`` for each word.

    Returns:
        An integer: the count of words found in ``keywords`` plus the total
        word count of the sentence.
    """
    words = sentence.split()
    # Each word that is a member of `keywords` contributes one point.
    keyword_hits = sum(word in keywords for word in words)
    # The sentence length in words is added on top of the keyword hits.
    return keyword_hits + len(words)
# NOTE(review): this prints the notebook-level `score` variable, not a result
# of calculate_sentence_score (which is defined above but never called here).
# The value shown depends on whichever cell last assigned `score` -- confirm intent.
print(score)

254.05263157894737
