In [1]:
# NLTK supplies the tokenizers and the English stop-word list used by generate_summary.
import nltk
# Fetch the tokenizer models and the stop-word corpus; when they are already
# installed the call only reports that the package is up to date.
nltk.download('punkt')
nltk.download('stopwords')
from collections import Counter
from nltk.corpus import stopwords
# ROUGE metric loader from the Hugging Face `datasets` package.
from datasets import load_metric
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\17327\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\17327\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def _content_words(text, stop_words):
    """Return the lowercased alphanumeric tokens of ``text`` that are not stop words."""
    return [
        token.lower()
        for token in word_tokenize(text)
        if token.isalnum() and token.lower() not in stop_words
    ]


def generate_summary(text, n, max_sentence_words=20):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Every sentence is scored by summing, over its content words (lowercased,
    alphanumeric, non-stop-word tokens), how often each word occurs in the
    whole text.  Sentences with ``max_sentence_words`` or more content words
    are excluded from the ranking.

    Args:
        text: The document to summarize.
        n: Maximum number of sentences to keep.  ``None`` keeps every eligible
            sentence; zero or a negative value yields an empty summary.
        max_sentence_words: Exclusive upper bound on the number of content
            words a sentence may have to be eligible (default 20).

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score (not in document order).  Equal scores keep their
        document order.  Identical sentences are counted once.
    """
    # A negative ``n`` would make the ``[:n]`` slice drop the lowest-ranked
    # sentences instead of limiting the summary, so treat it as "nothing".
    if n is not None and n <= 0:
        return ''

    stop_words = set(stopwords.words('english'))
    word_freq = Counter(_content_words(text, stop_words))

    # Keyed by sentence text; dict insertion order is document order, which the
    # stable sort below relies on for tie-breaking.
    sentence_scores = {}
    for sentence in sent_tokenize(text):
        sentence_words = _content_words(sentence, stop_words)
        if len(sentence_words) < max_sentence_words:
            sentence_scores[sentence] = sum(word_freq[word] for word in sentence_words)

    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:n]
    return ' '.join(top_sentences)

In [3]:
# Sample post used as the worked example for generate_summary and the ROUGE scoring below.
text = """Recently, my fiance (20 m) and I (19f) moved into a new apartment with a mutual friend (20m) and 
somehow contracted scabies (don't know how). We've both been itchy af and have been to the doctor who confirmed 
that it was scabies for the both of us. Our room mate (20m) has not had symptoms of scabies bites appear yet but 
I have asked him to get treated as well and to treat his clothes and linen so that our apartment does not get 
reinfested after treatment.My room mate refuses to buy the lotion needed to kill the mites on his skin 
(if there are any on him) and refuses to rewash and dry his linen and clothes. I'm scared that if he does not get 
treated the infestation of our apartment will not go away. I'm almost there to asking him to move out if he refuses 
treatment. He is not on the lease."""

In [14]:
# Summarize the sample post with its five highest-scoring sentences.
summary = generate_summary(text, 5)
# Collapse line breaks (and any other whitespace runs) into single spaces.
# Deleting '\n' outright would fuse two words separated only by a line break.
# The metric takes a list of predictions, hence the one-element list.
formatted_summary = [' '.join(summary.split())]
print(formatted_summary)

["Recently, my fiance (20 m) and I (19f) moved into a new apartment with a mutual friend (20m) and somehow contracted scabies (don't know how). I'm scared that if he does not get treated the infestation of our apartment will not go away. We've both been itchy af and have been to the doctor who confirmed that it was scabies for the both of us. I'm almost there to asking him to move out if he refuses treatment. He is not on the lease."]


In [15]:
# Load the ROUGE metric once; the cells below reuse this object for every comparison.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("rouge")

In [16]:
# Reference summary for the sample post, held in a one-element list so it lines up
# with the one-element `formatted_summary` predictions list.
reference_summary = ["Fiance and I recently got infected with scabies. Room mate refuses to get treated and our apartment will not go away. I'm afraid he will leave if he doesn't. Should I ask him to leave?"]  # one reference per prediction

# Score the generated summary against the reference.
results = metric.compute(predictions=formatted_summary, references=reference_summary)

{'rouge1': AggregateScore(low=Score(precision=0.29545454545454547, recall=0.7027027027027027, fmeasure=0.416), mid=Score(precision=0.29545454545454547, recall=0.7027027027027027, fmeasure=0.416), high=Score(precision=0.29545454545454547, recall=0.7027027027027027, fmeasure=0.416)), 'rouge2': AggregateScore(low=Score(precision=0.11494252873563218, recall=0.2777777777777778, fmeasure=0.1626016260162602), mid=Score(precision=0.11494252873563218, recall=0.2777777777777778, fmeasure=0.1626016260162602), high=Score(precision=0.11494252873563218, recall=0.2777777777777778, fmeasure=0.1626016260162602)), 'rougeL': AggregateScore(low=Score(precision=0.19318181818181818, recall=0.4594594594594595, fmeasure=0.272), mid=Score(precision=0.19318181818181818, recall=0.4594594594594595, fmeasure=0.272), high=Score(precision=0.19318181818181818, recall=0.4594594594594595, fmeasure=0.272)), 'rougeLsum': AggregateScore(low=Score(precision=0.19318181818181818, recall=0.4594594594594595, fmeasure=0.272), m

In [17]:
# Score the worked example and keep the mid-point F-measure of each ROUGE variant.
results = metric.compute(predictions=formatted_summary, references=reference_summary)

# One-element lists so each score becomes a single-row DataFrame column.
rouge1 = [results["rouge1"].mid.fmeasure]
rouge2 = [results["rouge2"].mid.fmeasure]
rougeL = [results["rougeL"].mid.fmeasure]

# `text` is a plain string and is broadcast across the single row.
sample_columns = {
    "text": text,
    "summary": reference_summary,
    "summary_pred": formatted_summary,
    "rouge1": rouge1,
    "rouge2": rouge2,
    "rougeL": rougeL,
}
df_results_sample = pd.DataFrame(sample_columns)

df_results_sample

Unnamed: 0,text,summary,summary_pred,rouge1,rouge2,rougeL
0,"Recently, my fiance (20 m) and I (19f) moved i...",Fiance and I recently got infected with scabie...,"Recently, my fiance (20 m) and I (19f) moved i...",0.416,0.162602,0.272


In [18]:
# Load the evaluation set; later cells read its 'text' and 'summary' columns.
test_data = pd.read_csv('test_data.csv')
test_data.head()

Unnamed: 0,text,summary
0,"First things first - yes we're teenagers, youn...",Two months into relationship and I found out ...
1,"My partner's cousin, E, often invites us over ...",Partner's cousin and her husband frequently i...
2,It was my 21st birthday. My gf went with me to...,"Had drunken sex with my gf, only to find out ..."


In [19]:
# Source posts as a plain list.  `tolist()` is positional, so this does not
# depend on the DataFrame having the default 0..N-1 index labels.
input_text = test_data['text'].tolist()

# Five-sentence extractive summary per post.  Line breaks (and other whitespace
# runs) are collapsed to single spaces; deleting '\n' outright would fuse words
# separated only by a line break and distort the ROUGE n-gram counts.
summary = [' '.join(generate_summary(post, 5).split()) for post in input_text]

rouge1 = []
rouge2 = []
rougeL = []

# Score each prediction against its reference and keep the mid-point F-measure
# of every ROUGE variant.
for prediction, reference in zip(summary, test_data['summary']):
    results = metric.compute(predictions=[prediction], references=[reference])
    rouge1.append(results["rouge1"].mid.fmeasure)
    rouge2.append(results["rouge2"].mid.fmeasure)
    rougeL.append(results["rougeL"].mid.fmeasure)

# One row per post: source text, reference summary, generated summary, scores.
df_results = pd.DataFrame({"text": input_text,
                           "summary": test_data['summary'],
                           "summary_pred": summary,
                           "rouge1": rouge1,
                           "rouge2": rouge2,
                           "rougeL": rougeL})

df_results

Unnamed: 0,text,summary,summary_pred,rouge1,rouge2,rougeL
0,"First things first - yes we're teenagers, youn...",Two months into relationship and I found out ...,But I have recently been told that the girl I ...,0.283688,0.172662,0.198582
1,"My partner's cousin, E, often invites us over ...",Partner's cousin and her husband frequently i...,"It doesn't seem as though E, her husband [29m]...",0.268293,0.061728,0.134146
2,It was my 21st birthday. My gf went with me to...,"Had drunken sex with my gf, only to find out ...",My gf went with me to a strip club... first st...,0.180328,0.033333,0.098361
