In [7]:
!pip install wordfreq
from wordfreq import word_frequency
import spacy



In [8]:
# Loaded spaCy pipelines keyed by model name. Loading a model is by far the
# most expensive step here, so it is done once per kernel session instead of
# once per call to check_freq.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def check_freq(sentence):
    """Map each lemma of ``sentence`` to its English word frequency, rarest first.

    The text is tokenized with the ``en_core_web_sm`` spaCy pipeline. Tokens
    whose part-of-speech tag is ``PUNCT`` or ``SPACE`` are skipped, and every
    remaining token is represented by its lemma.

    Args:
        sentence: Text to analyse; it is passed directly to the spaCy pipeline.

    Returns:
        dict: lemma -> value returned by ``word_frequency(lemma, 'en')``,
        ordered by ascending frequency. A lemma that occurs several times in
        the text appears only once.
    """
    nlp = _get_spacy_pipeline()
    lemmas = [tok.lemma_ for tok in nlp(sentence) if tok.pos_ not in ("PUNCT", "SPACE")]

    freq_dict = {}
    for lemma in lemmas:
        freq_dict[lemma] = word_frequency(lemma, 'en')

    # sorted() is stable, so lemmas with equal frequency keep first-seen order.
    return dict(sorted(freq_dict.items(), key=lambda item: item[1]))

In [9]:
### Word Frequency
import pandas as pd
def compare_sentence_freq(sentence1, sentence2, threshold=1e-4):
    """Compare two texts by how many low-frequency lemmas each contains.

    A lemma counts as low-frequency when the value reported for it by
    ``check_freq`` is strictly below ``threshold``.

    Args:
        sentence1: First text.
        sentence2: Second text.
        threshold: Frequency cut-off below which a lemma is counted as
            low-frequency. Defaults to ``1e-4``, the value that was
            previously hard-coded.

    Returns:
        int: ``0`` if ``sentence1`` has strictly more low-frequency lemmas than
        ``sentence2``; otherwise ``1`` (ties included).
    """
    def count_low_freq(sentence):
        # check_freq yields one entry per distinct lemma, so this counts
        # distinct low-frequency lemmas, not occurrences.
        return sum(1 for freq in check_freq(sentence).values() if freq < threshold)

    low_freq_count1 = count_low_freq(sentence1)
    low_freq_count2 = count_low_freq(sentence2)
    return 0 if low_freq_count1 > low_freq_count2 else 1

def compare_min_word_freq(sentence1, sentence2):
    """Compare two texts by the frequency of their rarest lemma.

    The per-lemma frequencies come from ``check_freq``. A text that yields no
    lemmas is treated as having a rarest frequency of positive infinity.

    Returns:
        int: ``0`` if the rarest lemma of ``sentence1`` is strictly less
        frequent than the rarest lemma of ``sentence2``; otherwise ``1``.
    """
    def rarest(freqs):
        # An empty mapping has no minimum; infinity never wins a "<" test.
        if freqs:
            return min(freqs.values())
        return float('inf')

    freqs_first = check_freq(sentence1)
    freqs_second = check_freq(sentence2)

    if rarest(freqs_first) < rarest(freqs_second):
        return 0
    return 1

# Compare column 1 of summarization.csv against column 2 of baseline.csv,
# row by row. NOTE(review): presumably these columns hold the model summary
# and the baseline summary respectively - confirm against the CSV headers.
df_sum = pd.read_csv('summarization.csv')
df_baseline = pd.read_csv('baseline.csv')

count_compare_sentence_freq = 0
count_compare_min_word_freq = 0
num_pairs = 0
for (sentence1, sentence2) in zip(df_sum.iloc[:, 1], df_baseline.iloc[:, 2]):
    count_compare_sentence_freq += compare_sentence_freq(sentence1, sentence2)
    count_compare_min_word_freq += compare_min_word_freq(sentence1, sentence2)
    num_pairs += 1

# zip() stops at the shorter column, so the denominator must be the number of
# pairs actually compared, not len(df_sum); the two differ when the CSV files
# have different row counts.
if num_pairs == 0:
    raise ValueError("No sentence pairs to compare: one of the input columns is empty.")

rate_compare_sentence_freq = count_compare_sentence_freq / num_pairs
rate_compare_min_word_freq = count_compare_min_word_freq / num_pairs
print("Sucess rate for compare_sentence_freq:", rate_compare_sentence_freq)
print("Sucess rate for compare_min_word_freq:", rate_compare_min_word_freq)

Sucess rate for compare_sentence_freq: 0.9393939393939394
Sucess rate for compare_min_word_freq: 0.8585858585858586


In [11]:
def average_sentence_length(text):
    """Return the mean number of words per sentence in ``text``.

    Sentences are the pieces obtained by splitting on ``'.'``; pieces that are
    empty or whitespace-only are ignored. Words are whitespace-separated
    tokens.

    Returns:
        The average word count per sentence, or ``0`` when ``text`` contains
        no non-blank sentence.
    """
    lengths = []
    for fragment in text.split('.'):
        if fragment.strip():
            lengths.append(len(fragment.split()))

    if not lengths:
        return 0
    return sum(lengths) / len(lengths)

In [12]:
### Sentence Length
def compare_average_sentence_length(sentence1, sentence2):
    """Compare two texts by their average sentence length.

    Returns:
        int: ``0`` if ``average_sentence_length(sentence1)`` is strictly
        greater than ``average_sentence_length(sentence2)``; otherwise ``1``
        (ties included).
    """
    first_is_longer = average_sentence_length(sentence1) > average_sentence_length(sentence2)
    if first_is_longer:
        return 0
    return 1
# Reload both result files, then score column 1 of summarization.csv against
# column 2 of baseline.csv row by row.
df_sum = pd.read_csv('summarization.csv')
df_baseline = pd.read_csv('baseline.csv')
summary_column = df_sum.iloc[:, 1]
baseline_column = df_baseline.iloc[:, 2]
# map() with two iterables walks them in lockstep and stops at the shorter one.
results = list(map(compare_average_sentence_length, summary_column, baseline_column))
# Each result is 0 or 1, so the mean is the fraction of rows scored 1.
rate_of_1 = sum(results) / len(results)
print("Sucess rate:", rate_of_1)

Sucess rate: 0.7878787878787878


In [13]:
import pandas as pd

def total_sentence_length(text):
    """Return the total number of words in ``text``.

    The text is split on ``'.'`` and the whitespace-separated words of every
    non-blank piece are counted and summed.
    """
    total_words = 0
    for fragment in text.split('.'):
        if fragment.strip():
            total_words += len(fragment.split())
    return total_words

def compare_total_sentence_length(text1, text2):
    """Compare two texts by their total word count.

    Returns:
        int: ``0`` if ``total_sentence_length(text1)`` is strictly greater
        than ``total_sentence_length(text2)``; otherwise ``1`` (ties included).
    """
    first_is_longer = total_sentence_length(text1) > total_sentence_length(text2)
    if first_is_longer:
        return 0
    return 1

# Reload both result files, then score column 1 of summarization.csv against
# column 2 of baseline.csv row by row.
df_sum = pd.read_csv('summarization.csv')
df_baseline = pd.read_csv('baseline.csv')
summary_column = df_sum.iloc[:, 1]
baseline_column = df_baseline.iloc[:, 2]

# A row scores 1 when the first text is not longer than the second (ties
# included); the mean of the 0/1 scores is the reported rate.
results = list(map(compare_total_sentence_length, summary_column, baseline_column))
rate_of_1 = sum(results) / len(results)
print("Success rate:", rate_of_1)


Success rate: 1.0


In [14]:
### Readability
!pip install textstat
import textstat
import pandas as pd

def compare_readability(sentence1, sentence2):
    """Compare two texts by their Flesch-Kincaid grade as computed by textstat.

    Returns:
        int: ``0`` if the grade of ``sentence1`` is strictly greater than the
        grade of ``sentence2``; otherwise ``1`` (ties included).
    """
    grade_first = textstat.flesch_kincaid_grade(sentence1)
    grade_second = textstat.flesch_kincaid_grade(sentence2)
    if grade_first > grade_second:
        return 0
    return 1

# Reload both result files, then score column 1 of summarization.csv against
# column 2 of baseline.csv row by row.
df_sum = pd.read_csv('summarization.csv')
df_baseline = pd.read_csv('baseline.csv')
summary_column = df_sum.iloc[:, 1]
baseline_column = df_baseline.iloc[:, 2]
# map() with two iterables walks them in lockstep and stops at the shorter one.
results = list(map(compare_readability, summary_column, baseline_column))
# Each result is 0 or 1, so the mean is the fraction of rows scored 1.
rate_of_1 = sum(results) / len(results)
print("Sucess rate:", rate_of_1)

Sucess rate: 0.6161616161616161


In [None]:
### Manual Evaluation Methods
## 1. Key points: Both did well
## 2. Non-Redundancy: Our model reaches a success rate of 1.0
## 3. Fidelity to Original Intent: Both did well
## 4. Scores for the two summarization versions: Our model reaches a success rate of 0.92
### In conclusion, our model performs better in summarization.