In [16]:
!pip install wordfreq
from wordfreq import word_frequency
import spacy



In [2]:
# Loaded spaCy pipelines keyed by model name, so the model is read from disk only once.
_NLP_CACHE = {}


def _get_nlp(model_name="en_core_web_sm"):
  """Return the spaCy pipeline for `model_name`, loading it on first use only."""
  if model_name not in _NLP_CACHE:
    _NLP_CACHE[model_name] = spacy.load(model_name)
  return _NLP_CACHE[model_name]


def check_freq(sentence):
  """Map each distinct lemma of `sentence` to its English word frequency.

  The sentence is run through the spaCy "en_core_web_sm" pipeline; tokens
  tagged PUNCT or SPACE are dropped and the remaining tokens are reduced to
  their lemmas. Each lemma is then looked up with
  `word_frequency(lemma, 'en')`.

  Args:
    sentence: text to analyse.

  Returns:
    dict of lemma -> frequency, ordered from the lowest frequency to the
    highest. Repeated lemmas appear once.
  """
  # Reuse the cached pipeline: the comparison helpers call this function
  # several times per sentence pair, and reloading the model each time is slow.
  nlp = _get_nlp()
  lemmas = [tok.lemma_ for tok in nlp(sentence) if tok.pos_ not in ("PUNCT", "SPACE")]
  freq_dict = {lemma: word_frequency(lemma, 'en') for lemma in lemmas}
  # Ascending sort puts the rarest lemmas first.
  return dict(sorted(freq_dict.items(), key=lambda item: item[1]))

In [3]:
### Words Frequency
import pandas as pd
def compare_sentence_freq(sentence1, sentence2, threshold=1e-4):
    """Compare two sentences by how many low-frequency lemmas each contains.

    A lemma counts as low-frequency when the value reported by `check_freq`
    is strictly below `threshold`.

    Args:
        sentence1: first sentence (the driver code passes the simplified text).
        sentence2: second sentence (the driver code passes the baseline text).
        threshold: frequency cut-off below which a lemma is counted as rare.
            Defaults to 1e-4, the value previously hard-coded here.

    Returns:
        0 if `sentence1` has strictly more low-frequency lemmas than
        `sentence2`, otherwise 1 (ties return 1).
    """
    low_freq_count1 = sum(1 for freq in check_freq(sentence1).values() if freq < threshold)
    low_freq_count2 = sum(1 for freq in check_freq(sentence2).values() if freq < threshold)
    return 0 if low_freq_count1 > low_freq_count2 else 1

def compare_min_word_freq(sentence1, sentence2):
    """Compare two sentences by the frequency of their rarest lemma.

    For each sentence the smallest frequency reported by `check_freq` is
    taken; a sentence that yields no lemmas is treated as having an
    infinite minimum.

    Returns:
        0 if the rarest lemma of `sentence1` is strictly rarer than that of
        `sentence2`, otherwise 1 (ties return 1).
    """
    rarest = []
    for sentence in (sentence1, sentence2):
        frequencies = check_freq(sentence)
        if frequencies:
            rarest.append(min(frequencies.values()))
        else:
            # No lemmas at all: nothing can be rarer than the other sentence.
            rarest.append(float('inf'))

    first_min, second_min = rarest
    if first_min < second_min:
        return 0
    return 1

df_simplification = pd.read_csv('simplification.csv')
df_baseline = pd.read_csv('baseline.csv')

# Pair the rows positionally using the second column (index 1) of each file.
# zip() stops at the shorter frame, so materialise the pairs to know exactly
# how many comparisons are made.
sentence_pairs = list(zip(df_simplification.iloc[:, 1], df_baseline.iloc[:, 1]))

count_compare_sentence_freq = 0
count_compare_min_word_freq = 0
for (sentence1, sentence2) in sentence_pairs:
    count_compare_sentence_freq += compare_sentence_freq(sentence1, sentence2)
    count_compare_min_word_freq += compare_min_word_freq(sentence1, sentence2)

# Divide by the number of pairs actually compared rather than
# len(df_simplification): the two differ whenever the CSVs have different
# lengths, which would silently skew the rate.
n_pairs = len(sentence_pairs)
rate_compare_sentence_freq = count_compare_sentence_freq / n_pairs
rate_compare_min_word_freq = count_compare_min_word_freq / n_pairs
print("Sucess rate for compare_sentence_freq:", rate_compare_sentence_freq)
print("Sucess rate for compare_min_word_freq:", rate_compare_min_word_freq)

Sucess rate for compare_sentence_freq: 0.8686868686868687
Sucess rate for compare_min_word_freq: 0.8686868686868687


In [5]:
def average_sentence_length(text):
    """Return the mean number of whitespace-separated words per sentence.

    Sentences are delimited by '.', '!' or '?'. Segments that are empty or
    contain only whitespace (for example between the dots of an ellipsis, or
    after the final terminator) are ignored.

    Args:
        text: string to measure.

    Returns:
        Average word count per non-empty sentence, or 0 when `text`
        contains no words.
    """
    import re  # local import so this cell does not depend on another cell's imports

    # Splitting on '.' alone would treat "Is it? Yes!" as a single sentence.
    segments = re.split(r'[.!?]', text)
    word_counts = [len(segment.split()) for segment in segments if segment.strip()]
    if not word_counts:
        return 0
    return sum(word_counts) / len(word_counts)

In [7]:
### Sentence Length
def compare_average_sentence_length(sentence1, sentence2):
    """Compare two texts by their average sentence length.

    Returns:
        0 if `sentence1` has a strictly larger average sentence length than
        `sentence2` (as measured by `average_sentence_length`), otherwise 1.
    """
    first_average = average_sentence_length(sentence1)
    second_average = average_sentence_length(sentence2)
    if first_average > second_average:
        return 0
    return 1

# Reload both files and score every row pair on average sentence length.
df_simplification = pd.read_csv('simplification.csv')
df_baseline = pd.read_csv('baseline.csv')

# map() over two iterables walks them in lockstep and stops at the shorter one.
results = list(map(
    compare_average_sentence_length,
    df_simplification.iloc[:, 1],
    df_baseline.iloc[:, 1],
))

# Each result is 0 or 1, so the mean is the share of pairs scored 1.
rate_of_1 = sum(results) / len(results)
print("Sucess rate:", rate_of_1)

Sucess rate: 0.9696969696969697


In [15]:
### Readability
!pip install textstat
import textstat
import pandas as pd

def compare_readability(sentence1, sentence2):
    """Compare two texts by their Flesch-Kincaid grade.

    Both texts are scored with `textstat.flesch_kincaid_grade`.

    Returns:
        0 if `sentence1` scores strictly higher than `sentence2`,
        otherwise 1 (ties return 1).
    """
    first_grade = textstat.flesch_kincaid_grade(sentence1)
    second_grade = textstat.flesch_kincaid_grade(sentence2)
    if first_grade > second_grade:
        return 0
    return 1

# Reload both files and score every row pair on readability grade.
df_simplification = pd.read_csv('simplification.csv')
df_baseline = pd.read_csv('baseline.csv')

# map() over two iterables walks them in lockstep and stops at the shorter one.
results = list(map(
    compare_readability,
    df_simplification.iloc[:, 1],
    df_baseline.iloc[:, 1],
))

# Each result is 0 or 1, so the mean is the share of pairs scored 1.
rate_of_1 = sum(results) / len(results)
print("Sucess rate:", rate_of_1)

Sucess rate: 1.0


In [33]:
### Lexical Diversity
from lexicalrichness import LexicalRichness
import pandas as pd

def calculate_lexical_diversity_measures(sentence, max_draws=50):
    """Compute lexical-diversity scores for `sentence`.

    Args:
        sentence: text to analyse with `LexicalRichness`.
        max_draws: upper bound on the `draws` argument passed to HDD.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        dict with keys 'ttr' (type-token ratio) and 'hdd' (HDD computed with
        `draws = min(word count, max_draws)`). Both are 0 when the text
        contains no words.
    """
    lex = LexicalRichness(sentence)

    # Guard both measures, not just HDD: a type-token ratio is undefined for
    # zero tokens, so an empty text would otherwise fail on 'ttr' before the
    # HDD guard was ever reached.
    if lex.words == 0:
        return {'ttr': 0, 'hdd': 0}

    # Cap the draw count at the word count so short texts can still be scored.
    hdd_draws = min(lex.words, max_draws)
    return {
        'ttr': lex.ttr,  # Type-Token Ratio
        'hdd': lex.hdd(draws=hdd_draws),  # HDD with draws
    }

def compare_lexical_diversity(sentence1, sentence2, measure):
    """Compare two texts on one lexical-diversity measure.

    Args:
        sentence1: first text.
        sentence2: second text.
        measure: key into the dict returned by
            `calculate_lexical_diversity_measures` (the driver code uses
            'ttr' and 'hdd').

    Returns:
        0 if `sentence1` scores strictly higher than `sentence2` on
        `measure`, otherwise 1 (ties return 1).
    """
    first_scores = calculate_lexical_diversity_measures(sentence1)
    second_scores = calculate_lexical_diversity_measures(sentence2)
    if first_scores[measure] > second_scores[measure]:
        return 0
    return 1

# Reload both files and report, per measure, the share of row pairs scored 1.
df_simplification = pd.read_csv('simplification.csv')
df_baseline = pd.read_csv('baseline.csv')
measures = ['ttr', 'hdd']
for measure in measures:
    results = []
    # Rows are paired positionally using the second column of each file.
    for sentence1, sentence2 in zip(df_simplification.iloc[:, 1], df_baseline.iloc[:, 1]):
        results.append(compare_lexical_diversity(sentence1, sentence2, measure))
    rate_of_1 = sum(results) / len(results)
    print(f"Sucess rate for {measure.upper()}: {rate_of_1}")

Sucess rate for TTR: 0.8181818181818182
Sucess rate for HDD: 0.797979797979798
