In [1]:
#!pip install textstat

## Import necessary packages and modules

In [1]:
import pandas as pd
from os import listdir
import textstat
from sklearn.model_selection import train_test_split

## Calculate readability metrics
We calculate the following:
1. Flesch Kincaid Grade
2. Flesch Reading Ease
3. Gunning Fox
4. SMOG Index 

### Experiments results

In [6]:
files = listdir(".")
files = [f for f in files if f.endswith(".csv")]

In [8]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
col = "New"
print(col)
for f in files:
    df = pd.read_csv(f, index_col = 0)
    df.reset_index(inplace=True, drop=True)
    df["Flesh_Kincaid_Grade"] = 0
    df["Flesh_Reading_Ease"] = 0
    df["Gunning_Fog"] = 0
    df["Smog_Index"] = 0
    for i in range(df.shape[0]):
        df["Flesh_Kincaid_Grade"][i] = textstat.flesch_kincaid_grade(df[col][i])
        df["Flesh_Reading_Ease"][i] = textstat.flesch_reading_ease(df[col][i])
        df["Gunning_Fog"][i] = textstat.gunning_fog(df[col][i])
        df["Smog_Index"][i] = textstat.smog_index(df[col][i])
#     fp = "metrics/pol_readmet_"+f
#     df.to_csv(fp)
    print(f"{f}")
    print(f"Average Flesh Kincaid Grade: {df['Flesh_Kincaid_Grade'].mean()}")
    print(f"Average Flesh Reading Ease: {df['Flesh_Reading_Ease'].mean()}")
    print(f"Average Gunning Fog: {df['Gunning_Fog'].mean()}")
    print(f"Average Smog Index: {df['Smog_Index'].mean()}")
    print("")
    
        

New
bert_par.csv
Average Flesh Kincaid Grade: 13.86463878326996
Average Flesh Reading Ease: 30.51912547528517
Average Gunning Fog: 16.410950570342205
Average Smog Index: 0.25741444866920155

bert_sent.csv
Average Flesh Kincaid Grade: 13.86463878326996
Average Flesh Reading Ease: 30.51912547528517
Average Gunning Fog: 16.410950570342205
Average Smog Index: 0.25741444866920155

dist_sent.csv
Average Flesh Kincaid Grade: 12.89134199134199
Average Flesh Reading Ease: 37.77632034632035
Average Gunning Fog: 15.403939393939394
Average Smog Index: 0.17445887445887445



### Original data

#### Paragraph data

In [43]:
df=pd.read_csv("../../dist_par_all/nature_paragraph_data.csv")
X_all = df['TEXT'].values
y_all = df['CATEGORY'].values

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42, stratify=y_all)

##### Complex data

In [44]:
scient_test = X_test[y_test == 0]
results_list = []
for i in scient_test:
    try:
        results_list.append([i, 
                             textstat.flesch_reading_ease(i), 
                             textstat.flesch_kincaid_grade(i),
                             textstat.gunning_fog(i),
                             textstat.smog_index(i)])
    except:
        continue;
 
df = pd.DataFrame(results_list, 
                  columns=['Original', 'Flesh_Kincaid_Grade', 'Flesh_Reading_Ease', 'Gunning_Fog', 'Smog_Index'])
df.to_csv("scient_par_readability.csv")

print(f"Average Flesh Kincaid Grade: {df['Flesh_Kincaid_Grade'].mean()}")
print(f"Average Flesh Reading Ease: {df['Flesh_Reading_Ease'].mean()}")
print(f"Average Gunning Fog: {df['Gunning_Fog'].mean()}")
print(f"Average Smog Index: {df['Smog_Index'].mean()}")
print("")

Average Flesh Kincaid Grade: 31.462577777777778
Average Flesh Reading Ease: 13.887111111111112
Average Gunning Fog: 14.949288888888889
Average Smog Index: 15.375111111111114



##### Plain data

In [45]:
plain_test = X_test[y_test == 1]
results_list = []
for i in plain_test:
    try:
        results_list.append([i, 
                             textstat.flesch_reading_ease(i), 
                             textstat.flesch_kincaid_grade(i),
                             textstat.gunning_fog(i),
                             textstat.smog_index(i)])
    except:
        continue;
 
df = pd.DataFrame(results_list, 
                  columns=['Original', 'Flesh_Kincaid_Grade', 'Flesh_Reading_Ease', 'Gunning_Fog', 'Smog_Index'])
df.to_csv("plain_par_readability.csv")

print(f"Average Flesh Kincaid Grade: {df['Flesh_Kincaid_Grade'].mean()}")
print(f"Average Flesh Reading Ease: {df['Flesh_Reading_Ease'].mean()}")
print(f"Average Gunning Fog: {df['Gunning_Fog'].mean()}")
print(f"Average Smog Index: {df['Smog_Index'].mean()}")
print("")

Average Flesh Kincaid Grade: 48.23083333333333
Average Flesh Reading Ease: 11.464492753623187
Average Gunning Fog: 12.282789855072464
Average Smog Index: 13.251811594202898



#### Sentence data

In [46]:
df=pd.read_csv("../../bert_sent_all/nature_sentences_data.csv")
X_all = df['TEXT'].values
y_all = df['CATEGORY'].values

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42, stratify=y_all)

##### Complex data

In [47]:
scient_test = X_test[y_test == 0]
results_list = []
for i in scient_test:
    try:
        results_list.append([i, 
                             textstat.flesch_reading_ease(i), 
                             textstat.flesch_kincaid_grade(i),
                             textstat.gunning_fog(i),
                             textstat.smog_index(i)])
    except:
        continue;
 
df = pd.DataFrame(results_list, 
                  columns=['Original', 'Flesh_Kincaid_Grade', 'Flesh_Reading_Ease', 'Gunning_Fog', 'Smog_Index'])
df.to_csv("scient_sent_readability.csv")

print(f"Average Flesh Kincaid Grade: {df['Flesh_Kincaid_Grade'].mean()}")
print(f"Average Flesh Reading Ease: {df['Flesh_Reading_Ease'].mean()}")
print(f"Average Gunning Fog: {df['Gunning_Fog'].mean()}")
print(f"Average Smog Index: {df['Smog_Index'].mean()}")
print("")

Average Flesh Kincaid Grade: 30.68188869412796
Average Flesh Reading Ease: 14.108545135845748
Average Gunning Fog: 16.999176161262053
Average Smog Index: 0.4113935144609991



##### Plain data

In [48]:
plain_test = X_test[y_test == 1]
results_list = []
for i in plain_test:
    try:
        results_list.append([i, 
                             textstat.flesch_reading_ease(i), 
                             textstat.flesch_kincaid_grade(i),
                             textstat.gunning_fog(i),
                             textstat.smog_index(i)])
    except:
        continue;
 
df = pd.DataFrame(results_list, 
                  columns=['Original', 'Flesh_Kincaid_Grade', 'Flesh_Reading_Ease', 'Gunning_Fog', 'Smog_Index'])
df.to_csv("plain_sent_readability.csv")

print(f"Average Flesh Kincaid Grade: {df['Flesh_Kincaid_Grade'].mean()}")
print(f"Average Flesh Reading Ease: {df['Flesh_Reading_Ease'].mean()}")
print(f"Average Gunning Fog: {df['Gunning_Fog'].mean()}")
print(f"Average Smog Index: {df['Smog_Index'].mean()}")
print("")

Average Flesh Kincaid Grade: 47.02201425356339
Average Flesh Reading Ease: 11.74482370592648
Average Gunning Fog: 14.076211552888223
Average Smog Index: 0.40948987246811697

