# Load dataset

In [1]:
import pandas as pd
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# NOTE(review): 'results.csv' is a relative path, so it is resolved against the
# current working directory rather than the mounted drive — confirm this is intended.
text_df = pd.read_csv('results.csv')

Mounted at /content/drive


In [2]:
# Preview the first five rows of the loaded frame.
text_df.head(n=5)

Unnamed: 0,text,label,Roberta_label,confidence,avg_zipf_score,commonality_score,adjusted_score
0,tw rofr postings legale - in toby ' s absence ...,0,0,3.9e-05,3.232222,0.236282,0.763718
1,Pls accept me for one day. Or am begging you c...,0,0,5.5e-05,5.178,0.161865,0.838135
2,re : [ 618 ] quality drugs at very reasonable ...,1,1,0.999935,4.26,0.190114,0.809886
3,"Once upon a time, Doug wrote :> Maybe I'm just...",0,0,3.2e-05,3.587143,0.218001,0.781999
4,"re : 6 . 1049 , sum : e - mail citation on occ...",0,0,5.4e-05,3.564,0.219106,0.780894


# Complexity

In [3]:
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used in this section: the 'punkt' tokenizer data
# and the CMU pronouncing dictionary ('cmudict').
nltk.download('punkt')
# Left as the cell's last expression, so its return value is what the cell displays.
nltk.download('cmudict')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


True

In [6]:
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import cmudict

# Ensure the NLTK resources needed by this cell are downloaded
import nltk
# 'punkt_tab' is fetched here in addition to the 'punkt' download in the earlier cell.
# NOTE(review): presumably word_tokenize needs 'punkt_tab' on newer NLTK releases — confirm.
nltk.download('punkt_tab')
# NOTE(review): 'stopwords' is not referenced by any cell visible here — confirm it is still needed.
nltk.download('stopwords')

# CMU Pronouncing Dictionary for syllable counting.
# nsyl() indexes `d` by lower-cased word and treats each entry as a sequence of
# pronunciations, each pronunciation being a sequence of phoneme strings.
d = cmudict.dict()

def nsyl(word, pron_dict=None):
    """Return the number of syllables in ``word`` according to the CMU dict.

    The count is the number of phonemes in the word's *first* listed
    pronunciation whose last character is a digit.

    Args:
        word: The token to look up; matching is case-insensitive.
        pron_dict: Optional mapping of lower-cased word -> list of
            pronunciations (each a sequence of phoneme strings). Defaults to
            the module-level CMU dictionary ``d``.

    Returns:
        int: The syllable count, or 1 when the word is not in the dictionary
        (or its entry has no pronunciations).
    """
    if pron_dict is None:
        pron_dict = d
    # Lower-case once and look up once instead of `in` followed by indexing.
    pronunciations = pron_dict.get(word.lower())
    if not pronunciations:
        # If the word isn't found in the CMU dict, assume 1 syllable
        return 1
    # Only the first pronunciation is used, so only that one is counted.
    return sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())

def analyze_text(text):
    """Compute simple word-level complexity statistics for one text.

    The text is split with ``word_tokenize`` and every resulting token is
    counted as a word.
    NOTE(review): this presumably includes punctuation tokens — confirm whether
    they should count towards the word total and the averages.

    Args:
        text: The text to analyse. Any non-string value (NaN, None, numbers,
            lists, ...) is treated as missing.

    Returns:
        tuple: ``(total_words, average_syllables_per_word,
        average_characters_per_word)``; ``(0, 0, 0)`` for non-string input or
        when tokenisation yields no tokens.
    """
    # A plain type check covers NaN/None (neither is a str) and, unlike
    # `pd.isna(text) or ...`, does not raise on list/array values, where
    # pd.isna returns an array with no single truth value.
    if not isinstance(text, str):
        return 0, 0, 0  # Return zeros for rows with non-string data
    words = word_tokenize(text)
    total_words = len(words)
    if not total_words:
        # No tokens: avoid dividing by zero.
        return 0, 0, 0
    total_syllables = sum(nsyl(word) for word in words)
    total_characters = sum(len(word) for word in words)
    return total_words, total_syllables / total_words, total_characters / total_words

# Compute the complexity features for every value in text_df['text'].
# All result tuples are collected first and turned into one DataFrame, instead
# of constructing a pd.Series per row. dtype=float keeps all three columns
# float, matching what the per-row Series version produced.
complexity_columns = ['total_words', 'average_syllables_per_word', 'average_characters_per_word']
text_df[complexity_columns] = pd.DataFrame(
    [analyze_text(value) for value in text_df['text']],
    index=text_df.index,
    columns=complexity_columns,
    dtype=float,
)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Readability

In [9]:
!pip install textstat
!pip install readability
import pandas as pd
import textstat
import readability



In [10]:
def add_textstat_readability_scores(row):
    """Add textstat readability scores to one row, in place.

    Args:
        row: A mapping-like row (e.g. a DataFrame row passed by
            ``apply(axis=1)``) with a ``'text'`` entry.

    Returns:
        The same ``row`` object, with ``'ARI'`` and ``'Flesch-Kincaid Grade'``
        set to the textstat scores of the text, or to ``None`` when the text
        is not a string (NaN, None, numbers, lists, ...).
    """
    text = row['text']
    # A plain type check covers NaN/None (neither is a str) and, unlike
    # `pd.isna(text) or ...`, does not raise on list/array values.
    if isinstance(text, str):
        row['ARI'] = textstat.automated_readability_index(text)
        row['Flesch-Kincaid Grade'] = textstat.flesch_kincaid_grade(text)
    else:
        row['ARI'] = None
        row['Flesch-Kincaid Grade'] = None
    return row

# Run the readability scorer on each row and rebind text_df to the result.
text_df = text_df.apply(
    add_textstat_readability_scores,
    axis="columns",
)

In [11]:
# Replace every 0 in 'total_words', 'average_syllables_per_word' and
# 'average_characters_per_word' with NaN (analyze_text returns 0 for missing
# or empty input).
import numpy as np
for column in ('total_words', 'average_syllables_per_word', 'average_characters_per_word'):
    text_df[column] = text_df[column].replace(0, np.nan)

In [12]:
# Preview the first five rows, now including the complexity and readability columns.
text_df.head(n=5)

Unnamed: 0,text,label,Roberta_label,confidence,avg_zipf_score,commonality_score,adjusted_score,total_words,average_syllables_per_word,average_characters_per_word,ARI,Flesch-Kincaid Grade
0,tw rofr postings legale - in toby ' s absence ...,0,0,3.9e-05,3.232222,0.236282,0.763718,56.0,1.375,3.732143,11.5,9.9
1,Pls accept me for one day. Or am begging you c...,0,0,5.5e-05,5.178,0.161865,0.838135,15.0,1.2,3.4,0.3,1.1
2,re : [ 618 ] quality drugs at very reasonable ...,1,1,0.999935,4.26,0.190114,0.809886,24.0,1.458333,3.166667,1.4,3.3
3,"Once upon a time, Doug wrote :> Maybe I'm just...",0,0,3.2e-05,3.587143,0.218001,0.781999,129.0,1.27907,4.581395,23.5,13.4
4,"re : 6 . 1049 , sum : e - mail citation on occ...",0,0,5.4e-05,3.564,0.219106,0.780894,367.0,1.351499,3.779292,13.7,12.0


In [18]:
# Summary statistics of avg_zipf_score within each label group.
(
    text_df
    .groupby("label")["avg_zipf_score"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,57.0,3.350595,1.061204,0.0,2.8932,3.467273,3.714348,6.075
1,43.0,3.279134,1.102743,0.0,2.565833,3.459167,4.01,4.843529


In [19]:
# Summary statistics of commonality_score within each label group.
(
    text_df
    .groupby("label")["commonality_score"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,57.0,0.236537,0.079909,0.0,0.205105,0.222618,0.2525,0.649351
1,43.0,0.239523,0.082798,0.0,0.198654,0.217061,0.266402,0.534474


In [20]:
# Summary statistics of adjusted_score within each label group.
(
    text_df
    .groupby("label")["adjusted_score"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,57.0,0.763463,0.079909,0.350649,0.7475,0.777382,0.794895,1.0
1,43.0,0.760477,0.082798,0.465526,0.733598,0.782939,0.801346,1.0


In [25]:
# Summary statistics of total_words within each label group.
(
    text_df
    .groupby("label")["total_words"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,57.0,253.859649,250.570452,6.0,56.0,176.0,393.0,1167.0
1,43.0,279.604651,715.635786,12.0,51.5,104.0,245.5,4745.0


In [21]:
# Summary statistics of average_syllables_per_word within each label group.
(
    text_df
    .groupby("label")["average_syllables_per_word"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,57.0,1.326018,0.139615,1.0,1.227273,1.31348,1.406828,1.762376
1,43.0,1.384855,0.16962,1.060606,1.279708,1.367816,1.458333,1.96


In [22]:
# Summary statistics of average_characters_per_word within each label group.
(
    text_df
    .groupby("label")["average_characters_per_word"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,57.0,4.012337,0.770026,2.898785,3.507937,3.967742,4.333333,6.8
1,43.0,4.152447,0.929061,2.892473,3.422375,3.972222,4.570212,7.58


In [23]:
# Summary statistics of the ARI column within each label group.
(
    text_df
    .groupby("label")["ARI"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,57.0,12.478947,6.341297,-0.9,9.5,12.6,15.5,28.3
1,43.0,11.604651,7.404148,1.4,6.15,10.6,13.85,31.7


In [24]:
# Summary statistics of the Flesch-Kincaid Grade column within each label group.
(
    text_df
    .groupby("label")["Flesch-Kincaid Grade"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,57.0,8.680702,3.904596,0.5,6.4,8.5,10.6,19.8
1,43.0,8.432558,5.341692,0.5,4.95,7.6,9.9,24.0


In [26]:
# Write the enriched frame to CSV, leaving out the index column.
text_df.to_csv(
    path_or_buf='results_with_readability.csv',
    index=False,
)