In [None]:
# !pip install spacy
# !pip install spacy_syllables
# !python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_sm
# !pip install pandarallel

In [None]:
# import pandas as pd
# import numpy as np
# import re
#Importing everything from NLP Week 1 - following that as a guide for now
import gzip
import json
import matplotlib.pyplot as plt
import numpy as np
import re
import random
import pandas as pd
import seaborn as sns
from collections import Counter, defaultdict
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Notebook-wide pandas display settings and the shared random seed.
pd.set_option("display.max_rows", 100)
pd.set_option("display.width", 150)
RANDOM_SEED = 696


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import re
import spacy
from spacy_syllables import SpacySyllables
from pandarallel import pandarallel

In [None]:
# Look up the pandas display width configured in the import cell. The value is
# not assigned to anything, and this is not the cell's last expression.
pd.get_option("display.width")

# Syllable counting comes from the spacy_syllables extension:
# https://spacy.io/universe/project/spacy_syllables
# Load the large English spaCy pipeline and insert the "syllables" component
# directly after the tagger; the syllable cell further down reads
# `token._.syllables_count` from the documents this pipeline produces.
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("syllables", after="tagger")

# Initialise pandarallel (with a progress bar) before any `parallel_apply`
# call in the cells that follow.
pandarallel.initialize(progress_bar=True)

In [None]:
# Raw training data; later cells read its 'original_text' column.
WikiLarge_Train_df = pd.read_csv(filepath_or_buffer=r'assets/WikiLarge_Train.csv')

In [None]:
# A "word" is any maximal run of word characters.
# (An earlier variant split on whitespace between word boundaries: r'\b\s+\b'.)
split_pat = re.compile(r'\w+')

# Tokenise every text once, then derive the simple per-row counts from the token lists.
WikiLarge_Train_df['og_split'] = WikiLarge_Train_df['original_text'].parallel_apply(
    lambda text: split_pat.findall(text)
)
WikiLarge_Train_df['total_words'] = WikiLarge_Train_df['og_split'].parallel_apply(
    lambda tokens: len(tokens)
)
# Long words: tokens with more than 7 characters.
WikiLarge_Train_df['long_words'] = WikiLarge_Train_df['og_split'].parallel_apply(
    lambda tokens: sum(1 for word in tokens if len(word) > 7)
)
# Every row is treated as exactly one sentence.
WikiLarge_Train_df['total_sentences'] = 1
WikiLarge_Train_df['total_characters'] = WikiLarge_Train_df['og_split'].parallel_apply(
    lambda tokens: sum(len(word) for word in tokens)
)

In [None]:
# Slow cell: every text is run through the full spaCy pipeline.
# Collects the per-token syllable count reported by the "syllables" component.
WikiLarge_Train_df['syl_list'] = WikiLarge_Train_df['original_text'].parallel_apply(
    lambda text: [tok._.syllables_count for tok in nlp(text)]
)

In [None]:
# Discard tokens that carry no syllable count (None), then derive the per-row
# syllable total and the number of tokens with more than two syllables.
WikiLarge_Train_df['syl_list'] = WikiLarge_Train_df['syl_list'].parallel_apply(
    lambda counts: [count for count in counts if count is not None]
)
WikiLarge_Train_df['total_syllables'] = WikiLarge_Train_df['syl_list'].parallel_apply(
    lambda counts: sum(counts)
)
WikiLarge_Train_df['total_polysyllables'] = WikiLarge_Train_df['syl_list'].parallel_apply(
    lambda counts: len([count for count in counts if count > 2])
)


In [None]:
# Number of distinct tokens per row (case-sensitive: the tokens are compared as extracted).
WikiLarge_Train_df['total_unique_words'] = WikiLarge_Train_df['og_split'].parallel_apply(
    lambda tokens: len({*tokens})
)

In [None]:
# GFI (Gunning fog index) estimates the years of formal education a person needs to understand the text on first reading.
#   GFI = 0.4 * (words / sentences + 100 * complex_words / words)
# Each row is treated as one sentence (total_sentences is 1), so words / sentences is total_words.
# 'long_words' (tokens longer than 7 characters) stands in for the complex-word count and must be
# expressed as a share of all words, hence the division by total_words.
WikiLarge_Train_df['gfi'] = 0.4 * (
    WikiLarge_Train_df['total_words']
    + 100 * WikiLarge_Train_df['long_words'] / WikiLarge_Train_df['total_words']
)
# WikiLarge_Train_df['gfi']

In [None]:
# FRE (Flesch reading ease): higher scores mean more readable text.
# Rows count as one sentence each, so the words-per-sentence term is total_words itself.
WikiLarge_Train_df['fre'] = (
    WikiLarge_Train_df['total_words'].mul(1.015).rsub(206.835)
    .sub(WikiLarge_Train_df['total_syllables'].div(WikiLarge_Train_df['total_words']).mul(84.6))
)
# WikiLarge_Train_df['fre']

In [None]:
# FKGL (Flesch-Kincaid grade level): years of education generally required to understand the text.
# Rows count as one sentence each, so the words-per-sentence term is total_words itself.
WikiLarge_Train_df['fkgl'] = (
    WikiLarge_Train_df['total_words'].mul(0.39)
    .add(WikiLarge_Train_df['total_syllables'].div(WikiLarge_Train_df['total_words']).mul(11.8))
    .sub(15.59)
)
# WikiLarge_Train_df['fkgl']

In [None]:
# ARI (automated readability index): years of education required to understand the text.
# Characters-per-word is computed from the token lengths; words-per-sentence is total_words (one sentence per row).
WikiLarge_Train_df['ari'] = (
    WikiLarge_Train_df['total_characters'].div(WikiLarge_Train_df['total_words']).mul(4.71)
    .add(WikiLarge_Train_df['total_words'].mul(0.5))
    .sub(21.43)
)
# WikiLarge_Train_df['ari']

In [None]:
# SMOG (Simple Measure of Gobbledygook): roughly the years of education needed to understand the text.
# The polysyllable count is scaled by 30 before the square root (one sentence per row).
WikiLarge_Train_df['smog'] = (
    np.sqrt(WikiLarge_Train_df['total_polysyllables'].mul(30)).mul(1.0430).add(3.1291)
)
# WikiLarge_Train_df['smog']

In [None]:
# Lexical-diversity ratios built from the distinct-token and total-token counts.

# TTR (type-token ratio): unique words / words
WikiLarge_Train_df['ttr'] = WikiLarge_Train_df['total_unique_words'].div(
    WikiLarge_Train_df['total_words']
)

# RTTR (root type-token ratio): unique words / sqrt(words)
WikiLarge_Train_df['rttr'] = WikiLarge_Train_df['total_unique_words'].div(
    np.sqrt(WikiLarge_Train_df['total_words'])
)

# CTTR (corrected type-token ratio): unique words / sqrt(2 * words)
WikiLarge_Train_df['cttr'] = WikiLarge_Train_df['total_unique_words'].div(
    np.sqrt(WikiLarge_Train_df['total_words'].mul(2))
)

In [None]:
# MSTTR is the average TTR for each non-overlapping segment of equal size
# Assuming a 300 wpm average reading rate, we can assume a 5 wps = 300 wpm / 60 s reading rate. 
# Therefore, using a segment size of 5 is an appropriate window.
# http://crr.ugent.be/papers/Brysbaert_JML_2019_Reading_rate.pdf

def msttr_helper(lst: list, segment_size: float = np.nan) -> float:
    """Mean segmental type-token ratio (MSTTR) of a token list.

    The tokens are lower-cased and cut into consecutive, non-overlapping
    segments of ``segment_size`` tokens. The type-token ratio (distinct
    tokens / tokens) of each segment is computed and the ratios are averaged.
    A shorter trailing segment is kept and scored against its own length.

    Parameters
    ----------
    lst : list of str
        Tokens of one text.
    segment_size : int, optional
        Segment length. NaN (the default) or None means "use the whole list
        as a single segment".

    Returns
    -------
    float
        Average segment TTR, or NaN when ``lst`` is empty.

    Raises
    ------
    ValueError
        If ``segment_size`` is smaller than 1.
    """
    # An empty text has no ratio; return NaN instead of dividing by zero.
    if len(lst) == 0:
        return float("nan")
    # `np.nan` (lower case) is used because the `np.NaN` alias no longer exists in NumPy 2.
    if segment_size is None or np.isnan(segment_size):
        segment_size = len(lst)
    segment_size = int(segment_size)
    if segment_size < 1:
        raise ValueError("segment_size must be at least 1")

    tokens = [token.lower() for token in lst]
    segments = [
        tokens[start:start + segment_size]
        for start in range(0, len(tokens), segment_size)
    ]
    # Full segments have len(segment) == segment_size, so one expression covers
    # both the full segments and the shorter trailing one.
    ratios = [len(set(segment)) / len(segment) for segment in segments]
    return float(np.mean(ratios))

# MSTTR features for segment sizes of 5, 3 and 2 tokens.
WikiLarge_Train_df['5gram_msttr'] = WikiLarge_Train_df['og_split'].parallel_apply(
    lambda tokens: msttr_helper(tokens, segment_size=5)
)
WikiLarge_Train_df['3gram_msttr'] = WikiLarge_Train_df['og_split'].parallel_apply(
    lambda tokens: msttr_helper(tokens, segment_size=3)
)
WikiLarge_Train_df['2gram_msttr'] = WikiLarge_Train_df['og_split'].parallel_apply(
    lambda tokens: msttr_helper(tokens, segment_size=2)
)

# WikiLarge_Train_df[['5gram_msttr', '3gram_msttr', '2gram_msttr']]

In [None]:
# MATTR is the average TTR for all possible overlapping segments of equal size. 
# Assuming a 300 wpm average reading rate, we can assume a 5 wps = 300 wpm / 60 s reading rate. 
# Therefore, using a segment size of 5 is an appropriate window.
# http://crr.ugent.be/papers/Brysbaert_JML_2019_Reading_rate.pdf

def mattr_helper(lst: list, segment_size: float = np.nan) -> float:
    """Moving-average type-token ratio (MATTR) of a token list.

    The tokens are lower-cased and a window of ``segment_size`` tokens is slid
    over them one token at a time. The type-token ratio (distinct tokens /
    tokens) of every full window is computed and the ratios are averaged.
    Only windows of equal size are used: truncated windows at the end of the
    text are not scored, because short windows have a TTR close to 1 and would
    bias the average upwards. When the text is shorter than the window, the
    whole text forms the single window.

    Parameters
    ----------
    lst : list of str
        Tokens of one text.
    segment_size : int, optional
        Window length. NaN (the default) or None means "use the whole list as
        the window", which yields the plain (lower-cased) TTR of the text.

    Returns
    -------
    float
        Average window TTR, or NaN when ``lst`` is empty.

    Raises
    ------
    ValueError
        If ``segment_size`` is smaller than 1.
    """
    # An empty text has no ratio; return NaN without a NumPy runtime warning.
    if len(lst) == 0:
        return float("nan")
    # `np.nan` (lower case) is used because the `np.NaN` alias no longer exists in NumPy 2.
    if segment_size is None or np.isnan(segment_size):
        segment_size = len(lst)
    segment_size = int(segment_size)
    if segment_size < 1:
        raise ValueError("segment_size must be at least 1")

    tokens = [token.lower() for token in lst]
    # Texts shorter than the requested window are scored as one window.
    window = min(segment_size, len(tokens))
    ratios = [
        len(set(tokens[start:start + window])) / window
        for start in range(len(tokens) - window + 1)
    ]
    return float(np.mean(ratios))

# MATTR with a five-token window, plus a variant whose window defaults to each row's own token count.
WikiLarge_Train_df['5gram_mattr'] = WikiLarge_Train_df['og_split'].parallel_apply(
    lambda tokens: mattr_helper(tokens, segment_size=5)
)
WikiLarge_Train_df['len_ngram_mattr'] = WikiLarge_Train_df['og_split'].parallel_apply(
    lambda tokens: mattr_helper(tokens)
)


# WikiLarge_Train_df[['5gram_mattr', 'len_ngram_mattr']]

In [None]:
# This helper calculates the moving average of syllables.
# Assuming a 300 wpm average reading rate, we can assume a 5 wps = 300 wpm / 60 s reading rate. 
# Therefore, using a segment size of 5 is an appropriate window.
# http://crr.ugent.be/papers/Brysbaert_JML_2019_Reading_rate.pdf

def ma_syl_helper(lst: list, segment_size: float = np.nan) -> float:
    """Average of the moving-window mean syllable count of a text.

    A window of ``segment_size`` values is started at every position of
    ``lst``; the mean of each window is taken and those means are averaged.
    Windows that start near the end of the list are shorter than
    ``segment_size`` and are averaged over the values they actually contain,
    so the last values of the list contribute to more (and shorter) windows.

    Parameters
    ----------
    lst : list of numbers
        Per-token syllable counts of one text.
    segment_size : int, optional
        Window length. NaN (the default) or None means "use the length of the
        list", i.e. the windows are all suffixes of the list.

    Returns
    -------
    float
        Average of the window means, or NaN when ``lst`` is empty.

    Raises
    ------
    ValueError
        If ``segment_size`` is smaller than 1.
    """
    # An empty text has no average; return NaN without a NumPy runtime warning.
    if len(lst) == 0:
        return float("nan")
    # `np.nan` (lower case) is used because the `np.NaN` alias no longer exists in NumPy 2.
    if segment_size is None or np.isnan(segment_size):
        segment_size = len(lst)
    segment_size = int(segment_size)
    if segment_size < 1:
        raise ValueError("segment_size must be at least 1")

    windows = [lst[start:start + segment_size] for start in range(len(lst))]
    # Dividing by len(window) covers both the full windows and the truncated trailing ones.
    window_means = [sum(window) / len(window) for window in windows]
    return float(np.mean(window_means))

# Syllable features: moving-window averages (five-value window and row-length window),
# plus the plain mean and standard deviation of the per-token syllable counts.
WikiLarge_Train_df['5gram_ma_syl'] = WikiLarge_Train_df['syl_list'].parallel_apply(
    lambda counts: ma_syl_helper(counts, segment_size=5)
)
WikiLarge_Train_df['len_ngram_ma_syl'] = WikiLarge_Train_df['syl_list'].parallel_apply(
    lambda counts: ma_syl_helper(counts)
)
WikiLarge_Train_df['syl_mean'] = WikiLarge_Train_df['syl_list'].parallel_apply(
    lambda counts: np.mean(counts)
)
WikiLarge_Train_df['syl_std'] = WikiLarge_Train_df['syl_list'].parallel_apply(
    lambda counts: np.std(counts)
)


# WikiLarge_Train_df[['5gram_ma_syl', 'len_ngram_ma_syl', 'syl_mean', 'syl_std']]

In [None]:
# Export every column from the fourth one onwards (the first three are skipped by position).
# NOTE(review): presumably the skipped columns are the raw inputs and the token list — confirm against the CSV schema.
score_features = WikiLarge_Train_df.iloc[:, 3:]
score_features.to_csv('score_features.csv')