ETL script that computes textstat metrics on the text of each Wikipedia article. (textstat is a Python package that calculates various readability and sentence-complexity metrics, such as the Dale–Chall formula and the Flesch Reading Ease score.)

In [1]:
# Need to install textstat
!pip install textstat



In [2]:
import pandas as pd
import textstat 

In [3]:
# Read the comma-separated, UTF-8 encoded article table into a DataFrame.
article_df = pd.read_csv(
    'Results/article_text1.csv',
    sep=',',
    encoding='utf-8',
)

In [4]:
# Show (rows, columns) of the table that was just read from CSV.
article_df.shape

(19815, 4)

In [5]:
# Preview the first five rows (five is the default for head()).
article_df.head()

Unnamed: 0,title,text,wiki_link,redirect
0,AccessibleComputing,[],['Computer accessibility'],T
1,Anarchism,"['rejects', 'deemed', 'unjust', 'advocates', '...","['Anti-authoritarianism', 'Political philosoph...",F
2,AfghanistanHistory,[],['History of Afghanistan'],T
3,AfghanistanGeography,[],['Geography of Afghanistan'],T
4,AfghanistanPeople,[],['Demographics of Afghanistan'],T


In [6]:
# Tell textstat to use its English ("en") language setting for the scores below.
textstat.set_lang("en")

In [7]:
def textstat_stats(text):
    """Compute eight textstat readability scores for one piece of text.

    Parameters
    ----------
    text : str
        The text to score. Any non-string value (for example the float NaN
        that pandas uses for an empty CSV cell, or None) is treated as
        missing instead of being passed to textstat.

    Returns
    -------
    pandas.Series
        One value per metric, indexed in this order:
        'difficulty' (flesch_reading_ease), 'grade_difficulty'
        (flesch_kincaid_grade), 'gfog' (gunning_fog), 'smog' (smog_index),
        'ari' (automated_readability_index), 'cli' (coleman_liau_index),
        'lwf' (linsear_write_formula), 'dcrs' (dale_chall_readability_score).
        Every value is NaN when ``text`` is missing.
    """
    # A single ordered mapping keeps each output label next to the function
    # that produces it, so labels and values cannot get out of step.
    scorers = {
        'difficulty': textstat.flesch_reading_ease,
        'grade_difficulty': textstat.flesch_kincaid_grade,
        'gfog': textstat.gunning_fog,
        'smog': textstat.smog_index,
        'ari': textstat.automated_readability_index,
        'cli': textstat.coleman_liau_index,
        'lwf': textstat.linsear_write_formula,
        'dcrs': textstat.dale_chall_readability_score,
    }
    labels = list(scorers)

    # Missing cells must not reach textstat (it expects a string); report
    # them as NaN rather than raising or inventing a score.
    if not isinstance(text, str):
        return pd.Series(float('nan'), index=labels)

    # NOTE(review): these formulas expect running prose with sentence
    # punctuation; confirm the 'text' values passed in are plain prose and
    # not a stringified token list.
    return pd.Series([score(text) for score in scorers.values()], index=labels)

textstat package documentation: https://pypi.org/project/textstat/

In [10]:
# Score every article: each row's 'text' value goes through textstat_stats,
# and the returned Series become the columns of the resulting frame.
temp_df = article_df.apply(
    lambda row: textstat_stats(row['text']),
    axis='columns',
)

In [11]:
# Place the metric columns to the right of the original article columns,
# aligned on the shared row index.
textstat_df = pd.concat(
    (article_df, temp_df),
    axis='columns',
    sort=False,
)

In [16]:
# Preview the first ten rows of the combined table.
textstat_df.iloc[:10]

Unnamed: 0,title,text,wiki_link,redirect,difficulty,grade_difficulty,gfog,smog,ari,cli,lwf,dcrs
0,AccessibleComputing,[],['Computer accessibility'],T,206.84,-15.7,0.0,0.0,0.0,-15.81,-0.5,0.0
1,Anarchism,"['rejects', 'deemed', 'unjust', 'advocates', '...","['Anti-authoritarianism', 'Political philosoph...",F,-3604.54,1399.2,1429.44,0.0,1805.6,27.82,89.0,185.28
2,AfghanistanHistory,[],['History of Afghanistan'],T,206.84,-15.7,0.0,0.0,0.0,-15.81,-0.5,0.0
3,AfghanistanGeography,[],['Geography of Afghanistan'],T,206.84,-15.7,0.0,0.0,0.0,-15.81,-0.5,0.0
4,AfghanistanPeople,[],['Demographics of Afghanistan'],T,206.84,-15.7,0.0,0.0,0.0,-15.81,-0.5,0.0
5,AfghanistanCommunications,[],['Communications in Afghanistan'],T,206.84,-15.7,0.0,0.0,0.0,-15.81,-0.5,0.0
6,AfghanistanTransportations,[],['Transport in Afghanistan'],T,206.84,-15.7,0.0,0.0,0.0,-15.81,-0.5,0.0
7,AfghanistanMilitary,[],['Afghan Armed Forces'],T,206.84,-15.7,0.0,0.0,0.0,-15.81,-0.5,0.0
8,AfghanistanTransnationalIssues,[],['Foreign relations of Afghanistan'],T,206.84,-15.7,0.0,0.0,0.0,-15.81,-0.5,0.0
9,AssistiveTechnology,[],['Assistive_technology'],T,206.84,-15.7,0.0,0.0,0.0,-15.81,-0.5,0.0


In [14]:
# Persist the combined table; keep the header row and drop the row index.
textstat_df.to_csv(
    r'Results/textstat.csv',
    header=True,
    index=False,
)