-
Notifications
You must be signed in to change notification settings - Fork 0
/
AddingReadibility.py
63 lines (60 loc) · 2.56 KB
/
AddingReadibility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Date: 04/21/2017
# Updated DFarnand 4/27/2017
from TextCleaning import TextCleaning
import re
import string
from collections import Counter
from textstat.textstat import textstat
import pandas as pd
import numpy as np
def AddReadabilityMeasures(filename, score_threshold=2):
    """Load ``<filename>.csv`` and append text-derived feature columns.

    Every entry of the ``Body`` column is passed through ``TextCleaning``;
    selected values of its result are stored as new columns, and a binary
    ``ScoreLabel`` column is derived from the ``Score`` column.

    Parameters
    ----------
    filename : str
        Path of the input CSV *without* the ``.csv`` extension (a relative
        path is resolved against the current working directory). The first
        column of the file is used as the index; the file must provide
        ``Body`` and ``Score`` columns.
    score_threshold : number, optional
        Rows whose ``Score`` is strictly greater than this value get
        ``ScoreLabel`` 1, all other rows get 0. Defaults to 2.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with these columns added, in this order:
        ``Flesch_Reading_Ease_Value``, ``Coleman_Liau_Index_Value``,
        ``Dale_Chall_Readability_Score``, ``Code_Count``, ``Latex_Count``,
        ``Clean_Text``, ``Text_Length``, ``Punc_Rate``, ``Polarity``,
        ``Subjectivity`` and ``ScoreLabel``.
    """
    df = pd.read_csv(filename + '.csv', index_col=0)

    # (output column, key in the TextCleaning result), listed in the order
    # the columns are appended to the frame.
    feature_columns = (
        ('Flesch_Reading_Ease_Value', 'flesch_reading_ease'),
        ('Coleman_Liau_Index_Value', 'coleman_liau_index'),
        ('Dale_Chall_Readability_Score', 'dale_chall_readability_score'),
        ('Code_Count', 'codeLen'),
        ('Latex_Count', 'latLen'),
        ('Clean_Text', 'text'),
        ('Text_Length', 'textLen'),
        ('Punc_Rate', 'punRate'),
        ('Polarity', 'polarity'),
        ('Subjectivity', 'subjectivity'),
    )
    collected = {column: [] for column, _ in feature_columns}

    for text in df['Body']:
        try:
            cleaned = TextCleaning(text)
        except TypeError:
            # Presumably a non-string body (e.g. a missing value read as
            # NaN); fall back to the result for an empty string so the row
            # still receives placeholder values.
            print("TypeError (probably reduced to bad text):", text)
            cleaned = TextCleaning('')
        # Pull the values out immediately rather than keeping a reference to
        # the result object of each call.
        for column, key in feature_columns:
            collected[column].append(cleaned[key])

    for column, _ in feature_columns:
        df[column] = collected[column]

    # Compare the raw scores directly. log10 is monotonic, so this yields the
    # same labels as comparing log10(Score) with log10(threshold), but it does
    # not emit numpy RuntimeWarnings for zero or negative scores (which were,
    # and still are, labelled 0). Multiplying by 1 turns the booleans into
    # integers.
    df['ScoreLabel'] = (df['Score'] > score_threshold) * 1
    return df
## NOTE: these statements are live (not commented out): they run every time
## this file is executed or imported.
# Build the augmented frame for each input CSV; the inputs are read in the
# order ai, stats, iot.
dataFrameAi, dataFrameStats, dataFrameIot = map(
    AddReadabilityMeasures,
    ('ai_posts', 'stats_posts', 'iot_posts'),
)

# Write each frame to its own CSV without the index column; the outputs are
# written in the order ai, iot, stats.
for _frame, _out_path in (
    (dataFrameAi, 's2_ai_posts_with_readibility_measures.csv'),
    (dataFrameIot, 's2_iot_posts_with_readibility_measures.csv'),
    (dataFrameStats, 's2_stats_posts_with_readibility_measures.csv'),
):
    _frame.to_csv(_out_path, index=False)
del _frame, _out_path  # keep the loop helpers out of the module namespace