In [0]:
import pandas as pd
import os
import glob
import numpy as np

In [0]:
import warnings
warnings.filterwarnings('ignore')

In [32]:
!pip install textstat




In [33]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
filelist = []
filesList = []
my_dir='/content/drive/My Drive/DeepLearning/DataScienceAssignment/StoryTextFiles'
os.chdir( my_dir )

In [0]:
file_names = os.listdir('/content/drive/My Drive/DeepLearning/DataScienceAssignment/StoryTextFiles/')
# Create Dictionary for File Name and Text
file_name_and_text = {}
for file in file_names:
    with open('/content/drive/My Drive/DeepLearning/DataScienceAssignment/StoryTextFiles/' + file, "r") as target_file:
         file_name_and_text[file] = target_file.read()
file_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'file_name', 0: 'text'}))

In [36]:
file_data.shape

(556, 2)

In [37]:
file_data.head()

Unnamed: 0,file_name,text
0,103.txt,A cloud rolled along merrily in the sky.The pe...
1,10.txt,On a cliff there lived a young vulture with hi...
2,100.txt,Have you met my Aaji? Everyone says that my Aa...
3,102.txt,“A perfect day for a walk!” thinks Mr. Centipe...
4,105.txt,Amma is visiting Nani. Tara and Mini are waiti...


In [0]:
import spacy 
from textstat.textstat import textstatistics, easy_word_set, legacy_round 

In [0]:
# Splits the text into sentences, using  
def break_sentences(text): 
    nlp = spacy.load('en') 
    doc = nlp(text) 
    return [sent.string.strip() for sent in doc]

In [0]:
# Returns Number of Words in the text 
def word_count(text): 
    sentences = break_sentences(text) 
    words = 0
    for sentence in sentences: 
        words += len([token for token in sentence]) 
    return words

In [0]:
# Returns the number of sentences in the text 
def sentence_count(text): 
    sentences = break_sentences(text) 
    return len(sentences)

In [0]:
# Returns average sentence length 
def avg_sentence_length(text): 
    words = word_count(text) 
    sentences = sentence_count(text)
    average_sentence_length=0
    if(sentences>0):
      average_sentence_length = float(words / sentences) 
    return average_sentence_length

In [0]:
# Textstat is a python package, to calculate statistics from  
# text to determine readability,  
# complexity and grade level of a particular corpus. 
def syllables_count(word): 
    return textstatistics().syllable_count(word)

In [0]:
# Returns the average number of syllables per 
# word in the text 
def avg_syllables_per_word(text): 
    syllable = syllables_count(text) 
    words = word_count(text) 
    ASPW=0
    if(words>0):
      ASPW = float(syllable) / float(words) 
    return legacy_round(ASPW, 1) 

In [0]:
# Return total Difficult Words in a text 
def difficult_words(text): 
  
    # Find all words in the text 
    words = [] 
    sentences = break_sentences(text) 
    for sentence in sentences: 
        words += [str(token) for token in sentence] 
  
    # difficult words are those with syllables >= 2 
    # easy_word_set is provide by Textstat as  
    # a list of common words 
    diff_words_set = set() 
      
    for word in words: 
        syllable_count = syllables_count(word) 
        if word not in easy_word_set and syllable_count >= 2: 
            diff_words_set.add(word) 
  
    return len(diff_words_set) 

In [0]:
# A word is polysyllablic if it has more than 3 syllables 
# this functions returns the number of all such words  
# present in the text 
def poly_syllable_count(text): 
    count = 0
    words = [] 
    sentences = break_sentences(text) 
    for sentence in sentences: 
        words += [token for token in sentence] 
      
  
    for word in words: 
        syllable_count = syllables_count(word) 
        if syllable_count >= 3: 
            count += 1
    return count

In [0]:
def dale_chall_readability_score(text): 
    """ 
        Implements Dale Challe Formula: 
        Raw score = 0.1579*(PDW) + 0.0496*(ASL) + 3.6365 
        Here, 
            PDW = Percentage of difficult words. 
            ASL = Average sentence length 
    """
    per=0
    words= word_count(text)
    difficult_word=difficult_words(text)
    # Number of words not termed as difficult words 
    count = words -  difficult_word
    if words > 0: 
        # Percentage of words not on difficult word list 
        per = float(count) / float(words) * 100
      
    # diff_words stores percentage of difficult words 
    diff_words = 100 - per 
  
    raw_score = (0.1579 * diff_words) + (0.0496 * avg_sentence_length(text)) 
      
    # If Percentage of Difficult Words is greater than 5 %, then; 
    # Adjusted Score = Raw Score + 3.6365, 
    # otherwise Adjusted Score = Raw Score 
  
    if diff_words > 5:        
  
        raw_score += 3.6365
          
    return legacy_round(raw_score, 2)

In [0]:
file_data1=file_data;

In [0]:
#file_data['dale_chall_readability_score']=0
score=[]

In [0]:
for ind in file_data.index: 
  story=file_data['text'][ind]
  #file_data['dale_chall_readability_score'][ind]=dale_chall_readability_score(story)
  score = np.append(score, dale_chall_readability_score(story))

  #print("file_data['dale_chall_readability_score'][ind]",file_data['dale_chall_readability_score'][ind])

file_data['dale_chall_readability_score']=score;

In [51]:
file_data

Unnamed: 0,file_name,text,dale_chall_readability_score
0,103.txt,A cloud rolled along merrily in the sky.The pe...,0.18
1,10.txt,On a cliff there lived a young vulture with hi...,0.18
2,100.txt,Have you met my Aaji? Everyone says that my Aa...,0.17
3,102.txt,“A perfect day for a walk!” thinks Mr. Centipe...,0.17
4,105.txt,Amma is visiting Nani. Tara and Mini are waiti...,0.17
...,...,...,...
551,453.txt,"2One day, an ant went looking for food. She fo...",0.18
552,452.txt,TextTHE ENDSUPERCOW This story is entirely fic...,0.21
553,450.txt,I could not believe my eyes when I saw her for...,0.18
554,451.txt,There once was a boy who could run and jump an...,0.16
