In [2]:
import pandas as pd
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

In [3]:
df = pd.read_pickle('childrens_book_df.pkl')

In [4]:
df.head()

Unnamed: 0,Title,Author,Chapter,NLP_Text,Readable_Text
0,Prince Prigio,Andrew Lang,I,lcb chapter heading picture p1jpg rcb fairies ...,How the Fairies were not Invited to Court. Onc...
1,Prince Prigio,Andrew Lang,II,lcb chapter heading picture p9jpg rcb prince p...,"Prince Prigio and his Family. Well, the little..."
2,Prince Prigio,Andrew Lang,III,firedrake people like prigio dear papa king gr...,About the Firedrake. Of all the people who did...
3,Prince Prigio,Andrew Lang,IV,prince prigio deserted everybody meanwhile pri...,How Prince Prigio was Deserted by Everybody. M...
4,Prince Prigio,Andrew Lang,V,prince prigio found garret prince walked room ...,What Prince Prigio found in the Garret. The pr...


In [5]:
#column for readable text passage length, for counting relative frequencies of phonics
df["Readable_Passage_Len"] = df["Readable_Text"].str.len()

In [6]:
#initialize dict to store relevant letter combinations for sounds
phonics_dict = {
    "diphthongs": ["oo", "ou", "ow", "oi", "oy", "aw", "au", "ie", "igh", "ay", "ee"], 
    "ending blends":["mp", "ng", "nt", "nk", "nd", "sk", "st"], 
    "l blends": ["bl", "cl", "gl", "pl", "sl"], 
    "r blends": ["br", "cr", "dr", "fr", "gr", "pr", "tr"], 
    "s blends": ["sc", "sk", "sl", "sm", "sn", "sp", "st", "str", "scr", "spl"], 
    "digraphs": ["ch", "ph", "sh", "wh", "tch", "th", "gh"], 
    "soft c": ["ce", "ci", "cy"], 
    "soft g": ["ge", "dge", "gi", "gy"], 
    "silent letters": ["kn", "wr", "gn"]
}

In [7]:
#test frequency concept
sound = 'oo'

df[sound] = (df["Readable_Text"].str.count(sound))/df["Readable_Passage_Len"]

In [8]:
#get all dictionary keys into list to loop through
phonics = phonics_dict.values()
phonics_flat = [item for sublist in phonics for item in sublist]

In [47]:
phonics_flat

['oo',
 'ou',
 'ow',
 'oi',
 'oy',
 'aw',
 'au',
 'ie',
 'igh',
 'ay',
 'ee',
 'mp',
 'ng',
 'nt',
 'nk',
 'nd',
 'sk',
 'st',
 'bl',
 'cl',
 'gl',
 'pl',
 'sl',
 'br',
 'cr',
 'dr',
 'fr',
 'gr',
 'pr',
 'tr',
 'sc',
 'sk',
 'sl',
 'sm',
 'sn',
 'sp',
 'st',
 'str',
 'scr',
 'spl',
 'ch',
 'ph',
 'sh',
 'wh',
 'tch',
 'th',
 'gh',
 'ce',
 'ci',
 'cy',
 'ge',
 'dge',
 'gi',
 'gy',
 'kn',
 'wr',
 'gn']

In [9]:
#create columns for each sound
for sound in phonics_flat:
    df[sound] = (df["Readable_Text"].str.count(sound))/df["Readable_Passage_Len"]

In [10]:
df

Unnamed: 0,Title,Author,Chapter,NLP_Text,Readable_Text,Readable_Passage_Len,oo,ou,ow,oi,...,ce,ci,cy,ge,dge,gi,gy,kn,wr,gn
0,Prince Prigio,Andrew Lang,I,lcb chapter heading picture p1jpg rcb fairies ...,How the Fairies were not Invited to Court. Onc...,5500,0.002727,0.008182,0.001818,0.000364,...,0.003636,0.000000,0.000000,0.000364,0.000000,0.000545,0.000000,0.000545,0.000000,0.000545
1,Prince Prigio,Andrew Lang,II,lcb chapter heading picture p9jpg rcb prince p...,"Prince Prigio and his Family. Well, the little...",3210,0.004050,0.010280,0.004984,0.000000,...,0.005296,0.000623,0.000000,0.001558,0.000000,0.001869,0.000000,0.000623,0.000312,0.000312
2,Prince Prigio,Andrew Lang,III,firedrake people like prigio dear papa king gr...,About the Firedrake. Of all the people who did...,8008,0.001873,0.012488,0.002997,0.000874,...,0.003621,0.000125,0.000125,0.002498,0.000000,0.002872,0.000000,0.000500,0.000250,0.000250
3,Prince Prigio,Andrew Lang,IV,prince prigio deserted everybody meanwhile pri...,How Prince Prigio was Deserted by Everybody. M...,4607,0.001954,0.008465,0.004124,0.000434,...,0.006295,0.000434,0.000000,0.000868,0.000217,0.002388,0.000217,0.000868,0.000217,0.000217
4,Prince Prigio,Andrew Lang,V,prince prigio found garret prince walked room ...,What Prince Prigio found in the Garret. The pr...,1412,0.007790,0.005666,0.003541,0.000708,...,0.004249,0.000000,0.000708,0.000000,0.000000,0.000708,0.000000,0.000000,0.000708,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
861,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXIX,making old house home always make love neer fo...,Making Over An Old House A home is always what...,3696,0.009740,0.012175,0.002976,0.001082,...,0.002165,0.000812,0.000000,0.001894,0.000271,0.000812,0.000000,0.000271,0.000000,0.000000
862,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXX,whitefoots enjoy new home home ever mean poor ...,The Whitefoots Enjoy Their New Home No home is...,3307,0.008467,0.007560,0.004233,0.000000,...,0.001210,0.000000,0.000000,0.002419,0.000000,0.000302,0.000000,0.000302,0.000000,0.000000
863,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXXI,whitefoot hurt hurts hardest bear whitefoot wh...,Whitefoot Is Hurt The hurts that hardest are t...,2761,0.008693,0.011590,0.002173,0.000000,...,0.002173,0.000000,0.000000,0.001449,0.000000,0.000724,0.000000,0.000362,0.000000,0.000000
864,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXXII,surprise surprises sometimes great tempted bel...,The Surprise Surprises sometimes are so great ...,3746,0.011479,0.015750,0.003203,0.000534,...,0.001068,0.000267,0.000000,0.000801,0.000000,0.000534,0.000000,0.000534,0.000000,0.000000


In [11]:
#test
df["Readable_Text"][df["ou"] == df["ou"].max()].values

array(["BLACKY THE CROW IS ALL PITY People who think that they are fooling others very often discover that they have been fooling themselves. Bowser the Hound. To have seen and heard Blacky the Crow as he talked to Reddy Fox, you would have thought that there was nothing under the sun in his heart or mind but pity. Yes, Sir, '' said he, I certainly would be tempted to show you where those fat hens are if you were not too weak. I just ca n't bear to see an old friend starve. It is too bad that those fat hens are so far away. I feel sure that one of them would make you quite yourself again. '' Do n't -- do n't talk about them, '' said Reddy feebly. If I could have just one fat hen that is all I would ask. Are they so very far from here? ''Blacky nodded his head vigorously. Yes, '' said he, they are a long way from here. They are such a long way that I 'm afraid you are too weak to make the journey. If you were quite yourself you could do it nicely, but for one in your condition it is, I 

In [66]:
df.to_pickle('/Users/user/desktop/metis/projects/metis-project-5/childrens_book_df_with_sounds.pkl')

In [12]:
#diphthong column
df["diphthong"] = df[["oo", "ou", "ow", "oi", "oy", "aw", "au", "ie", "igh", "ay", "ee"]].sum(axis=1)

In [13]:
df

Unnamed: 0,Title,Author,Chapter,NLP_Text,Readable_Text,Readable_Passage_Len,oo,ou,ow,oi,...,ci,cy,ge,dge,gi,gy,kn,wr,gn,diphthong
0,Prince Prigio,Andrew Lang,I,lcb chapter heading picture p1jpg rcb fairies ...,How the Fairies were not Invited to Court. Onc...,5500,0.002727,0.008182,0.001818,0.000364,...,0.000000,0.000000,0.000364,0.000000,0.000545,0.000000,0.000545,0.000000,0.000545,0.026364
1,Prince Prigio,Andrew Lang,II,lcb chapter heading picture p9jpg rcb prince p...,"Prince Prigio and his Family. Well, the little...",3210,0.004050,0.010280,0.004984,0.000000,...,0.000623,0.000000,0.001558,0.000000,0.001869,0.000000,0.000623,0.000312,0.000312,0.026168
2,Prince Prigio,Andrew Lang,III,firedrake people like prigio dear papa king gr...,About the Firedrake. Of all the people who did...,8008,0.001873,0.012488,0.002997,0.000874,...,0.000125,0.000125,0.002498,0.000000,0.002872,0.000000,0.000500,0.000250,0.000250,0.026099
3,Prince Prigio,Andrew Lang,IV,prince prigio deserted everybody meanwhile pri...,How Prince Prigio was Deserted by Everybody. M...,4607,0.001954,0.008465,0.004124,0.000434,...,0.000434,0.000000,0.000868,0.000217,0.002388,0.000217,0.000868,0.000217,0.000217,0.026264
4,Prince Prigio,Andrew Lang,V,prince prigio found garret prince walked room ...,What Prince Prigio found in the Garret. The pr...,1412,0.007790,0.005666,0.003541,0.000708,...,0.000000,0.000708,0.000000,0.000000,0.000708,0.000000,0.000000,0.000708,0.000000,0.021955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
861,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXIX,making old house home always make love neer fo...,Making Over An Old House A home is always what...,3696,0.009740,0.012175,0.002976,0.001082,...,0.000812,0.000000,0.001894,0.000271,0.000812,0.000000,0.000271,0.000000,0.000000,0.034903
862,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXX,whitefoots enjoy new home home ever mean poor ...,The Whitefoots Enjoy Their New Home No home is...,3307,0.008467,0.007560,0.004233,0.000000,...,0.000000,0.000000,0.002419,0.000000,0.000302,0.000000,0.000302,0.000000,0.000000,0.031751
863,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXXI,whitefoot hurt hurts hardest bear whitefoot wh...,Whitefoot Is Hurt The hurts that hardest are t...,2761,0.008693,0.011590,0.002173,0.000000,...,0.000000,0.000000,0.001449,0.000000,0.000724,0.000000,0.000362,0.000000,0.000000,0.034046
864,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXXII,surprise surprises sometimes great tempted bel...,The Surprise Surprises sometimes are so great ...,3746,0.011479,0.015750,0.003203,0.000534,...,0.000267,0.000000,0.000801,0.000000,0.000534,0.000000,0.000534,0.000000,0.000000,0.040310


In [39]:
df_long = df[df.Readable_Passage_Len > 1000]

In [40]:
df_long

Unnamed: 0,Title,Author,Chapter,NLP_Text,Readable_Text,Readable_Passage_Len,oo,ou,ow,oi,...,ci,cy,ge,dge,gi,gy,kn,wr,gn,diphthong
0,Prince Prigio,Andrew Lang,I,lcb chapter heading picture p1jpg rcb fairies ...,How the Fairies were not Invited to Court. Onc...,5500,0.002727,0.008182,0.001818,0.000364,...,0.000000,0.000000,0.000364,0.000000,0.000545,0.000000,0.000545,0.000000,0.000545,0.026364
1,Prince Prigio,Andrew Lang,II,lcb chapter heading picture p9jpg rcb prince p...,"Prince Prigio and his Family. Well, the little...",3210,0.004050,0.010280,0.004984,0.000000,...,0.000623,0.000000,0.001558,0.000000,0.001869,0.000000,0.000623,0.000312,0.000312,0.026168
2,Prince Prigio,Andrew Lang,III,firedrake people like prigio dear papa king gr...,About the Firedrake. Of all the people who did...,8008,0.001873,0.012488,0.002997,0.000874,...,0.000125,0.000125,0.002498,0.000000,0.002872,0.000000,0.000500,0.000250,0.000250,0.026099
3,Prince Prigio,Andrew Lang,IV,prince prigio deserted everybody meanwhile pri...,How Prince Prigio was Deserted by Everybody. M...,4607,0.001954,0.008465,0.004124,0.000434,...,0.000434,0.000000,0.000868,0.000217,0.002388,0.000217,0.000868,0.000217,0.000217,0.026264
4,Prince Prigio,Andrew Lang,V,prince prigio found garret prince walked room ...,What Prince Prigio found in the Garret. The pr...,1412,0.007790,0.005666,0.003541,0.000708,...,0.000000,0.000708,0.000000,0.000000,0.000708,0.000000,0.000000,0.000708,0.000000,0.021955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
861,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXIX,making old house home always make love neer fo...,Making Over An Old House A home is always what...,3696,0.009740,0.012175,0.002976,0.001082,...,0.000812,0.000000,0.001894,0.000271,0.000812,0.000000,0.000271,0.000000,0.000000,0.034903
862,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXX,whitefoots enjoy new home home ever mean poor ...,The Whitefoots Enjoy Their New Home No home is...,3307,0.008467,0.007560,0.004233,0.000000,...,0.000000,0.000000,0.002419,0.000000,0.000302,0.000000,0.000302,0.000000,0.000000,0.031751
863,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXXI,whitefoot hurt hurts hardest bear whitefoot wh...,Whitefoot Is Hurt The hurts that hardest are t...,2761,0.008693,0.011590,0.002173,0.000000,...,0.000000,0.000000,0.001449,0.000000,0.000724,0.000000,0.000362,0.000000,0.000000,0.034046
864,Whitefoot the Wood Mouse,Thornton Waldo Burgess,XXXII,surprise surprises sometimes great tempted bel...,The Surprise Surprises sometimes are so great ...,3746,0.011479,0.015750,0.003203,0.000534,...,0.000267,0.000000,0.000801,0.000000,0.000534,0.000000,0.000534,0.000000,0.000000,0.040310


In [45]:
df_long["Readable_Text"][df_long["diphthong"] == df_long["diphthong"].min()].values

array(["SOME ONE FOOLS OLD JED THUMPER You can not judge a person 's temper by his size. There is more meanness in the head of a Weasel than in the whole of a Bear. Peter Rabbit. Old Jed Thumper sat in his bull-briar castle in the middle of the Old Pasture, scowling fiercely and muttering to himself. He was very angry, was Old Jed Thumper. He was so angry that presently he stopped muttering and began to chew rapidly on nothing at all but his temper, which is a way angry Rabbits have. The more he chewed his temper, the angrier he grew. He was big and stout and strong and gray. He had lived so long in the Old Pasture that he felt that it belonged to him and that no other Rabbit had any right there unless he said so. Yet here was a strange Rabbit who had had the impudence to come up from the Green Meadows and refused to be driven away. Such impudence!Of course it was Peter Rabbit of whom Old Jed Thumper was thinking. It was two days since he had caught a glimpse of Peter, but he knew that

In [53]:
df.iloc[5:6].Readable_Text.values

array(["What Happened to Prince Prigio in Town. By this time the prince was very hungry. The town was just three miles off; but he had such a royal appetite, that he did not like to waste it on bad cookery, and the people of the royal town were bad cooks. I wish I were in The Bear, ' at Gluckstein, '' said he to himself; for he remembered that there was a very good cook there. But, then, the town was twenty-one leagues away -- sixty-three long miles!No sooner had the prince said this, and taken just three steps, than he found himself at the door of the Bear Inn '' at Gluckstein! This is the most extraordinary dream, '' said he to himself; for he was far too clever, of course, to believe in seven-league boots. Yet he had a pair on at that very moment, and it was they which had carried him in three strides from the palace to Gluckstein!The truth is, that the prince, in looking about the palace for clothes, had found his way into that very old lumber-room where the magical gifts of the fa