# NLP Final Project Code
**Noelle Brown  
MSDS 7337 Section 402**

<a id='Top'></a>

# Contents
### [Retrieving Texts](#Retrieving Texts)
### [Cleaning Texts](#Cleaning Texts)
### [TF-IDF](#TF-IDF)
#### [TF-IDF Readability](#TF-IDF Readability)
#### [TF-IDF Visualizations](#TF-IDF Visualizations)
### [Bag of Words](#Bag of Words)
#### [Bag of Words Readability](#Bag of Words Readability)
#### [Bag of Words Visualizations](#Bag of Words Visualizations)

In [1]:
# Necessary imports
import platform; print(platform.platform())
import sys; print("Python", sys.version)
import nltk; print("nltk", nltk.__version__)
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.corpus import sentiwordnet as swn
import bs4; print("BS4", bs4.__version__)
from bs4 import BeautifulSoup, SoupStrainer
import requests; print("requests", requests.__version__)
import urllib
from urllib.request import urlopen
import re; print("re", re.__version__)
import os; print(os.environ['CONDA_DEFAULT_ENV'])
import numpy as np; print("numpy", np.__version__)
import scipy; print("scipy", scipy.__version__)
from scipy.stats import itemfreq
from scipy.cluster.hierarchy import ward, dendrogram
import copy
import pandas as pd; print("pandas", pd.__version__)
import sklearn; print("sklearn", sklearn.__version__)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import string
import random
import gutenberg
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg.query import get_etexts, get_metadata
import pyLDAvis; print("pyLDAvis", pyLDAvis.__version__)
from pyLDAvis import sklearn
import pyLDAvis.gensim
import gensim; print("gensim", gensim.__version__)
from gensim import corpora
import pickle
import textstat

Darwin-17.5.0-x86_64-i386-64bit
Python 3.6.5 |Anaconda, Inc.| (default, Apr 26 2018, 08:42:37) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
nltk 3.3
BS4 4.6.0
requests 2.18.4
re 2.2.1
base
numpy 1.14.3
scipy 1.1.0
pandas 0.23.0
sklearn 0.19.1
pyLDAvis 2.1.2
gensim 3.6.0


scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


<a id='Retrieving Texts'></a>

### Retrieving Texts

In [2]:
# Read in 100+ texts from Gutenberg readers
# Code from HW2

# main URL for Gutenberg readers
url = "http://www.gutenberg.org/wiki/Children's_Instructional_Books_(Bookshelf)"
r=requests.get(url)
soup=BeautifulSoup(r.text, "lxml")

# get just links to books
book_links = soup.find_all(class_="extiw")
# this code extracts the title id from the links
num_list = []
for book in book_links:
    num = re.findall(r'\d+', str(book)) #get title number
    num_list.append(num[0])

# From https://pypi.org/project/Gutenberg/ to strip headers
text_list = []
for n in num_list:
    n = int(n)
    text = strip_headers(load_etext(n)).strip()
    text_list.append(text)

In [3]:
# Get the titles
title_containers = soup.find_all(class_='extiw')
title_list = []

# extracts just the titles of movies
for link in title_containers:
    title = link.text
    title_list.append(title)

*[Return to top](#Top)*

<a id='Cleaning Texts'></a>

### Cleaning Texts
**The headers were already stripped above, so now I will do as much as I can to remove unnecessary portions of the texts. This is difficult because the texts do not have a consistent format and each have different parts that would ideally need to be removed. The text will not be perfectly clean, but it will be better than what we started with.**

In [4]:
clean_text_list = []
for text in text_list:
    # Remove illustration text - always occurs between []
    # https://stackoverflow.com/questions/14596884/remove-text-between-and-in-python/14598135
    text = re.sub("[\(\[].*?]", "", text)
    text = text.lower()
    text = text.split('\n')
    # Remove lines with "copyright, "mcguffey's, "eclectic," "illustration," "online"
    text = [line for line in text if 'copyright' not in line]
    text = [line for line in text if 'mcguffey' not in line]
    text = [line for line in text if 'eclectic' not in line]
    text = [line for line in text if 'illustration' not in line]
    text = [line for line in text if 'online' not in line]
    # Remove blank lines
    text = [line for line in text if line != '']
    # Remove numbers
    text = re.sub(r'[0-9]+', '', str(text))
    clean_text_list.append(text)

  text = re.sub("[\(\[].*?]", "", text)


In [5]:
# from Sarkar pg. 287 & previous chapters
# normalize & extract features
CONTRACTION_MAP = {"ain't": "is not","aren't": "are not","can't": "cannot","can't've": "cannot have","'cause": "because","could've": "could have",
                   "couldn't": "could not","couldn't've": "could not have","didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not",
                   "hadn't've": "had not have","hasn't": "has not","haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will",
                   "he'll've": "he he will have","he's": "he is","how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
                   "I'd": "I would","I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","i'd": "i would",
                   "i'd've": "i would have","i'll": "i will","i'll've": "i will have","i'm": "i am","i've": "i have","isn't": "is not","it'd": "it would",
                   "it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is","let's": "let us","ma'am": "madam","mayn't": "may not",
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have","must've": "must have","mustn't": "must not",
                   "mustn't've": "must not have","needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not","oughtn't've": "ought not have",
                   "shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have","she'd": "she would","she'd've": "she would have","she'll": "she will",
                   "she'll've": "she will have","she's": "she is","should've": "should have","shouldn't": "should not","shouldn't've": "should not have","so've": "so have",
                   "so's": "so as","that'd": "that would","that'd've": "that would have","that's": "that is","there'd": "there would","there'd've": "there would have",
                   "there's": "there is","they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are",
                   "they've": "they have","to've": "to have","wasn't": "was not","we'd": "we would","we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                   "we're": "we are","we've": "we have","weren't": "were not","what'll": "what will","what'll've": "what will have","what're": "what are","what's": "what is","what've": "what have",
                   "when's": "when is","when've": "when have","where'd": "where did","where's": "where is","where've": "where have","who'll": "who will","who'll've": "who will have",
                   "who's": "who is","who've": "who have","why's": "why is","why've": "why have","will've": "will have","won't": "will not","won't've": "will not have",
                   "would've": "would have","wouldn't": "would not","wouldn't've": "would not have","y'all": "you all","y'all'd": "you all would","y'all'd've": "you all would have",
                   "y'all're": "you all are","y'all've": "you all have","you'd": "you would","you'd've": "you would have","you'll": "you will","you'll've": "you will have",
                   "you're": "you are","you've": "you have"}

# Stop words list - first is from NLTK, then add custom words
STOP_WORDS = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", 
              "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 
              'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
              'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 
              'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 
              'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 
              'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 
              'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 
              'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 
              'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 
              't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 
              've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', 
              "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', 
              "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', 
              "weren't", 'won', "won't", 'wouldn', "wouldn't", 'went', 'de', 'mr', 'came', 'la', 'took', 'unless', 'el',
              'vii','v','i','ii','iii','iv','vi','viii','ix','x', 'de', 'en', 'le', 'à', 'et', 'du', '“', '”', 'f', 'al',
             'un','une','one','e','que','e','new','said','two','would','ce','il','est','se','au'}

def expand_contractions(sentence, contraction_mapping):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                            flags = re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expand_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())
        expanded_contraction = first_char+expand_contraction[1:]
        return expanded_contraction
    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence

def remove_stop_words(text):
    return [word for word in text if word not in STOP_WORDS]
    
def lemmatize(words):
    lmtzr = WordNetLemmatizer()
    lem_sentence=[]
    for word in words:
        lem_sentence.append(lmtzr.lemmatize(word))
        lem_sentence.append(" ")
    return "".join(lem_sentence)

def no_punc(s):
    exclude = set(string.punctuation)
    return ''.join(ch for ch in s if ch not in exclude) 

In [6]:
norm_text = []
for text in clean_text_list:
    # Remove stop words, expand contractions, lemmatize, remove punctuation, tokenize
    text = str(text).rstrip('\n')
    text = no_punc(text)
    text = expand_contractions(str(text), CONTRACTION_MAP)
    text = remove_stop_words(word_tokenize(str(text)))
    text = lemmatize(text)
    norm_text.append(text)

In [7]:
# Build feature matrix to get tf-idf values from Sarkar pg. 270
def build_feature_matrix(documents, feature_type='frequency', ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    feature_type = feature_type.lower().strip()
    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values:'binary', 'frequency','tfidf'")
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix

In [8]:
vectorizer, feature_matrix = build_feature_matrix(norm_text, feature_type='tfidf',
                                                  min_df=0.24, max_df=0.85, ngram_range=(1, 2))

*[Return to top](#Top)*

### Topic Modeling
**I will perform LDA since the texts are long**

<a id='TF-IDF'></a>

#### TF-IDF

In [9]:
# get topics with their terms and weights
def get_topics_terms_weights(weights, feature_names):
    feature_names = np.array(feature_names)
    sorted_indices = np.array([list(row[::-1]) for row in np.argsort(np.abs(weights))])
    sorted_weights = np.array([list(wt[index]) for wt, index in zip(weights,sorted_indices)])
    sorted_terms = np.array([list(feature_names[row]) for row in sorted_indices])
    topics = [np.vstack((terms.T, term_weights.T)).T for terms, term_weights in zip(sorted_terms, sorted_weights)]
    return topics

# print all the topics from a corpus
def print_topics_udf(topics, total_topics=1,weight_threshold=0.0001,display_weights=False,num_terms=None):
    for index in range(total_topics):
        topic = topics[index]
        topic = [(term, float(wt)) for term, wt in topic]
        topic = [(word, round(wt,2)) for word, wt in topic if abs(wt) >= weight_threshold]
        if display_weights:
            print('Topic #'+str(index+1)+' with weights')
            print(topic[:num_terms]) if num_terms else topic
        else:
            print('Topic #'+str(index+1)+' without weights')
            tw = [term for term, wt in topic]
            print(tw[:num_terms]) if num_terms else tw
        print

In [10]:
# From Sarkar pg. 244
total_topics = 5
lda = LatentDirichletAllocation(n_components=total_topics, max_iter=100, learning_method="online", learning_offset=50., random_state=42)
lda.fit(feature_matrix)

feature_names = vectorizer.get_feature_names()
weights = lda.components_
topics = get_topics_terms_weights(weights, feature_names)

In [11]:
print_topics_udf(topics=topics, total_topics=total_topics, num_terms=8, display_weights=True)

Topic #1 with weights
[('faroff', 0.22), ('amazement', 0.22), ('row', 0.22), ('hand foot', 0.22), ('mass', 0.22), ('magazine', 0.22), ('else', 0.22), ('swift', 0.22)]
Topic #2 with weights
[('king', 5.28), ('animal', 5.08), ('dog', 4.69), ('father', 4.66), ('lesson', 4.27), ('yes', 3.93), ('friend', 3.69), ('thy', 3.63)]
Topic #3 with weights
[('channel', 0.22), ('minute', 0.22), ('religion', 0.22), ('throat', 0.22), ('joining', 0.22), ('ride', 0.22), ('indignant', 0.22), ('time see', 0.22)]
Topic #4 with weights
[('lovely', 0.22), ('acorn', 0.22), ('average', 0.22), ('tread', 0.22), ('profound', 0.22), ('slice', 0.22), ('lecture', 0.22), ('absorbed', 0.22)]
Topic #5 with weights
[('reading', 0.22), ('organ', 0.22), ('chapter', 0.22), ('may made', 0.22), ('flowing', 0.22), ('charity', 0.22), ('astonished', 0.22), ('come back', 0.22)]


**Dale-Chall Readability Score of texts**  
from: https://pypi.org/project/textstat/  

|Score | Understood by 
|-------------|-----------------------------------------------
|4.9 or lower | average 4th-grade student or lower 
|5.0–5.9 | average 5th or 6th-grade student 
|6.0–6.9 | average 7th or 8th-grade student 
|7.0–7.9 | average 9th or 10th-grade student 
|8.0–8.9 | average 11th or 12th-grade student 
|9.0–9.9 | average 13th to 15th-grade (college) student 

<a id='TF-IDF Readability'></a>

In [12]:
textdf_tfidf = pd.DataFrame(columns = ('Title','Topic Probability','Readability Group','Topic Group'))

In [13]:
for i in range(0, len(norm_text)):
    title = title_list[i]
    print("Text Title:", title)
    topic_prob = lda.transform(feature_matrix[i])
    topic_prob = str(topic_prob).split(' ')
    max_prob = 0
    for j in range(0, len(topic_prob)):
        if topic_prob[j] != "":
            topic_prob[j] = topic_prob[j].replace("[","").replace("]","")
            if topic_prob[j] != "":
                if float(topic_prob[j]) > float(max_prob):
                    max_topic_num = j
                    max_prob = topic_prob[j]
    print("Topic Group:", max_topic_num, "Topic Probability:", max_prob)
    read_score = textstat.dale_chall_readability_score(text_list[i])
    print("Readability Score:", read_score)
    if (read_score <= 4.9):
        print("This text can be read by an average 4th-grader or lower")
        read_type = 0
    elif (read_score <= 5.9):
        print("This text can be read by an average 5th or 6th grader")
        read_type = 1
    elif (read_score <= 6.9):
        print("This text can be read by an average 7th or 8th grader")
        read_type = 2
    elif (read_score <= 7.9):
        print("This text can be read by an average 9th or 10th grader")
        read_type = 3
    elif (read_score <= 8.9):
        print("This text can be read by an average 11th or 12th grader")
        read_type = 4
    elif (read_score <= 10):
        print("This text can be read by an average college student")
        read_type = 5
    textdf_tfidf = textdf_tfidf.append(pd.Series([title, max_prob, read_type, max_topic_num], index=textdf_tfidf.columns), ignore_index=True)
    print('='*60)

Text Title: A Primary Reader: 
Old-time Stories, Fairy Tales and Myths Retold by Children
Topic Group: 1 Topic Probability: 0.95122016
Readability Score: 1.64
This text can be read by an average 4th-grader or lower
Text Title: The Bird-Woman of the Lewis and Clark Expedition
Topic Group: 1 Topic Probability: 0.93741301
Readability Score: 0.85
This text can be read by an average 4th-grader or lower
Text Title: Dr. Scudder's Tales for Little Readers, About the Heathen.
Topic Group: 1 Topic Probability: 0.97086852
Readability Score: 5.68
This text can be read by an average 5th or 6th grader
Text Title: The Louisa Alcott Reader: a Supplementary Reader for the Fourth Year of School
Topic Group: 1 Topic Probability: 0.97267304
Readability Score: 2.78
This text can be read by an average 4th-grader or lower
Text Title: Boy Blue and his friends, School ed.
Topic Group: 1 Topic Probability: 0.94469905
Readability Score: 6.37
This text can be read by an average 7th or 8th grader
Text Title: The B

Readability Score: 7.19
This text can be read by an average 9th or 10th grader
Text Title: A First Spanish Reader
Topic Group: 1 Topic Probability: 0.95526469
Readability Score: 8.21
This text can be read by an average 11th or 12th grader
Text Title: An Elementary Spanish Reader
Topic Group: 1 Topic Probability: 0.87918899
Readability Score: 8.92
This text can be read by an average college student
Text Title: First Italian Readings
Topic Group: 1 Topic Probability: 0.94676602
Readability Score: 10.52
Text Title: Contes et historiettes à l'usage des jeunes enfants
Qui commencent à savoir lire
Topic Group: 1 Topic Probability: 0.82896306
Readability Score: 7.01
This text can be read by an average 9th or 10th grader
Text Title: The Flag of My Country. Shikéyah Bidah Na'at'a'í
Navajo New World Readers 2
Topic Group: 1 Topic Probability: 0.91165643
Readability Score: 8.37
This text can be read by an average 11th or 12th grader
Text Title: A History of the McGuffey Readers
Topic Group: 1 Top

Readability Score: 1.99
This text can be read by an average 4th-grader or lower
Text Title: A School History of the United States
Topic Group: 1 Topic Probability: 0.96612278
Readability Score: 1.87
This text can be read by an average 4th-grader or lower
Text Title: The Story of Manhattan
Topic Group: 1 Topic Probability: 0.97995635
Readability Score: 6.67
This text can be read by an average 7th or 8th grader
Text Title: Young Folks' History of Rome
Topic Group: 1 Topic Probability: 0.96371057
Readability Score: 6.52
This text can be read by an average 7th or 8th grader
Text Title: Denmark
Topic Group: 1 Topic Probability: 0.97505692
Readability Score: 6.63
This text can be read by an average 7th or 8th grader
Text Title: The Land of the Long Night
Topic Group: 1 Topic Probability: 0.97314541
Readability Score: 1.93
This text can be read by an average 4th-grader or lower
Text Title: Little Journey to Puerto Rico : for Intermediate and Upper Grades
Topic Group: 1 Topic Probability: 0.97

*[Return to top](#Top)*

<a id='TF-IDF Visualizations'></a>

#### TF-IDF Visualizations

In [14]:
pyLDAvis.enable_notebook()
p = pyLDAvis.sklearn.prepare(lda, feature_matrix, vectorizer)
pyLDAvis.save_html(p, 'lda.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


Source: https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization  
"The left panel, labeld Intertopic Distance Map, circles represent different topics and the distance between them. Similar topics appear closer and the dissimilar topics farther. The relative size of a topic's circle in the plot corresponds to the relative frequency of the topic in the corpus. An individual topic may be selected for closer scrutiny by clicking on its circle, or entering its number in the "selected topic" box in the upper-left.

The right panel, include the bar chart of the top 30 terms. When no topic is selected in the plot on the left, the bar chart shows the top-30 most "salient" terms in the corpus. A term's saliency is a measure of both how frequent the term is in the corpus and how "distinctive" it is in distinguishing between different topics. Selecting each topic on the right, modifies the bar chart to show the "relevant" terms for the selected topic. Relevence is defined as in footer 2 and can be tuned by parameter  λ , smaller  λ  gives higher weight to the term's distinctiveness while larger  λ s corresponds to probablity of the term occurance per topics."

In [15]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

plotly.tools.set_credentials_file(username='nobrown', api_key='FzKDm99tYipTNdx4LFIx')

trace = go.Scatter(
    y = textdf_tfidf['Topic Group'],
    x = textdf_tfidf['Topic Probability'],
    text=textdf_tfidf['Title'],
    mode = 'markers',  
    marker=dict(size=10, color = ~textdf_tfidf['Readability Group'],  
    colorscale='Viridis', 
    colorbar=dict(title = "Readability Group")
))
layout = go.Layout(
    title="Topic Group Probability by Readability (TF-IDF)",
    yaxis=dict(title='Topic Group'),
    autosize=True,
    #width=900,
    #height=500,
    xaxis=dict(title='Topic Probability')
)
data = [trace]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='tfidf')
py.plot(fig, filename = 'tfidf', auto_open=True)

'https://plot.ly/~nobrown/26'

In [16]:
_11 = _12 = _13 = _14 = _15 = _16 = _21 = _22 = _23 = _24 = _25 = _26 = _31 = _32 = _33 = _34 = _35 = _36 = _41 = _42 = _43 = _44 = _45 = _46 = _51 = _52 = _53 = _54 = _55 = _56 = 0
for i in range(0, len(textdf_tfidf)):
    # topic 0
    if textdf_tfidf["Topic Group"][i] == 0 and textdf_tfidf["Readability Group"][i] == 0:
        _11 += 1
    if textdf_tfidf["Topic Group"][i] == 0 and textdf_tfidf["Readability Group"][i] == 1:
        _12 += 1
    if textdf_tfidf["Topic Group"][i] == 0 and textdf_tfidf["Readability Group"][i] == 2:
        _13 += 1
    if textdf_tfidf["Topic Group"][i] == 0 and textdf_tfidf["Readability Group"][i] == 3:
        _14 += 1
    if textdf_tfidf["Topic Group"][i] == 0 and textdf_tfidf["Readability Group"][i] == 4:
        _15 += 1
    if textdf_tfidf["Topic Group"][i] == 0 and textdf_tfidf["Readability Group"][i] == 5:
        _16 += 1

    # Topic 1
    if textdf_tfidf["Topic Group"][i] == 1 and textdf_tfidf["Readability Group"][i] == 0:
        _21 += 1
    if textdf_tfidf["Topic Group"][i] == 1 and textdf_tfidf["Readability Group"][i] == 1:
        _22 += 1
    if textdf_tfidf["Topic Group"][i] == 1 and textdf_tfidf["Readability Group"][i] == 2:
        _23 += 1
    if textdf_tfidf["Topic Group"][i] == 1 and textdf_tfidf["Readability Group"][i] == 3:
        _24 += 1
    if textdf_tfidf["Topic Group"][i] == 1 and textdf_tfidf["Readability Group"][i] == 4:
        _25 += 1
    if textdf_tfidf["Topic Group"][i] == 1 and textdf_tfidf["Readability Group"][i] == 5:
        _26 += 1

    # Topic 2        
    if textdf_tfidf["Topic Group"][i] == 2 and textdf_tfidf["Readability Group"][i] == 0:
        _31 += 1
    if textdf_tfidf["Topic Group"][i] == 2 and textdf_tfidf["Readability Group"][i] == 1:
        _32 += 1
    if textdf_tfidf["Topic Group"][i] == 2 and textdf_tfidf["Readability Group"][i] == 2:
        _33 += 1
    if textdf_tfidf["Topic Group"][i] == 2 and textdf_tfidf["Readability Group"][i] == 3:
        _34 += 1
    if textdf_tfidf["Topic Group"][i] == 2 and textdf_tfidf["Readability Group"][i] == 4:
        _35 += 1
    if textdf_tfidf["Topic Group"][i] == 2 and textdf_tfidf["Readability Group"][i] == 5:
        _36 += 1
       
    # Topic 3
    if textdf_tfidf["Topic Group"][i] == 3 and textdf_tfidf["Readability Group"][i] == 0:
        _41 += 1
    if textdf_tfidf["Topic Group"][i] == 3 and textdf_tfidf["Readability Group"][i] == 1:
        _42 += 1
    if textdf_tfidf["Topic Group"][i] == 3 and textdf_tfidf["Readability Group"][i] == 2:
        _43 += 1
    if textdf_tfidf["Topic Group"][i] == 3 and textdf_tfidf["Readability Group"][i] == 3:
        _44 += 1
    if textdf_tfidf["Topic Group"][i] == 3 and textdf_tfidf["Readability Group"][i] == 4:
        _45 += 1
    if textdf_tfidf["Topic Group"][i] == 3 and textdf_tfidf["Readability Group"][i] == 5:
        _46 += 1
    
    # Topic 4
    if textdf_tfidf["Topic Group"][i] == 4 and textdf_tfidf["Readability Group"][i] == 0:
        _51 += 1
    if textdf_tfidf["Topic Group"][i] == 4 and textdf_tfidf["Readability Group"][i] == 1:
        _52 += 1
    if textdf_tfidf["Topic Group"][i] == 4 and textdf_tfidf["Readability Group"][i] == 2:
        _53 += 1
    if textdf_tfidf["Topic Group"][i] == 4 and textdf_tfidf["Readability Group"][i] == 3:
        _54 += 1
    if textdf_tfidf["Topic Group"][i] == 4 and textdf_tfidf["Readability Group"][i] == 4:
        _55 += 1
    if textdf_tfidf["Topic Group"][i] == 4 and textdf_tfidf["Readability Group"][i] == 5:
        _56 += 1

In [17]:
trace = go.Heatmap(z=[[_11,_12,_13,_14,_15,_16],[_21,_22,_23,_24,_25,_26],[_31,_32,_33,_34,_35,_36],[_41,_42,_43,_44,_45,_46],[_51,_52,_53,_54,_55,_56]],
                   x=['0','1','2','3','4','5'], # Readability
                   y=['0,','1','2','3','4'], # Topic
                  colorscale = "Viridis") 

layout = go.Layout(
    title = "Number of Texts in Each Topic Group By Readability Group (TF-IDF)",
    xaxis = dict(title='Readability Group'), 
    yaxis = dict(title='Topic Group'), 
    autosize=True
    #width=900, height=500
)

figure = dict(data=[trace],layout=layout)
py.iplot(figure, filename='heatmaptfidf')
py.plot(figure, filename = 'heatmaptfidf', auto_open=True)

'https://plot.ly/~nobrown/28'

*[Return to top](#Top)*

<a id='Bag of Words'></a>

**Bag of Words**

In [18]:
norm_text2 = []
for text in norm_text:
    text = word_tokenize(text)
    norm_text2.append(text)

In [21]:
# https://datascienceplus.com/topic-modeling-in-python-with-nltk-and-gensim/
dictionary = corpora.Dictionary(norm_text2)
corpus = [dictionary.doc2bow(text) for text in norm_text2]
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [22]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.006*"little" + 0.004*"day" + 0.004*"like" + 0.004*"time"')
(1, '0.005*"great" + 0.005*"state" + 0.003*"time" + 0.003*"made"')
(2, '0.010*"le" + 0.006*"de" + 0.006*"qui" + 0.005*"dans"')
(3, '0.009*"het" + 0.009*"een" + 0.008*"van" + 0.006*"food"')
(4, '0.007*"make" + 0.007*"end" + 0.006*"made" + 0.006*"piece"')


<a id='Bag of Words Readability'></a>

In [23]:
textdf = pd.DataFrame(columns = ('Title','Topic Probability','Readability Group'))

In [24]:
for i in range(0, len(norm_text2)):
    new_doc = norm_text2[i]
    new_doc_bow = dictionary.doc2bow(new_doc)
    title = title_list[i]
    print("Text Title:", title)
    topic_prob = ldamodel.get_document_topics(new_doc_bow, minimum_probability=0.5)
    print("Topic Probability:", topic_prob)
    read_score = textstat.dale_chall_readability_score(text_list[i])
    print("Readability Score:", read_score)
    if (read_score <= 4.9):
        print("This text can be read by an average 4th-grader or lower")
        read_type = 0
    elif (read_score <= 5.9):
        print("This text can be read by an average 5th or 6th grader")
        read_type = 1
    elif (read_score <= 6.9):
        print("This text can be read by an average 7th or 8th grader")
        read_type = 2
    elif (read_score <= 7.9):
        print("This text can be read by an average 9th or 10th grader")
        read_type = 3
    elif (read_score <= 8.9):
        print("This text can be read by an average 11th or 12th grader")
        read_type = 4
    elif (read_score <= 10):
        print("This text can be read by an average college student")
        read_type = 5
    textdf = textdf.append(pd.Series([title, topic_prob, read_type], index=textdf.columns), ignore_index=True)
    print('='*60)

Text Title: A Primary Reader: 
Old-time Stories, Fairy Tales and Myths Retold by Children
Topic Probability: [(0, 0.99967164)]
Readability Score: 1.64
This text can be read by an average 4th-grader or lower
Text Title: The Bird-Woman of the Lewis and Clark Expedition
Topic Probability: [(0, 0.9887783)]
Readability Score: 0.85
This text can be read by an average 4th-grader or lower
Text Title: Dr. Scudder's Tales for Little Readers, About the Heathen.
Topic Probability: [(0, 0.64098614)]
Readability Score: 5.68
This text can be read by an average 5th or 6th grader
Text Title: The Louisa Alcott Reader: a Supplementary Reader for the Fourth Year of School
Topic Probability: [(0, 0.99995804)]
Readability Score: 2.78
This text can be read by an average 4th-grader or lower
Text Title: Boy Blue and his friends, School ed.
Topic Probability: [(0, 0.999852)]
Readability Score: 6.37
This text can be read by an average 7th or 8th grader
Text Title: The Book of Nature Myths
Topic Probability: [(0,

Text Title: A First Spanish Reader
Topic Probability: [(2, 0.94355536)]
Readability Score: 8.21
This text can be read by an average 11th or 12th grader
Text Title: An Elementary Spanish Reader
Topic Probability: [(2, 0.94344085)]
Readability Score: 8.92
This text can be read by an average college student
Text Title: First Italian Readings
Topic Probability: [(2, 0.92512167)]
Readability Score: 10.52
Text Title: Contes et historiettes à l'usage des jeunes enfants
Qui commencent à savoir lire
Topic Probability: [(2, 0.99996996)]
Readability Score: 7.01
This text can be read by an average 9th or 10th grader
Text Title: The Flag of My Country. Shikéyah Bidah Na'at'a'í
Navajo New World Readers 2
Topic Probability: [(3, 0.80036134)]
Readability Score: 8.37
This text can be read by an average 11th or 12th grader
Text Title: A History of the McGuffey Readers
Topic Probability: [(1, 0.8811646)]
Readability Score: 6.37
This text can be read by an average 7th or 8th grader
Text Title: A Book of N

Text Title: Peeps at Many Lands: Norway
Topic Probability: [(0, 0.56340873)]
Readability Score: 6.87
This text can be read by an average 7th or 8th grader
Text Title: Commercial Geography
A Book for High Schools, Commercial Courses, and Business Colleges
Topic Probability: [(1, 0.9993732)]
Readability Score: 2.29
This text can be read by an average 4th-grader or lower
Text Title: A Manual of Pronunciation
For Practical Use in Schools and Families
Topic Probability: [(2, 0.8514213)]
Readability Score: 20.0
Text Title: Modern prose and poetry for secondary schools
Topic Probability: [(0, 0.9261353)]
Readability Score: 5.98
This text can be read by an average 7th or 8th grader
Text Title: A Catechism of Familiar Things; 
Their History, and the Events Which Led to Their Discovery. 
With a Short Explanation of Some of the Principal Natural Phenomena. For the Use of Schools and Families. Enlarged and Revised Edition.
Topic Probability: [(1, 0.8212872)]
Readability Score: 7.23
This text can b

In [25]:
for i in range(0, len(textdf)):
    topic = str(textdf['Topic Probability'][i])
    topic = topic[topic.find("(")+1:topic.find(")")]
    textdf['Topic Probability'][i] = topic

In [26]:
textdf['Topic Group'],textdf['Topic Probability'] = textdf['Topic Probability'].str.split(',').str

*[Return to top](#Top)*

<a id='Bag of Words Visualizations'></a>

#### Bag of Words Visualizations

In [27]:
pyLDAvis.enable_notebook()
l = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.save_html(l, 'bow.html')


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.





In [28]:
import plotly.plotly as py
import plotly.graph_objs as go

trace = go.Scatter(
    y = textdf['Topic Group'],
    x = textdf['Topic Probability'],
    text=textdf['Title'],
    mode = 'markers',  
    marker=dict(size=10, color = ~textdf['Readability Group'],  
    colorscale='Viridis', 
    colorbar=dict(title = "Readability Group")
))
layout = go.Layout(
    title="Topic Group Probability by Readability (Bag of Words)",
    yaxis=dict(title='Topic Group'),
    autosize=True,
    #width=900,
    #height=500,
    xaxis=dict(title='Topic Probability')
)
data = [trace]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bagofwords')
py.plot(fig, filename = 'bagofwords', auto_open=True)

'https://plot.ly/~nobrown/36'

In [29]:
_11 = _12 = _13 = _14 = _15 = _16 = _21 = _22 = _23 = _24 = _25 = _26 = _31 = _32 = _33 = _34 = _35 = _36 = _41 = _42 = _43 = _44 = _45 = _46 = _51 = _52 = _53 = _54 = _55 = _56 = 0
for i in range(0, len(textdf)):
    # topic 0
    if textdf["Topic Group"][i] == '0' and textdf["Readability Group"][i] == 0:
        _11 += 1
    if textdf["Topic Group"][i] == '0' and textdf["Readability Group"][i] == 1:
        _12 += 1
    if textdf["Topic Group"][i] == '0' and textdf["Readability Group"][i] == 2:
        _13 += 1
    if textdf["Topic Group"][i] == '0' and textdf["Readability Group"][i] == 3:
        _14 += 1
    if textdf["Topic Group"][i] == '0' and textdf["Readability Group"][i] == 4:
        _15 += 1
    if textdf["Topic Group"][i] == '0' and textdf["Readability Group"][i] == 5:
        _16 += 1

    # Topic 1
    if textdf["Topic Group"][i] == '1' and textdf["Readability Group"][i] == 0:
        _21 += 1
    if textdf["Topic Group"][i] == '1' and textdf["Readability Group"][i] == 1:
        _22 += 1
    if textdf["Topic Group"][i] == '1' and textdf["Readability Group"][i] == 2:
        _23 += 1
    if textdf["Topic Group"][i] == '1' and textdf["Readability Group"][i] == 3:
        _24 += 1
    if textdf["Topic Group"][i] == '1' and textdf["Readability Group"][i] == 4:
        _25 += 1
    if textdf["Topic Group"][i] == '1' and textdf["Readability Group"][i] == 5:
        _26 += 1

    # Topic 2        
    if textdf["Topic Group"][i] == '2' and textdf["Readability Group"][i] == 0:
        _31 += 1
    if textdf["Topic Group"][i] == '2' and textdf["Readability Group"][i] == 1:
        _32 += 1
    if textdf["Topic Group"][i] == '2' and textdf["Readability Group"][i] == 2:
        _33 += 1
    if textdf["Topic Group"][i] == '2' and textdf["Readability Group"][i] == 3:
        _34 += 1
    if textdf["Topic Group"][i] == '2' and textdf["Readability Group"][i] == 4:
        _35 += 1
    if textdf["Topic Group"][i] == '2' and textdf["Readability Group"][i] == 5:
        _36 += 1
       
    # Topic 3
    if textdf["Topic Group"][i] == '3' and textdf["Readability Group"][i] == 0:
        _41 += 1
    if textdf["Topic Group"][i] == '3' and textdf["Readability Group"][i] == 1:
        _42 += 1
    if textdf["Topic Group"][i] == '3' and textdf["Readability Group"][i] == 2:
        _43 += 1
    if textdf["Topic Group"][i] == '3' and textdf["Readability Group"][i] == 3:
        _44 += 1
    if textdf["Topic Group"][i] == '3' and textdf["Readability Group"][i] == 4:
        _45 += 1
    if textdf["Topic Group"][i] == '3' and textdf["Readability Group"][i] == 5:
        _46 += 1
    
    # Topic 4
    if textdf["Topic Group"][i] == '4' and textdf["Readability Group"][i] == 0:
        _51 += 1
    if textdf["Topic Group"][i] == '4' and textdf["Readability Group"][i] == 1:
        _52 += 1
    if textdf["Topic Group"][i] == '4' and textdf["Readability Group"][i] == 2:
        _53 += 1
    if textdf["Topic Group"][i] == '4' and textdf["Readability Group"][i] == 3:
        _54 += 1
    if textdf["Topic Group"][i] == '4' and textdf["Readability Group"][i] == 4:
        _55 += 1
    if textdf["Topic Group"][i] == '4' and textdf["Readability Group"][i] == 5:
        _56 += 1

In [30]:
trace = go.Heatmap(z=[[_11,_12,_13,_14,_15,_16],[_21,_22,_23,_24,_25,_26],[_31,_32,_33,_34,_35,_36],[_41,_42,_43,_44,_45,_46],[_51,_52,_53,_54,_55,_56]],
                   x=['0','1','2','3','4','5'], # Readability
                   y=['0,','1','2','3','4'], # Topic
                  colorscale = "Viridis") 

layout = go.Layout(
    title = "Number of Texts in Each Topic Group By Readability Group (Bag of Words)",
    xaxis = dict(title='Readability Group'), 
    yaxis = dict(title='Topic Group'), 
    autosize=True
    #width=900, height=500
)

figure = dict(data=[trace],layout=layout)
py.iplot(figure, filename='heatmap')
py.plot(figure, filename = 'heatmapbagofwords', auto_open=True)

'https://plot.ly/~nobrown/30'

*[Return to top](#Top)*

Now let's look at just the graded readers from Gutenberg.

In [36]:
graded_titles = []
graded_texts = []
for i in range(0, len(title_list)):
    if "Reader" in title_list[i]:
        graded_titles.append(title_list[i])
        graded_texts.append(norm_text[i])

In [41]:
norm_graded_texts = []
for text in graded_texts:
    text = word_tokenize(text)
    norm_graded_texts.append(text)

In [42]:
dictionary_graded = corpora.Dictionary(norm_graded_texts)
corpus_graded = [dictionary_graded.doc2bow(text) for text in norm_graded_texts]
pickle.dump(corpus_graded, open('corpus_graded.pkl', 'wb'))
dictionary_graded.save('dictionary_graded.gensim')

In [43]:
NUM_TOPICS = 5
ldamodel_graded = gensim.models.ldamodel.LdaModel(corpus_graded, num_topics = NUM_TOPICS, id2word=dictionary_graded, passes=15)
ldamodel_graded.save('model5.gensim')
topics_graded = ldamodel_graded.print_topics(num_words=4)
for topic in topics_graded:
    print(topic)

(0, '0.013*"little" + 0.005*"day" + 0.005*"see" + 0.005*"go"')
(1, '0.007*"los" + 0.005*"¿qué" + 0.004*"la" + 0.004*"see"')
(2, '0.018*"ind" + 0.016*"sing" + 0.009*"pres" + 0.007*"pret"')
(3, '0.004*"man" + 0.004*"upon" + 0.003*"time" + 0.003*"great"')
(4, '0.004*"little" + 0.004*"like" + 0.003*"time" + 0.003*"upon"')


In [103]:
textdf_graded = pd.DataFrame(columns = ('Title','Topic Probability','Grade'))

In [104]:
for i in range(0, len(norm_graded_texts)):
    new_doc = norm_graded_texts[i]
    new_doc_bow = dictionary_graded.doc2bow(new_doc)
    title = graded_titles[i]
    print("Text Title:", title)
    topic_prob = ldamodel_graded.get_document_topics(new_doc_bow, minimum_probability=0.45)
    print("Topic Probability:", topic_prob)
    grade = 0
    textdf_graded = textdf_graded.append(pd.Series([title, topic_prob, grade], index=textdf_graded.columns), ignore_index=True)
    print('='*60)

Text Title: A Primary Reader: 
Old-time Stories, Fairy Tales and Myths Retold by Children
Topic Probability: [(0, 0.9997871)]
Text Title: Dr. Scudder's Tales for Little Readers, About the Heathen.
Topic Probability: [(4, 0.9999409)]
Text Title: The Louisa Alcott Reader: a Supplementary Reader for the Fourth Year of School
Topic Probability: [(0, 0.66756135)]
Text Title: The Flag of My Country. Shikéyah Bidah Na'at'a'í;
Navajo New World Readers 2
Topic Probability: [(2, 0.903384)]
Text Title: Chambers's Elementary Science Readers, Book I
Topic Probability: [(0, 0.50756), (4, 0.4923731)]
Text Title: The Little Lame Prince;
Rewritten for Young Readers by Margaret Waters
Topic Probability: [(3, 0.62261677)]
Text Title: The Beacon Second Reader
Topic Probability: [(0, 0.99952114)]
Text Title: The Child's World Third Reader
Topic Probability: [(0, 0.936906)]
Text Title: De La Salle Fifth Reader
Topic Probability: [(3, 0.53948516)]
Text Title: The Elson Readers, Book 5
Topic Probability: [(3,

In [105]:
for i in range(0, len(textdf_graded)):
    topic = str(textdf_graded['Topic Probability'][i])
    topic = topic[topic.find("(")+1:topic.find(")")]
    textdf_graded['Topic Probability'][i] = topic

In [106]:
textdf_graded['Topic Group'],textdf_graded['Topic Probability'] = textdf_graded['Topic Probability'].str.split(',').str

In [107]:
# Remove ones that are not in the graded readers list
textdf_graded.drop(textdf_graded.index[0], inplace=True)
textdf_graded.drop(textdf_graded.index[0], inplace=True)
textdf_graded.drop(textdf_graded.index[0], inplace=True)
textdf_graded.drop(textdf_graded.index[0], inplace=True)
textdf_graded.drop(textdf_graded.index[0], inplace=True)
textdf_graded.drop(textdf_graded.index[0], inplace=True)
textdf_graded.drop(textdf_graded.index[20], inplace=True)
textdf_graded.drop(textdf_graded.index[20], inplace=True)
textdf_graded.drop(textdf_graded.index[20], inplace=True)
textdf_graded.drop(textdf_graded.index[20], inplace=True)
textdf_graded.drop(textdf_graded.index[20], inplace=True)

In [110]:
# Get correct grade levels
textdf_graded['Grade'] = [2,3,5,5,7,1,2,3,4,5,6,1,4,3,4,9,9,2,4,3]

In [112]:
trace = go.Scatter(
    y = textdf_graded['Topic Group'],
    x = textdf_graded['Topic Probability'],
    text=textdf_graded['Title'],
    mode = 'markers',  
    marker=dict(size=10, color = ~textdf_graded['Grade'],  
    colorscale='Viridis', 
    colorbar=dict(title = "Grade")
))
layout = go.Layout(
    title="Topic Group Probability by Grade (Bag of Words)",
    yaxis=dict(title='Topic Group'),
    autosize=True,
    #width=900,
    #height=500,
    xaxis=dict(title='Topic Probability')
)
data = [trace]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='graded')
py.plot(fig, filename = 'graded', auto_open=True)

'https://plot.ly/~nobrown/38'