In [None]:
#Hillary Clinton Email Analysis LSI#

In [None]:
#data files: Emails.csv#

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
import matplotlib.pyplot as plt
from seaborn import plt
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import RidgeCV
from scipy.optimize import curve_fit
%matplotlib inline



In [16]:
df_emails = pd.read_csv('~/desktop/Clinton Email Data/Emails.csv', encoding="utf-8")
df_emails.info()
#we will predominently work with this data set, as this has the body text of emails#

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7945 entries, 0 to 7944
Data columns (total 22 columns):
Id                              7945 non-null int64
DocNumber                       7945 non-null object
MetadataSubject                 7649 non-null object
MetadataTo                      7690 non-null object
MetadataFrom                    7788 non-null object
SenderPersonId                  7788 non-null float64
MetadataDateSent                7813 non-null object
MetadataDateReleased            7945 non-null object
MetadataPdfLink                 7945 non-null object
MetadataCaseNumber              7945 non-null object
MetadataDocumentClass           7945 non-null object
ExtractedSubject                6260 non-null object
ExtractedTo                     3288 non-null object
ExtractedFrom                   6692 non-null object
ExtractedCc                     2640 non-null object
ExtractedDateSent               6628 non-null object
ExtractedCaseNumber             7944 non-null

In [None]:
#other analysis: 1) sentiment analysis with Naive Bayes classification 07#
#2) LSI Latent Semantic Indexing - this was the lecture on natural language processing#
#3) Flesch-Kinkaid analysis - who has the most complex sentences 
#4) Naive Bayes predict who the sender is based on the Raw Text 

In [None]:
#Let's do LSI!

In [38]:
#time to train/test/split otherwise we will have an overfitting problem
# Import train_test_split
from sklearn.cross_validation import train_test_split
# Split the data into a 70/30 train/test split
df_emails_train, df_emails_test = train_test_split(df_emails.RawText, test_size=0.3)
df_emails_sender_train, df_emails_sender_test = train_test_split(df_emails.MetadataFrom, test_size=0.3)

In [18]:
# We're not concerned with the category for now, so select out only the text column into a Series object 'docs'
docs = df_emails_train
docs.head()

480     UNCLASSIFIED U.S. Department of State Case No....
1537    UNCLASSIFIED U.S. Department of State Case No....
5320    UNCLASSIFIED U.S. Department of State Case No....
7336    UNCLASSIFIED U.S. Department of State Case No....
565     UNCLASSIFIED U.S. Department of State Case No....
Name: RawText, dtype: object

In [19]:
# Let's load the nltk English stopwords list to ignore those in our analysis
import nltk
from nltk.corpus import stopwords
## Download various nltk corpora (used for stopwords here)
#nltk.download()
## Print all english stopwords
stopwords = stopwords.words('english')

In [20]:
'''
The way we're going to attack this is to build out the TDM matrix with the documents as rows and terms as columns
and then we'll call the transpose operator to flip it to the representation we need for LSI.

We need the following:
    1.  Dictionary of word --> index to define vectors (index for each term)
    2.  Dictionary of word --> total count to get the global (IDF)
    3.  Dictionary of word --> document count for each document to get the local (TF) weighting
'''
# Implement a function that returns the 3 dictionaries that we need above
def find_frequencies(docs):
    term_indices = {} ## This is #1 above
    currentIndex = 0 ## This is the counter to make sure we correctly populate the term indices in order
    corpus_bag = {} ## This is #2 above
    doc_bags = [] ## This is the collection for #3 above
    for i, doc in docs.iteritems():
        doc_bag = {} ## This is the dictionary of term frequencies for the doc we're currently examining, doc_bags stores a collection of these
        ## TODO: Tokenize each document with nltk
        doc_tokens = nltk.word_tokenize(doc)
        ## TODO: For each token in the current document:
        for word in doc_tokens:
            ## Optionally ignore stopword and continue
            ## Throw out stopwords
            ##if word in stopwords:
                ##    continue
            ## If the word is new (not in term_indices): 
            if word not in term_indices:
                ## add it to term_indices and give it the index value currentIndex, increment currentIndex
                term_indices[word] = currentIndex
                currentIndex += 1
                ## add it to the corpus_bag with count 1
                corpus_bag[word] = 1
                ## add it to the current doc_bag with count 1
                doc_bag[word] = 1
            ## If the word is not new:
            else:
                ## increment the corpus_bag
                corpus_bag[word] = corpus_bag[word] + 1
                ## If the word is already in the doc_bag, increment that counter, else set it to 1
                if word in doc_bag:
                    doc_bag[word] = doc_bag[word] + 1
                else:
                    doc_bag[word] = 1
        doc_bags.append(doc_bag)
    return term_indices, corpus_bag, doc_bags

In [21]:
term_indices, corpus_bag, doc_bags = find_frequencies(docs)

In [22]:
print len(term_indices)
print term_indices['UNCLASSIFIED']
#print term_indices['']

75788
0


In [23]:
print len(corpus_bag)
print corpus_bag['UNCLASSIFIED']

75788
18809


In [24]:
print len(doc_bags[0])
print doc_bags[0]

91
{u'all': 1, u'Wed': 1, u'not': 1, u'PART': 1, u'Department': 2, u'Today': 2, u'Wireless': 1, u'Verizon': 1, u',': 3, u'Minyon': 1, u'31:8-9': 1, u'Burns': 1, u'<': 1, u'@': 2, u'H': 2, u'hrod17': 1, u'judge': 1, u'clintonemail.com': 1, u'Moore': 1, u'--': 1, u'B6': 1, u'Subject': 2, u'Case': 2, u'From': 2, u'State': 2, u'are': 1, u'destitute': 1, u'Sent': 3, u'out': 1, u'for': 4, u'defend': 1, u'print': 1, u';': 2, u'Proverbs': 1, u'U.S.': 2, u'RELEASE': 1, u'of': 4, u'Rob': 1, u"state.gov'": 1, u'BlackBerry': 1, u'20:13:00': 1, u"'Russorv": 1, u'Speak': 2, u'No': 4, u'June': 1, u'speak': 1, u'from': 1, u'Fw': 1, u'Jun': 1, u'.': 8, u'UNCLASSIFIED': 2, u'Date': 2, u'themselves': 1, u':': 11, u'type': 1, u'>': 1, u'Reflection': 2, u'Original': 1, u'those': 1, u'me': 1, u'10': 1, u'rights': 2, u'this': 1, u'up': 2, u'can': 1, u'F-2014-20439': 2, u'my': 1, u'and': 2, u'Thursday': 1, u"''": 2, u'To': 2, u'in': 1, u'pis': 1, u'11': 1, u'8:04': 1, u'needy': 1, u'BIG': 1, u'Strider': 1, u'

In [25]:
## Useful imports
import math
import scipy
from scipy import linalg

In [26]:
## Implement a function that uses the corpus_bag and doc_bags found above to compute the global weighting (idf) term
def compute_global_weight(corpus_bag, doc_bags):
    global_weights = {} ## A dictionary of term --> global weight (the idf components) using entropy weighting
    ## TODO: Define a variable logn which is the log base 2 of the number of documents in the set
    logn = math.log(len(doc_bags), 2)
    ## TODO: For each doc_bag:
    for doc_bag in doc_bags:
        ## TODO: For each term in the doc_bag
        for term in doc_bag:
            ## TODO: If the term is not in global_weights, initialize it with value 1
            if term not in global_weights:
                global_weights[term] = 1
            ## TODO: Calculate p_ij and increase the term's global weight by p_ij * log(p_ij) / logn
            local_count = doc_bag[term] + 0.0
            global_count = corpus_bag[term]
            pij = local_count/global_count
            global_weights[term] += pij*math.log(pij,2)/logn
    return global_weights

In [27]:
global_weights = compute_global_weight(corpus_bag, doc_bags)

In [28]:
print global_weights['UNCLASSIFIED']

0.0348528001542


In [29]:
## Finish the job with a function build_TDM that takes a Series 'docs' and outputs the TDM (a numpy matrix), make it also 
## return the term_indices and global weights as well
def build_TDM(docs):
    ## TODO: Use your first 2 functions from above to populate the term_indices, corpus_bag, doc_bags and global_weights
    term_indices, corpus_bag, doc_bags = find_frequencies(docs)
    global_weights = compute_global_weight(corpus_bag, doc_bags)
    ## TODO: For each doc_bag, generate a doc_vec and add to doc_vecs (these are the "vectors" for each document with weighting)
    doc_vecs = []
    for doc_bag in doc_bags:
        ## TODO: Initialize 'doc_vec' as a list of zeroes with 1 entry per unique term
        doc_vec = [0]*len(corpus_bag)
        ## TODO: For each term in the doc_bag, add the appropriate value into the appropriate place in the doc_vec
        ## NOTE: Need to take advantage of term_indices to get the right index, global_weights and doc_bag to get the value
        for term in doc_bag:
            index = term_indices[term]
            value = global_weights[term]*math.log(doc_bag[term] + 1.0, 2)
            doc_vec[index] = value
        doc_vecs.append(doc_vec)
    ## TODO: Generate a numpy matrix from doc_vecs, and take it's transpose to give the TDM, return that
    tdmatrix = np.matrix(doc_vecs).transpose()
    return term_indices, global_weights, tdmatrix

In [30]:
term_indices, global_weights, tdmatrix = build_TDM(docs)

In [31]:
## Run the svd to yield the full term and document space matrices
## WARNING: This is the computationally intensive step, it will take a long time, so make sure you've taken care of everything before
## this as best as possible so you don't have to do it multiple times
## Once this step is completed, essentially all the computational work is done, you have a trained LSI space!
T,sigma,D_trans = linalg.svd(tdmatrix, full_matrices=False)

In [32]:
## Truncate the resulting matrices to dimension k (you select this dimension, higher values retain more information at complexity cost)
k = 100
m = T.shape[0]
n = D_trans.shape[1]
T_k = T[0:m, 0:k]
print T_k.shape
D_trans_k = D_trans[0:k, 0:n]
print D_trans_k.shape
sigma_inv = np.linalg.inv(linalg.diagsvd(sigma, n, n))
sigma_inv_k = sigma_inv[0:k, 0:k]
print sigma_inv_k.shape

(75788, 100)
(100, 5561)
(100, 100)


In [33]:
## Function that folds a new document into an existing LSI space (space designated by global weightings, term indices, and T_k and sigma_inv_k)
def fold_doc(doc_text, term_indices, global_weights, T_k, sigma_inv_k):
    tokens = nltk.word_tokenize(doc_text)
    doc_bag = {}
    for token in tokens:
        if token in doc_bag:
            doc_bag[token] = doc_bag[token] + 1
        else:
            doc_bag[token] = 1
    a_vec = [0]*len(term_indices)
    for term in doc_bag:
        if term in term_indices:
            index = term_indices[term]
            a_vec[index] = global_weights[term]*math.log(doc_bag[term] + 1.0, 2)
    a = np.matrix(a_vec)
    folded_vec = np.dot(np.dot(a, T_k), sigma_inv_k)
    return folded_vec

In [34]:
def cosine_sim(a, b): 
    return linalg.norm(np.dot(a/linalg.norm(a), b.transpose()/linalg.norm(b)))

In [35]:
doc1 = docs[0]
doc2 = docs[2]
vec1 = fold_doc(doc1, term_indices, global_weights, T_k, sigma_inv_k)
vec2 = fold_doc(doc2, term_indices, global_weights, T_k, sigma_inv_k)
print cosine_sim(vec1, vec2)

0.959727302582


In [None]:
#okay now we have built out our space! let's do something with it! instead of the categorization problem that we used in class, lets try to predict who the SENDER is - in our dataset we want to use MetadataFrom and RawText

In [41]:
df_emails_train, df_emails_test = train_test_split(df_emails, test_size=0.3)

In [None]:
#this is where I am stuck for now - 

In [36]:
## Classify documents
def classify_docs(df_test, term_indices, global_weights, T_k, sigma_inv_k):
    test_doc_count = 0
    correct_count = 0
    ## TODO: iterate through the rows of df_test using iterrows
    for row in df_test.iterrows():
        ## TODO: Retrive the actual cat and test for each row
        test_cat = row[1]['cat']
        test_doc = row[1]['text']
        ## TODO: Fold the test document into the space to give it a vector
        folded_vec = fold_doc(test_doc, term_indices, global_weights, T_k, sigma_inv_k)
        ## TODO: Compare the resultant vectors via cosine similarity and give each test document the category 
        ## of the training document closest to it.  Print out the right and predicted categories.  Keep track of right/wrong
        best_score = -1.0
        record = None
        index = 0
        for j in xrange(0, D_trans_k.shape[1]):
            training_vec = D_trans_k[0:k, j:j+1]
            score = np.dot(folded_vec, training_vec)/linalg.norm(folded_vec)/linalg.norm(training_vec)
            if score > best_score:
                best_score = score
                record = reuters_df_train[j:j+1]
                index = j
        found_cat = record.cat[index]
        print test_cat + "," + found_cat
        test_doc_count += 1
        if test_cat==found_cat:
            correct_count += 1
    ## TODO: Print out the summary results
    print "Total Docs Test: " + str(test_doc_count)
    print "Total Correct: " + str(correct_count)
    print "Accuracy: " + str((correct_count+0.0)/test_doc_count)
    return

In [37]:
df_test = df_emails_test
classify_docs(df_test, term_indices, global_weights, T_k, sigma_inv_k)

AttributeError: 'Series' object has no attribute 'iterrows'

In [None]:
#moving to Naive Bayes analysis here - we want to be able to predict who the sender is, based on their email text 

In [10]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
from seaborn import plt
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import RidgeCV
%matplotlib inline



In [None]:
#Import sklearn.cross_validation.train_test_split
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(test_images, test_labels, test_size=0.3)
#


In [None]:
#lets try this - predict who the email is from. Use RawText column, and MetaDataFrom, see if we can predict using Naive Bayes

In [32]:
#first lets take out NaN to clean up data
#results tell us that MetaDataFrom has 7788 rows, and RawText has 7945 rows - need to take some of these out b/c there is no sender, won't be able to test on these. Let's run the on MetaDataFrom 
df_emails.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7945 entries, 0 to 7944
Data columns (total 22 columns):
Id                              7945 non-null int64
DocNumber                       7945 non-null object
MetadataSubject                 7649 non-null object
MetadataTo                      7690 non-null object
MetadataFrom                    908 non-null object
SenderPersonId                  7788 non-null float64
MetadataDateSent                7813 non-null object
MetadataDateReleased            7945 non-null object
MetadataPdfLink                 7945 non-null object
MetadataCaseNumber              7945 non-null object
MetadataDocumentClass           7945 non-null object
ExtractedSubject                6260 non-null object
ExtractedTo                     3288 non-null object
ExtractedFrom                   6692 non-null object
ExtractedCc                     2640 non-null object
ExtractedDateSent               6628 non-null object
ExtractedCaseNumber             7944 non-null 

In [55]:
#df_emails.MetadataFrom = df_emails.dropna()
#revisit this
#try to remove NaN from the entire dataset instead, see what happens
#df_emails = df_emails.dropna()
df_emails = df_emails.dropna(how='all') #'any' gets the same result

In [57]:
df_emails.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 908 entries, 2 to 7923
Data columns (total 22 columns):
Id                              908 non-null int64
DocNumber                       908 non-null object
MetadataSubject                 908 non-null object
MetadataTo                      908 non-null object
MetadataFrom                    908 non-null object
SenderPersonId                  908 non-null float64
MetadataDateSent                908 non-null object
MetadataDateReleased            908 non-null object
MetadataPdfLink                 908 non-null object
MetadataCaseNumber              908 non-null object
MetadataDocumentClass           908 non-null object
ExtractedSubject                908 non-null object
ExtractedTo                     908 non-null object
ExtractedFrom                   908 non-null object
ExtractedCc                     908 non-null object
ExtractedDateSent               908 non-null object
ExtractedCaseNumber             908 non-null object
ExtractedD

In [None]:
#revisit dropping nulls, lets see what runs first - for now we are going with 908 lines which is kind of a small number... let's see how it works 
#another way to do this is to predict the MetadataFrom as the SenderPersonId, so that it remails a numerical value - lets try this 

In [58]:
#time to train/test/split otherwise we will have an overfitting problem
# Import train_test_split
from sklearn.cross_validation import train_test_split
# Split the data into a 70/30 train/test split
X_train, X_test, y_train, y_test = train_test_split(df_emails.RawText, df_emails.SenderPersonId, test_size=0.3)

In [69]:
# Print the shape of X_train
print X_train.shape
# Print X_train
print X_train

SyntaxError: invalid syntax (<ipython-input-69-72d9cabc541e>, line 2)

In [70]:
# Print the shape of X_test
X_test.shape

(273,)

In [72]:
y_test.shape

(273,)

In [59]:
#Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#Create a CountVectorizer with binary=True so all nonzero counts are given value 1
binary_vect = CountVectorizer(decode_error = 'ignore', binary=True)
# Call fit to do our vectorization
binary_vect.fit(X_train)
# Print out all of the tokens in the dictionary
binary_vect.get_feature_names()

[u'00',
 u'000',
 u'0000',
 u'0001',
 u'001',
 u'008',
 u'00am',
 u'00b',
 u'00n',
 u'00pm',
 u'00v',
 u'01',
 u'010000000000',
 u'0102',
 u'0105',
 u'011',
 u'0129',
 u'02',
 u'0203',
 u'020510',
 u'021',
 u'0265',
 u'0271',
 u'03',
 u'0301',
 u'0308',
 u'031406z',
 u'0323',
 u'0380',
 u'0390',
 u'04',
 u'0400',
 u'0418',
 u'04841',
 u'05',
 u'0500',
 u'050710',
 u'0509',
 u'0517',
 u'055',
 u'05am',
 u'06',
 u'0600',
 u'060209',
 u'0603',
 u'0604',
 u'0605',
 u'0609',
 u'062',
 u'06397276',
 u'07',
 u'0710',
 u'0714',
 u'0730',
 u'0745',
 u'07oct',
 u'08',
 u'0801',
 u'0806',
 u'0844',
 u'0888',
 u'09',
 u'0901',
 u'0903',
 u'0905',
 u'0915',
 u'0919',
 u'0932',
 u'0bama',
 u'0g2',
 u'10',
 u'100',
 u'1000',
 u'10014',
 u'1002',
 u'100k',
 u'1011',
 u'1014',
 u'102809',
 u'1030',
 u'10458',
 u'1052',
 u'107',
 u'1071',
 u'10_public_opinion_in_perspective_russia',
 u'10am',
 u'10pm',
 u'10s',
 u'11',
 u'110',
 u'1100',
 u'110509',
 u'111',
 u'1111',
 u'1123',
 u'112709',
 u'1140',
 u'

In [60]:
# Create a frequency vectorizer
freq_vect = CountVectorizer(decode_error = 'ignore')
# Call fit to do our frequency vectorization
freq_vect.fit(X_train)
# Check out the dictionary of features
freq_vect.get_feature_names()

[u'00',
 u'000',
 u'0000',
 u'0001',
 u'001',
 u'008',
 u'00am',
 u'00b',
 u'00n',
 u'00pm',
 u'00v',
 u'01',
 u'010000000000',
 u'0102',
 u'0105',
 u'011',
 u'0129',
 u'02',
 u'0203',
 u'020510',
 u'021',
 u'0265',
 u'0271',
 u'03',
 u'0301',
 u'0308',
 u'031406z',
 u'0323',
 u'0380',
 u'0390',
 u'04',
 u'0400',
 u'0418',
 u'04841',
 u'05',
 u'0500',
 u'050710',
 u'0509',
 u'0517',
 u'055',
 u'05am',
 u'06',
 u'0600',
 u'060209',
 u'0603',
 u'0604',
 u'0605',
 u'0609',
 u'062',
 u'06397276',
 u'07',
 u'0710',
 u'0714',
 u'0730',
 u'0745',
 u'07oct',
 u'08',
 u'0801',
 u'0806',
 u'0844',
 u'0888',
 u'09',
 u'0901',
 u'0903',
 u'0905',
 u'0915',
 u'0919',
 u'0932',
 u'0bama',
 u'0g2',
 u'10',
 u'100',
 u'1000',
 u'10014',
 u'1002',
 u'100k',
 u'1011',
 u'1014',
 u'102809',
 u'1030',
 u'10458',
 u'1052',
 u'107',
 u'1071',
 u'10_public_opinion_in_perspective_russia',
 u'10am',
 u'10pm',
 u'10s',
 u'11',
 u'110',
 u'1100',
 u'110509',
 u'111',
 u'1111',
 u'1123',
 u'112709',
 u'1140',
 u'

In [61]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TFIDF vectorizer
tfidf_vect = TfidfVectorizer(decode_error = 'ignore')
# Call fit to do our frequency vectorization
tfidf_vect.fit(X_train)
# Check out the dictionary of features
tfidf_vect.get_feature_names()

[u'00',
 u'000',
 u'0000',
 u'0001',
 u'001',
 u'008',
 u'00am',
 u'00b',
 u'00n',
 u'00pm',
 u'00v',
 u'01',
 u'010000000000',
 u'0102',
 u'0105',
 u'011',
 u'0129',
 u'02',
 u'0203',
 u'020510',
 u'021',
 u'0265',
 u'0271',
 u'03',
 u'0301',
 u'0308',
 u'031406z',
 u'0323',
 u'0380',
 u'0390',
 u'04',
 u'0400',
 u'0418',
 u'04841',
 u'05',
 u'0500',
 u'050710',
 u'0509',
 u'0517',
 u'055',
 u'05am',
 u'06',
 u'0600',
 u'060209',
 u'0603',
 u'0604',
 u'0605',
 u'0609',
 u'062',
 u'06397276',
 u'07',
 u'0710',
 u'0714',
 u'0730',
 u'0745',
 u'07oct',
 u'08',
 u'0801',
 u'0806',
 u'0844',
 u'0888',
 u'09',
 u'0901',
 u'0903',
 u'0905',
 u'0915',
 u'0919',
 u'0932',
 u'0bama',
 u'0g2',
 u'10',
 u'100',
 u'1000',
 u'10014',
 u'1002',
 u'100k',
 u'1011',
 u'1014',
 u'102809',
 u'1030',
 u'10458',
 u'1052',
 u'107',
 u'1071',
 u'10_public_opinion_in_perspective_russia',
 u'10am',
 u'10pm',
 u'10s',
 u'11',
 u'110',
 u'1100',
 u'110509',
 u'111',
 u'1111',
 u'1123',
 u'112709',
 u'1140',
 u'

In [62]:
# For the binary model, training and test X matrices
binary_train_dtm = binary_vect.transform(X_train)
binary_test_dtm = binary_vect.transform(X_test)
# For the frequency model, training and test X matrices
freq_train_dtm = freq_vect.transform(X_train)
freq_test_dtm = freq_vect.transform(X_test)
# For the tfidf model, training and test X matrices
tfidf_train_dtm = tfidf_vect.transform(X_train)
tfidf_test_dtm = tfidf_vect.transform(X_test)

In [63]:
# Import
from sklearn.naive_bayes import MultinomialNB
# Create the model
mnb = MultinomialNB()
# Fit the model to the training data
mnb.fit(freq_train_dtm, y_train)
# Score the model against the test data
mnb.score(freq_test_dtm, y_test)

0.5714285714285714

In [64]:
# Create the model
tfidf_nb = MultinomialNB()
# Fit the model to our training data
tfidf_nb.fit(tfidf_train_dtm, y_train)
# Score the model against our test data
tfidf_nb.score(tfidf_test_dtm, y_test)

0.57875457875457881

In [65]:
# Import metrics
from sklearn import metrics
# Make class predictions for all observations in the test set
y_pred = mnb.predict(freq_test_dtm)
# Print classification accuracy
metrics.accuracy_score(y_pred, y_test)

0.5714285714285714

In [77]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# instantiate the vectorizer ( use variable name as vect)
vect = CountVectorizer(decode_error = 'ignore')
# Fit the vectorizer to the training set X_train
vect.fit(X_train)
# Print out the dictionary of terms with get_feature_names()
vect.get_feature_names()

[u'00',
 u'000',
 u'0000',
 u'0001',
 u'001',
 u'008',
 u'00am',
 u'00b',
 u'00n',
 u'00pm',
 u'00v',
 u'01',
 u'010000000000',
 u'0102',
 u'0105',
 u'011',
 u'0129',
 u'02',
 u'0203',
 u'020510',
 u'021',
 u'0265',
 u'0271',
 u'03',
 u'0301',
 u'0308',
 u'031406z',
 u'0323',
 u'0380',
 u'0390',
 u'04',
 u'0400',
 u'0418',
 u'04841',
 u'05',
 u'0500',
 u'050710',
 u'0509',
 u'0517',
 u'055',
 u'05am',
 u'06',
 u'0600',
 u'060209',
 u'0603',
 u'0604',
 u'0605',
 u'0609',
 u'062',
 u'06397276',
 u'07',
 u'0710',
 u'0714',
 u'0730',
 u'0745',
 u'07oct',
 u'08',
 u'0801',
 u'0806',
 u'0844',
 u'0888',
 u'09',
 u'0901',
 u'0903',
 u'0905',
 u'0915',
 u'0919',
 u'0932',
 u'0bama',
 u'0g2',
 u'10',
 u'100',
 u'1000',
 u'10014',
 u'1002',
 u'100k',
 u'1011',
 u'1014',
 u'102809',
 u'1030',
 u'10458',
 u'1052',
 u'107',
 u'1071',
 u'10_public_opinion_in_perspective_russia',
 u'10am',
 u'10pm',
 u'10s',
 u'11',
 u'110',
 u'1100',
 u'110509',
 u'111',
 u'1111',
 u'1123',
 u'112709',
 u'1140',
 u'

In [74]:
# Generate train_dtm
train_dtm = vect.transform(X_train)
# Generate test_dtm
test_dtm = vect.transform(X_test)

In [78]:
# Import MultinomialNB
from sklearn.naive_bayes import MultinomialNB
# Create model
nb = MultinomialNB()
# Fit model
nb.fit(train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [79]:
#now let's move on to LSI analysis 

In [85]:
#time to train/test/split otherwise we will have an overfitting problem
# Import train_test_split
from sklearn.cross_validation import train_test_split
# Split the data into a 70/30 train/test split
df_emails_train, df_emails_test = train_test_split(df_emails.RawText, test_size=0.3)

In [87]:
## FOR REUTERS
# We're not concerned with the category for now, so select out only the text column into a Series object 'docs'
docs = df_emails_train
docs.head()

7630    UNCLASSIFIED U.S. Department of State Case No....
572     B6\nUNCLASSIFIED U.S. Department of State Case...
5430    UNCLASSIFIED U.S. Department of State Case No....
1260    UNCLASSIFIED U.S. Department of State Case No....
7753    UNCLASSIFIED U.S. Department of State Case No....
Name: RawText, dtype: object

In [94]:
# Let's load the nltk English stopwords list to ignore those in our analysis
import nltk
from nltk.corpus import stopwords
## Download various nltk corpora (used for stopwords here)
#nltk.download()
## Print all english stopwords
stopwords = stopwords.words('english')

In [95]:
'''
The way we're going to attack this is to build out the TDM matrix with the documents as rows and terms as columns
and then we'll call the transpose operator to flip it to the representation we need for LSI.

We need the following:
    1.  Dictionary of word --> index to define vectors (index for each term)
    2.  Dictionary of word --> total count to get the global (IDF)
    3.  Dictionary of word --> document count for each document to get the local (TF) weighting
'''

# Implement a function that returns the 3 dictionaries that we need above
def find_frequencies(docs):
    term_indices = {} ## This is #1 above
    currentIndex = 0 ## This is the counter to make sure we correctly populate the term indices in order
    corpus_bag = {} ## This is #2 above
    doc_bags = [] ## This is the collection for #3 above
    for i, doc in docs.iteritems():
        doc_bag = {} ## This is the dictionary of term frequencies for the doc we're currently examining, doc_bags stores a collection of these
        ## TODO: Tokenize each document with nltk
        doc_tokens = nltk.word_tokenize(doc)
        ## TODO: For each token in the current document:
        for word in doc_tokens:
            ## Optionally ignore stopword and continue
            ## Throw out stopwords
            ##if word in stopwords:
                ##    continue
            ## If the word is new (not in term_indices): 
            if word not in term_indices:
                ## add it to term_indices and give it the index value currentIndex, increment currentIndex
                term_indices[word] = currentIndex
                currentIndex += 1
                ## add it to the corpus_bag with count 1
                corpus_bag[word] = 1
                ## add it to the current doc_bag with count 1
                doc_bag[word] = 1
            ## If the word is not new:
            else:
                ## increment the corpus_bag
                corpus_bag[word] = corpus_bag[word] + 1
                ## If the word is already in the doc_bag, increment that counter, else set it to 1
                if word in doc_bag:
                    doc_bag[word] = doc_bag[word] + 1
                else:
                    doc_bag[word] = 1
        doc_bags.append(doc_bag)
    return term_indices, corpus_bag, doc_bags

In [96]:
term_indices, corpus_bag, doc_bags = find_frequencies(docs)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 10: ordinal not in range(128)

In [98]:
print len(term_indices)

SyntaxError: invalid syntax (<ipython-input-98-4fcd0a884712>, line 1)

In [99]:
print len(doc_bags[0])
print doc_bags[0]

SyntaxError: invalid syntax (<ipython-input-99-e551b2b5c1d7>, line 1)

In [103]:
#lets take the column raw data and do some analysis on it 

df_emails.RawText.info()

AttributeError: 'Series' object has no attribute 'info'

In [106]:
df_emails.RawText.value_counts()

UNCLASSIFIED U.S. Department of State Case No. F-2014-20439 Doc No. C05758700 Date: 06/30/2015\nRELEASE IN PART\nB6\nFrom: H <hrod17@clintonemail.com>\nSent: Friday, June 5, 2009 6:08 PM\nTo: 'ValmoroLJ@state.gov.\nSubject: Fw: Invitation to Copenhagen Key to Climate Investing conference\nLet's discuss at our next mtg.\nOriginal Message\nFrom: Mills, Cheryl D <MillsCD@state.gov>\nTo: H\nSent: Thu Jun 04 14:07:55 2009\nSubject: FW: Invitation to Copenhagen Key to Climate Investing conference\nFYI\nOriginal Message\nFrom: Dick Gephardt [mailto\nSent: Thursday, June 04, 2009 7:47 AM\nTo: Mills, Cheryl D\nSubject: Re: Invitation to Copenhagen Key to Climate Investing conference\nCheryl - Thank you so much for considering this invitation. I am sure the sponsors would be absolutely thrilled to have\nSecretary Clinton appear.\nIn passing let me say what an extraordinary job you are all doing. As an American citizen it gives me great confidence\nand pride to see Secretary Clinton representing 

In [107]:
df_emails.RawText.count()

908