In [31]:
# %load find_signature.py
#!/usr/bin/python3

import joblib
import numpy
### Seed numpy's global random generator so the run is repeatable.
numpy.random.seed(42)


### The words (features) and authors (labels), already largely processed.
### These files should have been created from the previous (Lesson 10)
### mini-project.
words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"

### Open each pickle inside a `with` block so the file handle is closed as soon
### as loading finishes (the bare open(...) calls were never closed).
### NOTE: joblib.load unpickles its input, which can execute arbitrary code;
### only load files you produced yourself.
with open(words_file, "rb") as words_handle:
    word_data = joblib.load(words_handle)
with open(authors_file, "rb") as authors_handle:
    authors = joblib.load(authors_handle)


In [32]:


### Hold out a fraction of the emails for testing (test_size=0.1); the
### remainder is used for training. The split is seeded for repeatability.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

train_texts, test_texts, labels_train, labels_test = train_test_split(
    word_data, authors, test_size=0.1, random_state=42
)

### Learn the TF-IDF vocabulary and weights from the training emails only,
### then apply that same fitted vectorizer to the test emails.
### The test matrix is converted to a dense array (kept for compatibility with
### classifier functions in versions 0.15.2 and earlier).
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
features_train = vectorizer.fit_transform(train_texts)
features_test = vectorizer.transform(test_texts).toarray()


In [33]:
### Report the size of the TF-IDF training matrix: (rows, columns).
n_rows, n_columns = features_train.shape
print((n_rows, n_columns))

(15820, 37861)


In [34]:
### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
N_OVERFIT_EVENTS = 150

### Slice first and densify only if the slice still offers .toarray(). This
### keeps the cell safe to re-run: on a second execution features_train is
### already a dense array, and calling .toarray() on it would raise
### AttributeError.
features_subset = features_train[:N_OVERFIT_EVENTS]
if hasattr(features_subset, "toarray"):
    features_subset = features_subset.toarray()
features_train = features_subset
labels_train = labels_train[:N_OVERFIT_EVENTS]



In [35]:
### your code goes here
from sklearn.tree import DecisionTreeClassifier

### Train a default decision tree on the reduced training set; fit() hands
### back the estimator, so construction and training are chained.
dt_less_features = DecisionTreeClassifier().fit(features_train, labels_train)

### Mean accuracy on the held-out test emails.
accuracy_overfit = dt_less_features.score(features_test, labels_test)
print(accuracy_overfit)

0.8168373151308305


In [36]:
### Take your (overfit) decision tree and use the feature_importances_ attribute to get
### a list of the relative importance of all the features being used.
### We suggest only printing out the feature importance if it’s above (0.2).
### What’s the importance of the most important feature? What is the number of this feature?
all_feature_importances = dt_less_features.feature_importances_

### Collect (importance, feature number) pairs for every feature whose
### importance reaches the 0.2 threshold, in feature-number order.
important_features = []
for feature_number, importance in enumerate(all_feature_importances):
    if importance >= 0.2:
        important_features.append((importance, feature_number))
print(important_features)


[(0.36363636363636365, 21323)]


In [39]:
### In order to figure out what words are causing the problem, you need to go back to the TfIdf and
### use the feature numbers that you obtained in the previous part of the mini-project to get the associated words.
### You can return a list of all the words in the TfIdf by calling get_feature_names_out() on it
### (get_feature_names() on scikit-learn releases that predate get_feature_names_out());
### Pull out the word that’s causing most of the discrimination of the decision tree.
### What is it? Does it make sense as a word that’s uniquely tied to either Chris Germany or Sara Shackleton, a signature of sorts?

### Prefer get_feature_names_out(); fall back to the older get_feature_names()
### only when the installed scikit-learn does not provide the newer method.
_list_feature_names = getattr(vectorizer, "get_feature_names_out", None)
if _list_feature_names is None:
    _list_feature_names = vectorizer.get_feature_names
all_words = _list_feature_names()

### Derive the feature number from the fitted tree instead of hard-coding it:
### the number changes every time the vectorizer is rebuilt (for example after
### removing a signature word and rerunning), so a literal index would
### silently point at the wrong word.
top_feature_number = int(dt_less_features.feature_importances_.argmax())
print(all_words[top_feature_number])

houectect


In [38]:
# This word seems like an outlier in a certain sense, so let’s remove it and refit. 
# Go back to text_learning/vectorize_text.py, and remove this word from the emails 
# using the same method you used to remove “sara”, “chris”, etc. 
# Rerun vectorize_text.py, and once that finishes, rerun find_signature.py. 
# Any other outliers pop up? What word is it?