**Lesson 11: Feature Selection**

This is the notebook for Lesson 11: Feature Selection. I start by opening the starter code and running it from the notebook. 

In [53]:
# %load find_signature.py
#!/usr/bin/python

import pickle
import numpy
numpy.random.seed(42)


### The words (features) and authors (labels), already 
### largely processed. These files should
### have been created from the previous (Lesson 10)
### mini-project.
words_file = "../text_learning/your_word_data.pkl" 
authors_file = "../text_learning/your_email_authors.pkl"
word_data = pickle.load( open(words_file, "r"))
authors = pickle.load( open(authors_file, "r") )



### test_size is the percentage of events assigned to the 
### test set (the remainder go into training)
### feature matrices changed to dense representations 
### for compatibility with classifier 
### functions in versions 0.15.2 and earlier
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test \
    = cross_validation.train_test_split(word_data, authors,\
    test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]



### your code goes here
# I should get a decision tree up and training on 
# the training set





In [54]:
features_train[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [55]:
features_train[:,2]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [56]:
len(features_train)

150

In [57]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [58]:
pred = clf.predict(features_test)

In [59]:
len(pred)

1758

In [60]:
len(labels_train)

150

In [61]:
from sklearn.metrics import accuracy_score

In [62]:
accuracy_score(pred, labels_test)

0.94766780432309439

In [63]:
accuracy_score(labels_test, pred)

0.94766780432309439

Find the most important features using the *feature_importance_* method. 

In [64]:
importance = clf.feature_importances_

In [65]:
importance_high = []
for score in importance: 
    if score > 0.2:
        importance_high.append(score)
        

In [66]:
importance_high

[0.76470588235294124]

In [67]:
len(importance_high)

1

In [68]:
importance

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [69]:
type(importance)

numpy.ndarray

In [70]:
import numpy as np
np.count_nonzero(importance)

4

In [71]:
np.where(importance.nonzero())

(array([0, 0, 0, 0]), array([0, 1, 2, 3]))

I have no idea what that means. 

In [72]:
itemindex = np.where(importance>=0.5)

In [73]:
print itemindex

(array([33614]),)


Use TfIdf to get the most important word. 

So to figure out what word is causing the problem I have to go back to the feature numbers I found before and get the words associated with them. We use the get_feature_names() method to get the words driving the descrimination.  

In [82]:
vectorizer.get_feature_names()

[u'00',
 u'000',
 u'0000',
 u'00000',
 u'0000000',
 u'00000000',
 u'000000000083213',
 u'00000365797',
 u'0000103079',
 u'00001231156',
 u'00001231162',
 u'00001246475',
 u'00002873401',
 u'0001',
 u'00015',
 u'0002',
 u'00025dth',
 u'0003',
 u'000300094',
 u'000300389',
 u'00031letdoc',
 u'000430',
 u'00046317doc',
 u'0005',
 u'00050',
 u'000500',
 u'000529',
 u'000531',
 u'0005dth',
 u'0007',
 u'00094',
 u'000960',
 u'000c05a6',
 u'000columbia',
 u'001',
 u'00102612158',
 u'00115',
 u'0013',
 u'0014',
 u'00150',
 u'00154',
 u'0016',
 u'00172794',
 u'0018',
 u'0019',
 u'001c01c09db884b56e80d16656d1ms217432',
 u'001d01c187344ece32f06900a8c0officemgr',
 u'002',
 u'0020',
 u'0021',
 u'002103',
 u'0022',
 u'00220',
 u'0023',
 u'00236',
 u'0024',
 u'002409',
 u'00244',
 u'002443',
 u'0028',
 u'002901c06012f535de80dbd54fd8soppgesupplygecom',
 u'003',
 u'0030',
 u'00305',
 u'0032',
 u'0033909',
 u'0035',
 u'003581',
 u'003589',
 u'0036',
 u'003666',
 u'003668',
 u'003680684doc',
 u'003680726

What is the TfIdf in this context? Is is the vectorizer object? Have a saved a list called TfIdf somewhere? Ok, I am going to have to go back to the start of the mini-project to get this going correctly. 