In [2]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import mglearn 
from IPython.display import display 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Perceptron 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.datasets import load_files
%matplotlib 
%matplotlib inline


Using matplotlib backend: MacOSX


In [3]:
# Read the experiment title/description documents from the local 'data' folder.
# The resulting `dataset.data` / `dataset.target` feed the train/test split below.
dataset = load_files(container_path='data')

In [8]:
# Split the 'yes'/'no' documents randomly into training (75%) and test (25%) sets.
# random_state is pinned so the same partition is produced on every run.
# NOTE(review): StratifiedShuffleSplit is imported but not used; train_test_split is
# called without `stratify`, so class proportions are not guaranteed to match.
from sklearn.model_selection import StratifiedShuffleSplit 
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target,
    test_size=0.25, train_size=0.75, random_state=10)

# Sanity check: documents and labels must have matching sizes in each partition.
print(len(docs_train))
print(len(docs_test))
print(len(y_train))
print(len(y_test))

# Fit a unigram+bigram TF-IDF vocabulary on the training documents only and
# vectorize them in a single pass (fit_transform == fit followed by transform).
vect = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                       ngram_range=(1,2), stop_words="english")
X_train = vect.fit_transform(docs_train)
print("Feature matrix:\n{}".format(repr(X_train)))

# Build the feature-name list once; every get_feature_names() call rebuilds it
# from the whole vocabulary.
vect_feature_names = vect.get_feature_names()
print(type(vect_feature_names))
print(len(vect_feature_names))
print(vect_feature_names[:10])

# Column index assigned to one specific bigram (raises KeyError if it is absent
# from the fitted vocabulary).
print(vect.vocabulary_['00 night'])

1784
595
1784
595
Feature matrix:
<1784x79774 sparse matrix of type '<type 'numpy.float64'>'
	with 177753 stored elements in Compressed Sparse Row format>
<type 'list'>
79774
[u'00', u'00 00', u'00 20', u'00 800', u'00 ad', u'00 day', u'00 final', u'00 night', u'000', u'000 50']
7


# Logistic Regression 


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,make_scorer,fbeta_score
from sklearn.pipeline import Pipeline
import mglearn



# TF-IDF features feeding a logistic-regression classifier. The ngram_range and C
# given here are only starting values; the grid search below overrides them.
clf = Pipeline([
    ('vect', TfidfVectorizer(strip_accents='unicode', analyzer='word',
                             ngram_range=(1,2), stop_words="english")),
    ('logreg', LogisticRegression(C=100)),
])

# Hyper-parameter grid, keyed as <pipeline step>__<parameter>.
parameters = {
    'vect__ngram_range': [(1,1),(1,2),(1,3)],
    'vect__max_df': [.7,.8,.9],
    #'vect__min_df':[.1,.2,.3],
    'logreg__C': [1,10,100],
}

# Model selection metric: F-beta on the positive class (label 1). beta > 1 weights
# recall more heavily than precision.
MYBETA = 4
f_scorer = make_scorer(fbeta_score, beta=MYBETA, pos_label=1)

# Exhaustive search over the grid using all available cores.
lr_gs_clf = GridSearchCV(clf, parameters, n_jobs=-1, scoring=f_scorer)
lr_gs_clf = lr_gs_clf.fit(docs_train, y_train)
lr_trainedCLF = lr_gs_clf.best_estimator_
lr_vectorizer = lr_trainedCLF.named_steps["vect"]

# Report the winning value of every searched parameter.
for param_name in sorted(parameters.keys()):
    print("%s:%r" % (param_name, lr_gs_clf.best_params_[param_name]))
print("\n")

# Feature names found by the best vectorizer. Fetch the list once instead of
# rebuilding it for every print.
lr_vocab_terms = lr_vectorizer.get_feature_names()
print(type(lr_vocab_terms))
print(len(lr_vocab_terms))
print(lr_vocab_terms[:10])


In [None]:
# Predictions of the tuned logistic-regression pipeline on both partitions.
lr_y_predicted = lr_trainedCLF.predict(docs_test)
lr_y_train_predicted = lr_trainedCLF.predict(docs_train)
lr_vectorizer = lr_gs_clf.best_estimator_.named_steps["vect"]

# '00 00 night' is a trigram, so it is only in the vocabulary when the grid search
# selected ngram_range=(1,3). Use .get so the cell prints None instead of raising
# KeyError when a shorter n-gram range won.
print(lr_vectorizer.vocabulary_.get('00 00 night'))

# Fetch the feature-name list once and reuse it for the summary and the array.
lr_feature_list = lr_vectorizer.get_feature_names()
print(type(lr_feature_list))
print(len(lr_feature_list))
print(lr_feature_list[:10])

# Array form is what the coefficient inspection/plot cells further down use.
feature_names = np.array(lr_feature_list)

In [None]:
# Classification report for the test set. labels=[1,0] paired with
# target_names=['yes','no'] lists class 1 as 'yes' and class 0 as 'no'.
print("Test Set Classification Report & Confusion Matrix: \n")
print(metrics.classification_report(y_test, lr_y_predicted, target_names=['yes','no'], labels=[1,0]))
print("F1: %5.3f" % f1_score(y_test, lr_y_predicted, pos_label=1))

# Confusion matrix for the test set. scikit-learn's signature is
# confusion_matrix(y_true, y_pred): rows are true classes, columns are predictions.
# Passing the predictions first would transpose the matrix.
cm = metrics.confusion_matrix(y_test, lr_y_predicted, labels=[1,0])
print(cm)
print("\n")

# Classification report for the training set (to compare against the test set).
print("Training Set Classification Report & Confusion Matrix: \n")
print(metrics.classification_report(y_train, lr_y_train_predicted, target_names=['yes','no'], labels=[1,0]))


# Confusion matrix for the training set, same (y_true, y_pred) orientation.
cm = metrics.confusion_matrix(y_train, lr_y_train_predicted, labels=[1,0])
print(cm)


# Inspect the fitted coefficients next to the feature names they belong to.
#lr_trainedCLF.predict_proba(docs_train)
lr_coef = lr_trainedCLF.named_steps["logreg"].coef_
print(lr_coef.shape)
print(lr_coef)
print("")
print(feature_names.shape)
print(feature_names)

In [None]:
# Plot the logistic-regression coefficients against their feature names with
# mglearn, limited to n_top_features=30.
lr_coefficients = lr_trainedCLF.named_steps["logreg"].coef_
mglearn.tools.visualize_coefficients(lr_coefficients, feature_names, n_top_features=30)


# LinearSVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn import metrics 
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score,make_scorer,fbeta_score

In [None]:
# Model selection metric: F-beta on the positive class (label 1).
MYBETA = 4
f_scorer = make_scorer(fbeta_score, beta=MYBETA, pos_label=1)

# TF-IDF features feeding a linear SVM. The vectorizer settings given here are
# starting values; the grid search below overrides them.
clf = Pipeline([
    ('lsvect', TfidfVectorizer(strip_accents='unicode', analyzer='word',
                               stop_words="english", ngram_range=(1,2),
                               max_df=.7, min_df=.1)),
    ('linSVC', LinearSVC()),
])

# Hyper-parameter grid, keyed as <pipeline step>__<parameter>.
# Each C value is listed once: a repeated 100 only makes the search fit the same
# candidates twice per fold.
# NOTE(review): 1000 may have been the intended fourth value -- confirm.
parameters = {
    'lsvect__ngram_range': [(1,1),(1,2)],
    'lsvect__max_df': [.7,.8,.9],
    'lsvect__min_df': [.1,.2,.3],
    'linSVC__C': [1,10,100],
}

# Exhaustive search over the grid using all available cores.
ls_grids_clf = GridSearchCV(clf, parameters, n_jobs=-1, scoring=f_scorer)
ls_grids_clf = ls_grids_clf.fit(docs_train, y_train)

# Keep the refit best pipeline and the feature names of its vectorizer.
ls_trainedCLF = ls_grids_clf.best_estimator_
ls_vectorizer = ls_trainedCLF.named_steps["lsvect"]
ls_feature_names = np.array(ls_vectorizer.get_feature_names())

# Predictions on the test set (ls_y_predicted) and training set (ls_y_predict).
ls_y_predicted = ls_trainedCLF.predict(docs_test)
ls_y_predict = ls_trainedCLF.predict(docs_train)



In [None]:
# Classification report for the test set. labels=[1,0] paired with
# target_names=['yes','no'] lists class 1 as 'yes' and class 0 as 'no'.
print("Test Set Classification Report & Confusion Matrix: \n")
print(metrics.classification_report(y_test, ls_y_predicted, target_names=['yes','no'], labels=[1,0]))
print("F1: %5.3f" % f1_score(y_test, ls_y_predicted, pos_label=1))

# Confusion matrix for the test set. scikit-learn's signature is
# confusion_matrix(y_true, y_pred): rows are true classes, columns are predictions.
# Passing the predictions first would transpose the matrix.
cm = metrics.confusion_matrix(y_test, ls_y_predicted, labels=[1,0])
print(cm)



In [None]:
# Plot the LinearSVC coefficients against their feature names with mglearn,
# limited to n_top_features=10.
ls_coefficients = ls_trainedCLF.named_steps["linSVC"].coef_
mglearn.tools.visualize_coefficients(ls_coefficients, ls_feature_names, n_top_features=10)

## Perceptron

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,make_scorer,fbeta_score

# Model selection metric: F-beta on the positive class (label 1).
# NOTE(review): beta is 2 here while the logistic-regression and LinearSVC cells
# use 4 -- confirm the difference is intentional before comparing models.
MYBETA = 2
f_scorer = make_scorer(fbeta_score, beta=MYBETA, pos_label=1)

# TF-IDF features feeding a perceptron.
# penalty='l2' is set explicitly: Perceptron defaults to penalty=None, in which
# case `alpha` (the regularization multiplier tuned below) is ignored and every
# alpha candidate would fit the exact same model.
clf = Pipeline([
    ('vect', TfidfVectorizer(strip_accents='unicode', analyzer='word', stop_words="english")),
    ('Prcpt', Perceptron(penalty='l2')),
])

# Hyper-parameter grid, keyed as <pipeline step>__<parameter>.
parameters = {
    'vect__ngram_range': [(1,1),(1,2)],
    'vect__max_df': [.7,.8,.9],
    'vect__min_df': [.1,.2,.3],
    'Prcpt__alpha': [.1,.01,.001,.0001],
}

# Exhaustive search over the grid using all available cores.
prcpt_gs_clf = GridSearchCV(clf, parameters, n_jobs=-1, scoring=f_scorer)
prcpt_gs_clf = prcpt_gs_clf.fit(docs_train, y_train)

# Feature names of the best pipeline's vectorizer, used by the coefficient plot.
prcpt_vectorizer = prcpt_gs_clf.best_estimator_.named_steps["vect"]
prcpt_feature_names = np.array(prcpt_vectorizer.get_feature_names())
print("Best params: \n")
print(prcpt_gs_clf.best_params_)


In [None]:
# Take the refit best pipeline from the perceptron grid search and predict on the
# test set (prcpt_y_predicted) and the training set (prcpt_y_predict).
prcpt_trainedCLF = prcpt_gs_clf.best_estimator_
prcpt_y_predicted, prcpt_y_predict = map(prcpt_trainedCLF.predict, (docs_test, docs_train))

In [None]:
# Classification report for the test set. labels=[1,0] paired with
# target_names=['yes','no'] lists class 1 as 'yes' and class 0 as 'no'.
print("Test set: \n")
print(metrics.classification_report(y_test, prcpt_y_predicted, target_names=['yes','no'], labels=[1,0]))

# Confusion matrix for the test set. scikit-learn's signature is
# confusion_matrix(y_true, y_pred): rows are true classes, columns are predictions.
# Passing the predictions first would transpose the matrix.
cm = metrics.confusion_matrix(y_test, prcpt_y_predicted, labels=[1,0])
print(cm)

In [None]:
# Plot the perceptron coefficients against their feature names with mglearn,
# limited to n_top_features=20, then title the figure.
prcpt_coefficients = prcpt_trainedCLF.named_steps["Prcpt"].coef_
mglearn.tools.visualize_coefficients(
    prcpt_coefficients, prcpt_feature_names, n_top_features=20)
plt.title("Perceptron")

# Decision Tree Classifier 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit 

# Reload the documents and redo the 75/25 split with the same random_state=10
# used by the earlier split cell.
dataset = load_files('data')
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target,
    test_size=0.25, train_size=0.75, random_state=10)

# Vectorizer-only pipeline: turns raw documents into TF-IDF features.
clf = Pipeline([
    ('vect', TfidfVectorizer(strip_accents='unicode', analyzer='word',
                             stop_words="english", ngram_range=(1,2),
                             max_df=.7, min_df=.1)),
])

# Pipeline.fit() returns the pipeline itself, not a feature matrix, so use
# fit_transform() for the training documents and transform() (same fitted
# vocabulary, no refit) for the test documents.
X_train_counts = clf.fit_transform(docs_train)
X_test = clf.transform(docs_test)

# Fit the tree on the training features and report accuracy on both partitions.
# '{:.3f}' prints three decimals ('{:3f}' is a width spec, not a precision).
tree = DecisionTreeClassifier(max_depth=6000, random_state=0)
tree.fit(X_train_counts, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train_counts, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))