In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
reports = pd.read_csv('../data/reports/reports_v2.csv')
stats = pd.read_csv('../data/stats/stats_v2.csv')

In [3]:
# Replace whitespace
reports['name'] = reports['name'].str.replace(' ','_')
stats['name'] = stats['name'].str.replace(' ','_')

# Remove quotes because of forbidden characters in windows filenames
reports['name'] = reports['name'].str.replace('"','')
stats['name'] = stats['name'].str.replace('"','')

In [4]:
# Shift drasft numbers because of new jersey's forfeited pick
reports_shift = reports[(reports['draft_year'] == 2011) & (reports['draft_num'] >= 69)]
reports_shift['draft_num'] += 1
reports = reports[~reports['name'].isin(reports_shift['name'])]
reports = pd.concat([reports,reports_shift])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
reports2019 = reports[reports['draft_year'] == 2019]
reports_hist = reports[reports['draft_year'] != 2019]
merged = pd.merge(reports_hist.drop(columns='name'),stats,on=['draft_year','draft_num'],how='inner')
merged['NHL'] = (merged['GP'] > 0).astype(int)
#merged.head()

In [6]:
reports2019.head()

Unnamed: 0,draft_num,draft_year,name,report
235,0,2019,Cooper_Moore,Committed to the Uiniversity of North Dakota..
238,0,2019,Eric_Ciccolini,Committed to the University of of Michigan.
240,0,2019,Mike_Koster,His high school years started with him battlin...
241,0,2019,Braden_Doyle,Committed to Boston University.
242,0,2019,Maxence_Guenette,A wide framed defender with soft hands for rec...


In [7]:
merged.groupby(['draft_year','NHL']).size()

draft_year  NHL
2011        0       51
            1       93
2012        0       87
            1       88
2013        0       82
            1       90
2014        0      104
            1       69
2015        0      101
            1       74
2016        0      105
            1       45
2017        0      119
            1       26
2018        0      154
            1        8
dtype: int64

In [8]:
# Define validation set
mask = (merged['draft_year'] >= 2016) & (merged['NHL'] == False)
valid = pd.concat([reports2019,merged[mask][['draft_num','draft_year','name','report']]])

In [9]:
valid.head()

Unnamed: 0,draft_num,draft_year,name,report
235,0,2019,Cooper_Moore,Committed to the Uiniversity of North Dakota..
238,0,2019,Eric_Ciccolini,Committed to the University of of Michigan.
240,0,2019,Mike_Koster,His high school years started with him battlin...
241,0,2019,Braden_Doyle,Committed to Boston University.
242,0,2019,Maxence_Guenette,A wide framed defender with soft hands for rec...


In [10]:
train = merged[~mask]
train['NHL'].value_counts()

1    493
0    425
Name: NHL, dtype: int64

# ==============Model==============

In [11]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import metrics

In [12]:
cv = CountVectorizer()
X_train_counts = cv.fit_transform(train.report)
X_train_counts.shape

(918, 4245)

In [13]:
cv.vocabulary_.get('juniors')

2040

In [14]:
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_counts)
X_train_tfidf.shape

(918, 4245)

In [15]:
clf = MultinomialNB().fit(X_train_tfidf,train.NHL)
X_valid_counts = cv.transform(valid.report)
X_valid_tfidf = tfidf.transform(X_valid_counts)
predicted = clf.predict(X_valid_tfidf)

In [16]:
valid['NHL'] = predicted
valid[valid['draft_year'] == 2019].NHL.value_counts()

1    153
0     22
Name: NHL, dtype: int64

In [25]:
np.mean(clf.predict(X_train_tfidf) == train.NHL)

0.7331154684095861

In [19]:
# Naive Bayes
clf1 = Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('clf',MultinomialNB())
])

X_train,X_test,y_train,y_test = train_test_split(train.report,train.NHL,test_size=0.33,random_state=6)
clf1.fit(X_train,y_train)
predicted = clf1.predict(X_test)
np.mean(predicted == y_test)

0.5742574257425742

In [24]:
np.mean(clf1.predict(train.report) == train.NHL)

0.6982570806100218

In [45]:
# Linear SVM
clf2 = Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('clf',SGDClassifier(penalty='l2',alpha=1e-3,
                         max_iter=5,tol=None))
])

clf2.fit(train.report,train.NHL)
predicted = clf2.predict(X_test)
np.mean(predicted == y_test)

0.9273927392739274

In [48]:
np.mean(clf2.predict(train.report) == train.NHL)

0.9019607843137255

In [38]:
train[clf2.predict(train.report) != train.NHL]

Unnamed: 0,draft_num,draft_year,report,A,G,GP,PIM,Pts,draft_team,junior_team,name,pos,round_num,NHL
5,48,2013,Versatile player with willingness to aggressiv...,0,0,0,0,0,Detroit,Owen Sound Attack [OHL],Zach_Nastasiuk,R,2,0
12,209,2014,Gifted scorer who has a knack for scoring. Ver...,0,0,0,0,0,Los Angeles,Kingston Frontenacs [OHL],Spencer_Watson,R,7,0
58,141,2016,Oversized lanky wing with surprising feet and ...,0,0,4,0,0,NY Rangers,Soo Greyhounds [OHL],Tim_Gettinger,L,5,1
123,76,2016,Committed to the University of Minnesota.,0,0,1,2,0,Nashville,Muskegon Lumberjacks [USHL],Rem_Pitlick,C,3,1
348,201,2015,Under six foot carrier who has very good puck ...,0,0,0,0,0,Minnesota,Soo Greyhounds [OHL],Gustav_Bouramman,D,7,0
364,168,2015,Second year eligible who was a disruptive forc...,7,3,36,4,10,Winnipeg,Tri-City Storm [USHL],Mason_Appleton,C,6,1
371,162,2015,"High energy wing with great wheels, strong han...",0,0,0,0,0,Florida,Tri-City Storm [USHL],Chris_Wilkie,R,6,0
376,150,2015,Blossoming rearguard with great all directiona...,0,0,0,0,0,Tampa Bay,Sioux City Musketeers [USHL],Ryan_Zuhlsdorf,D,5,0
388,137,2015,Third year eligible centre who never takes a s...,30,12,109,34,42,Pittsburgh,Plzen HC [Czech],Dominik_Simon,C,5,1
390,135,2015,Undersized fireplug of a wing who was the numb...,0,0,0,0,0,Minnesota,Novokuznetsk Metallurg [KHL],Kirill_Kaprizov,L,5,0


In [88]:
print(metrics.classification_report(y_test,predicted))

              precision    recall  f1-score   support

           0       0.61      0.53      0.57       143
           1       0.63      0.70      0.66       160

   micro avg       0.62      0.62      0.62       303
   macro avg       0.62      0.62      0.62       303
weighted avg       0.62      0.62      0.62       303



In [211]:
params = {
    'vect__ngram_range':[(1,1),(1,2)],
    'tfidf__use_idf':(True,False),
    'clf__alpha':(1e-2,1e-3),
    'clf__loss':('hinge','log'),
    'clf__penalty':('l1','l2','elasticnet'),
}
gs_clf = GridSearchCV(clf2,params,cv=5,iid=False,n_jobs=-1)
gs_clf.fit(train.report,train.NHL)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ndom_state=1, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (0.01, 0.001), 'clf__loss': ('hinge', 'log'), 'clf__penalty': ('l1', 'l2', 'elasticnet')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [246]:
gs_clf.best_score_

0.6263839391779521

In [243]:
best_clf = gs_clf.best_estimator_
print(gs_clf.best_params_)

{'clf__alpha': 0.001, 'clf__loss': 'log', 'clf__penalty': 'elasticnet', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [244]:
print(best_clf.steps)

[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False,
         use_idf=False)), ('clf', SGDClassifier(alpha=0.001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='elasticnet',
       power_t=0.5, random_state=1, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]


In [214]:
train_predictions = best_clf.fit(train.report,train.NHL).predict(train.report)
valid_predictions = best_clf.fit(train.report,train.NHL).predict(valid.report)

In [215]:
np.mean(train_predictions == train.NHL)

0.6993464052287581

In [216]:
# Erroneous predictions
train['predictions'] = train_predictions
train[train.NHL != train.predictions]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,draft_num,draft_year,report,A,G,GP,PIM,Pts,draft_team,junior_team,name,pos,round_num,NHL,predictions
3,51,2013,Strong large molded Swedish defenseman that is...,9,0,49,6,9,Chicago,Linkoping Jrs. (Sweden),Carl_Dahlstrom,D,2,1,0
4,59,2011,More of a stay at home defender who does have ...,0,0,0,0,0,Florida,Rogle BK Angelholm [Swe-1],Rasmus_Bengtsson,D,2,0,1
5,48,2013,Versatile player with willingness to aggressiv...,0,0,0,0,0,Detroit,Owen Sound Attack [OHL],Zach_Nastasiuk,R,2,0,1
6,47,2013,"Aggressive offensive defender with a top gear,...",0,0,0,0,0,St. Louis,Minnetonka (Minn. H.S.),Tommy_Vannelli,D,2,0,1
7,44,2013,Athletic hybrid goaler with good size and abil...,2,0,29,4,2,Pittsburgh,Edmonton Oil Kings [WHL],Tristan_Jarry,G,2,1,0
9,49,2013,"Displays good speed, and offensive gifts. A de...",0,0,0,0,0,San Jose,Baie-Comeau Drakkar [QMJHL],Gabryel_Boudreau,L,2,0,1
12,209,2014,Gifted scorer who has a knack for scoring. Ver...,0,0,0,0,0,Los Angeles,Kingston Frontenacs [OHL],Spencer_Watson,R,7,0,1
58,141,2016,Oversized lanky wing with surprising feet and ...,0,0,4,0,0,NY Rangers,Soo Greyhounds [OHL],Tim_Gettinger,L,5,1,0
123,76,2016,Committed to the University of Minnesota.,0,0,1,2,0,Nashville,Muskegon Lumberjacks [USHL],Rem_Pitlick,C,3,1,0
198,39,2016,Hampered by the graduation of his former linem...,59,69,164,21,128,Chicago,Erie Otters [OHL],Alex_DeBrincat,L,2,1,0


In [245]:
valid['NHL'] = valid_predictions
valid[valid.draft_year == 2019]

Unnamed: 0,draft_num,draft_year,name,report,NHL
235,0,2019,Cooper_Moore,Committed to the Uiniversity of North Dakota..,0
238,0,2019,Eric_Ciccolini,Committed to the University of of Michigan.,0
240,0,2019,Mike_Koster,His high school years started with him battlin...,1
241,0,2019,Braden_Doyle,Committed to Boston University.,0
242,0,2019,Maxence_Guenette,A wide framed defender with soft hands for rec...,1
243,0,2019,Jackson_Millar,Committed to Colorado College.,0
244,0,2019,Jeremi_Gerber,Strong skating Swiss winger with a powerful sm...,1
245,0,2019,Drew_Helleson,Tall USA National Development defenseman with ...,1
246,0,2019,Danny_Weight,"Doug's son has good size, vision, hands and a ...",0
247,0,2019,Kyle_Topping,Under six foot centre with a smooth skating st...,1


In [218]:
vect = best_clf.get_params()['vect']
tfidf = best_clf.get_params()['tfidf']
clf = best_clf.get_params()['clf']
class_labels = best_clf.classes_
feature_names = vect.get_feature_names()

In [228]:
# Negative features
[feature_names[i] for i in clf.coef_[0].argsort()[:100]]

['good',
 'game',
 'university',
 'term',
 'has',
 'developing',
 'offensive',
 'big',
 'development',
 'size and',
 'his feet',
 'some',
 'feet',
 'balance',
 'all',
 'need',
 'the university',
 'long term',
 'defensive',
 'goaltender',
 'and is',
 'physical',
 'who has',
 'raw',
 'but',
 'nice',
 'must',
 'and good',
 'solid',
 'wall',
 'committed to',
 'university of',
 'zone',
 'to develop',
 'prospect',
 'far',
 'projects as',
 'committed',
 'like',
 'as',
 'developmental',
 'two way',
 'projects',
 'improving',
 'abilities',
 'on his',
 'hard',
 'displays',
 'passes',
 'far from',
 'has committed',
 'if',
 'prospect with',
 'long',
 'the cycle',
 'positioning',
 'develop',
 'this',
 'way',
 'with the',
 'physicality',
 'cycle',
 'wing',
 'things',
 'does',
 'shutdown',
 'they',
 'that will',
 'but already',
 'has some',
 'and offensive',
 'open',
 'size',
 'school',
 'as he',
 'usa',
 'also',
 'second',
 'in front',
 'frame',
 'time to',
 'where',
 'winger',
 'continue',
 'who is

In [242]:
# Positive features
[feature_names[i] for i in clf.coef_[0].argsort()[-100:][::-1]]

['he',
 'great',
 'high',
 'in',
 'quick',
 'with',
 'the',
 'on',
 'stick',
 'him',
 'speed',
 'being',
 'scorer',
 'you',
 'line',
 'he is',
 'excellent',
 'end',
 'many',
 'are',
 'skater who',
 'due',
 'skater',
 'due to',
 'top',
 'get',
 'foot',
 'go',
 'but he',
 'set',
 'upside',
 'team',
 'how',
 'net',
 'smart',
 'early',
 'that',
 'and',
 'the best',
 'player',
 'world',
 'nhl',
 'with excellent',
 'able',
 'draft',
 'speed and',
 'the puck',
 'players',
 'attack',
 'shift',
 'release',
 'defenders',
 'able to',
 'undersized',
 'hockey',
 'the net',
 'his skating',
 'set up',
 'can',
 'vision and',
 'player who',
 'play',
 'shot',
 'of the',
 'to get',
 'strong',
 'junior',
 'his team',
 'improvement',
 'jump',
 'offensive upside',
 'well',
 'and has',
 'accurate',
 'vision',
 'without',
 'very',
 'junior under',
 'getting',
 'passing',
 'be the',
 'moves',
 'world junior',
 'with his',
 'moves well',
 'whose',
 'three',
 'have',
 'when',
 'space',
 'needs more',
 'and will'

In [239]:
clf.coef_[0][13108]

1.9763950475962724

# ==============NHL Corpus==============

In [1]:
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
import random

In [2]:
nhl = CategorizedPlaintextCorpusReader(root='../data/NHLcorpus/',fileids=r'.*\.txt',cat_pattern='(\w+)/*')

In [3]:
documents = [(list(nhl.words(fileid)),category)
             for category in nhl.categories()
             for fileid in nhl.fileids(category)]
random.shuffle(documents)

In [4]:
all_words = nltk.FreqDist(w.lower() for w in nhl.words())
all_words.most_common(1000)

[('.', 4755),
 ('and', 3597),
 ('the', 3332),
 (',', 2812),
 ('a', 2410),
 ('to', 2085),
 ('he', 1842),
 ('his', 1762),
 ('in', 1451),
 ('is', 1236),
 ('with', 1058),
 ('-', 1025),
 ('has', 939),
 ('of', 907),
 ('who', 888),
 ('on', 804),
 ('good', 780),
 ('but', 573),
 ('puck', 567),
 ('as', 561),
 ('strong', 455),
 ('an', 414),
 ('at', 410),
 ('for', 402),
 ('that', 401),
 ('game', 400),
 ('player', 391),
 ('him', 369),
 ('can', 364),
 ('be', 347),
 ('will', 334),
 ('plays', 331),
 ('well', 301),
 ('up', 299),
 ('offensive', 297),
 ('play', 295),
 ('all', 289),
 ('big', 279),
 ("'", 268),
 ('excellent', 257),
 ('shot', 248),
 ('very', 248),
 ('needs', 240),
 ('size', 235),
 ('more', 230),
 ('was', 229),
 ('get', 224),
 ('t', 219),
 ('speed', 219),
 ('team', 215),
 ('it', 215),
 ('not', 215),
 ('zone', 207),
 ('when', 205),
 ('ice', 201),
 ('skills', 201),
 ('quick', 198),
 ('work', 194),
 ('defensive', 190),
 ('this', 188),
 ('long', 186),
 ('feet', 182),
 ('one', 181),
 ('hands', 18

In [5]:
word_features = [w[0] for w in all_words.most_common(1000)]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = word in document_words
    return features

In [6]:
document_features(nhl.words(nhl.fileids('false')[86]))

{'contains(.)': True,
 'contains(and)': True,
 'contains(the)': True,
 'contains(,)': True,
 'contains(a)': True,
 'contains(to)': True,
 'contains(he)': True,
 'contains(his)': True,
 'contains(in)': True,
 'contains(is)': True,
 'contains(with)': False,
 'contains(-)': True,
 'contains(has)': True,
 'contains(of)': True,
 'contains(who)': True,
 'contains(on)': False,
 'contains(good)': False,
 'contains(but)': True,
 'contains(puck)': False,
 'contains(as)': False,
 'contains(strong)': False,
 'contains(an)': True,
 'contains(at)': True,
 'contains(for)': True,
 'contains(that)': False,
 'contains(game)': False,
 'contains(player)': True,
 'contains(him)': False,
 'contains(can)': True,
 'contains(be)': False,
 'contains(will)': False,
 'contains(plays)': True,
 'contains(well)': False,
 'contains(up)': False,
 'contains(offensive)': False,
 'contains(play)': False,
 'contains(all)': False,
 'contains(big)': False,
 "contains(')": False,
 'contains(excellent)': False,
 'contains(sho

In [7]:
random.shuffle(documents)
featuresets = [(document_features(d),c) for (d,c) in documents]
train_set,test_set = featuresets[818:],featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier,test_set)

0.54

In [8]:
classifier.show_most_informative_features(1000)

Most Informative Features
          contains(will) = True             true : false  =      5.8 : 1.0
         contains(front) = True             true : false  =      5.0 : 1.0
        contains(season) = True             true : false  =      5.0 : 1.0
      contains(attacker) = True             true : false  =      5.0 : 1.0
         contains(solid) = True             true : false  =      5.0 : 1.0
          contains(open) = True             true : false  =      4.3 : 1.0
         contains(tools) = True            false : true   =      4.3 : 1.0
       contains(himself) = True            false : true   =      4.3 : 1.0
          contains(have) = True             true : false  =      4.1 : 1.0
        contains(scorer) = True             true : false  =      3.8 : 1.0
         contains(seems) = True             true : false  =      3.7 : 1.0
         contains(doesn) = True             true : false  =      3.7 : 1.0
      contains(progress) = True            false : true   =      3.7 : 1.0

# ==========Filtering Words NHL==========

In [9]:
from nltk.corpus import stopwords
import pickle
import re

In [10]:
# Filter stop words
# Convert to lower case for case insensitive matching
# Remove punctuation
stop_words = set(stopwords.words('english'))

documents_filtered = [([re.sub(r'\W+','',w.lower()) for  w in nhl.words(fileid) if w.lower() not in stop_words],category)
                      for category in nhl.categories()
                      for fileid in nhl.fileids(category)]

In [11]:
words_filtered = nltk.FreqDist(re.sub(r'\W+','',w.lower()) for w in nhl.words() if w.lower() not in stop_words)
words_filtered.most_common(1000)

[('', 9279),
 ('good', 780),
 ('puck', 567),
 ('strong', 455),
 ('game', 400),
 ('player', 391),
 ('plays', 331),
 ('well', 301),
 ('offensive', 297),
 ('play', 295),
 ('big', 279),
 ('excellent', 257),
 ('shot', 248),
 ('needs', 240),
 ('size', 235),
 ('get', 224),
 ('speed', 219),
 ('team', 215),
 ('zone', 207),
 ('ice', 201),
 ('skills', 201),
 ('quick', 198),
 ('work', 194),
 ('defensive', 190),
 ('long', 186),
 ('feet', 182),
 ('one', 181),
 ('hands', 180),
 ('skating', 178),
 ('defender', 176),
 ('junior', 176),
 ('ability', 175),
 ('may', 171),
 ('high', 171),
 ('end', 164),
 ('displays', 162),
 ('great', 161),
 ('wing', 157),
 ('attack', 157),
 ('stick', 152),
 ('hard', 147),
 ('world', 147),
 ('prospect', 146),
 ('way', 139),
 ('top', 135),
 ('two', 133),
 ('solid', 131),
 ('point', 131),
 ('first', 130),
 ('time', 130),
 ('term', 129),
 ('vision', 128),
 ('upside', 128),
 ('strength', 121),
 ('power', 120),
 ('make', 119),
 ('wall', 119),
 ('man', 118),
 ('nhl', 117),
 ('year

In [12]:
word_features = [w[0] for w in words_filtered.most_common(1000)]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = word in document_words
    return features

In [13]:
random.shuffle(documents_filtered)
featuresets_filtered = [(document_features(d),c) for (d,c) in documents_filtered]
train_filtered,test_filtered = featuresets_filtered[818:],featuresets_filtered[:100]
classifier_filtered = nltk.NaiveBayesClassifier.train(train_filtered)
nltk.classify.accuracy(classifier_filtered,test_filtered)

0.58

In [14]:
classifier_filtered.show_most_informative_features(100)

Most Informative Features
           contains(get) = True             true : false  =      6.0 : 1.0
          contains(high) = True             true : false  =      5.2 : 1.0
         contains(shows) = True             true : false  =      5.2 : 1.0
          contains(like) = True             true : false  =      4.7 : 1.0
     contains(defenders) = True             true : false  =      4.7 : 1.0
        contains(upside) = True             true : false  =      4.3 : 1.0
          contains(much) = True             true : false  =      3.9 : 1.0
          contains(jump) = True             true : false  =      3.9 : 1.0
         contains(stick) = True             true : false  =      3.8 : 1.0
         contains(hands) = True             true : false  =      3.8 : 1.0
         contains(named) = True            false : true   =      3.8 : 1.0
       contains(package) = True            false : true   =      3.8 : 1.0
    contains(tremendous) = True            false : true   =      3.8 : 1.0

In [15]:
# Overall most important features
nb_filtered = nltk.NaiveBayesClassifier.train(featuresets_filtered)
nb_filtered.show_most_informative_features(1000)

Most Informative Features
       contains(juniors) = True             true : false  =      6.6 : 1.0
       contains(without) = True             true : false  =      6.0 : 1.0
       contains(dynamic) = True             true : false  =      6.0 : 1.0
      contains(dominant) = True             true : false  =      5.7 : 1.0
           contains(say) = True             true : false  =      5.5 : 1.0
        contains(reacts) = True             true : false  =      5.5 : 1.0
          contains(turn) = True             true : false  =      5.5 : 1.0
        contains(finish) = True             true : false  =      5.5 : 1.0
          contains(pace) = True             true : false  =      5.1 : 1.0
         contains(panic) = True             true : false  =      4.9 : 1.0
       contains(options) = True             true : false  =      4.9 : 1.0
         contains(rough) = True             true : false  =      4.9 : 1.0
    contains(defensemen) = True             true : false  =      4.3 : 1.0

In [16]:
nltk.classify.accuracy(nb_filtered,featuresets_filtered)

0.7145969498910676

In [48]:
save_classifier = open('..\classifiers\\nltk_nb_filter.pickle','wb')
pickle.dump(nb_filtered,save_classifier)
save_classifier.close()

# ==========Sklearn Classifiers==========

In [20]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.svm import LinearSVC

In [30]:
SVC_classifier = SklearnClassifier(LinearSVC())
random.shuffle(documents_filtered)
featuresets_filtered = [(document_features(d),c) for (d,c) in documents_filtered]
train_filtered,test_filtered = featuresets_filtered[818:],featuresets_filtered[:100]
SVC_classifier.train(train_filtered)
nltk.classify.accuracy(SVC_classifier,featuresets_filtered)

0.5904139433551199

In [31]:
MN_classifier = SklearnClassifier(MultinomialNB())
random.shuffle(documents_filtered)
featuresets_filtered = [(document_features(d),c) for (d,c) in documents_filtered]
train_filtered,test_filtered = featuresets_filtered[818:],featuresets_filtered[:100]
MN_classifier.train(train_filtered)
nltk.classify.accuracy(MN_classifier,featuresets_filtered)

0.6176470588235294

In [32]:
BN_classifier = SklearnClassifier(BernoulliNB())
random.shuffle(documents_filtered)
featuresets_filtered = [(document_features(d),c) for (d,c) in documents_filtered]
train_filtered,test_filtered = featuresets_filtered[818:],featuresets_filtered[:100]
BN_classifier.train(train_filtered)
nltk.classify.accuracy(BN_classifier,featuresets_filtered)

0.5958605664488017