In [1]:
import pandas as pd
import os
import re

In [8]:
reports = pd.read_csv('../data/reports/reports_v2.csv')
stats = pd.read_csv('../data/stats/stats_v2.csv')

In [9]:
# Replace whitespace
reports['name'] = reports['name'].str.replace(' ','_')
stats['name'] = stats['name'].str.replace(' ','_')

# Remove quotes because of forbidden characters in windows filenames
reports['name'] = reports['name'].str.replace('"','')
stats['name'] = stats['name'].str.replace('"','')

In [10]:
# Shift drasft numbers because of new jersey's forfeited pick
reports_shift = reports[(reports['draft_year'] == 2011) & (reports['draft_num'] >= 69)]
reports_shift['draft_num'] += 1
reports = reports[~reports['name'].isin(reports_shift['name'])]
reports = pd.concat([reports,reports_shift])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [22]:
reports2019 = reports[reports['draft_year'] == 2019]
reports_hist = reports[reports['draft_year'] != 2019]
merged = pd.merge(reports_hist.drop(columns='name'),stats,on=['draft_year','draft_num'],how='inner')
merged['NHL'] = (merged['GP'] > 0).astype(int)
#merged.head()

In [23]:
reports2019.head()

Unnamed: 0,draft_num,draft_year,name,report
235,0,2019,Cooper_Moore,Committed to the Uiniversity of North Dakota..
238,0,2019,Eric_Ciccolini,Committed to the University of of Michigan.
240,0,2019,Mike_Koster,His high school years started with him battlin...
241,0,2019,Braden_Doyle,Committed to Boston University.
242,0,2019,Maxence_Guenette,A wide framed defender with soft hands for rec...


In [24]:
merged.groupby(['draft_year','NHL']).size()

draft_year  NHL
2011        0       51
            1       93
2012        0       87
            1       88
2013        0       82
            1       90
2014        0      104
            1       69
2015        0      101
            1       74
2016        0      105
            1       45
2017        0      119
            1       26
2018        0      154
            1        8
dtype: int64

In [25]:
# Define validation set
mask = (merged['draft_year'] >= 2016) & (merged['NHL'] == False)
valid = pd.concat([reports2019,merged[mask][['draft_num','draft_year','name','report']]])

In [26]:
valid.head()

Unnamed: 0,draft_num,draft_year,name,report
235,0,2019,Cooper_Moore,Committed to the Uiniversity of North Dakota..
238,0,2019,Eric_Ciccolini,Committed to the University of of Michigan.
240,0,2019,Mike_Koster,His high school years started with him battlin...
241,0,2019,Braden_Doyle,Committed to Boston University.
242,0,2019,Maxence_Guenette,A wide framed defender with soft hands for rec...


In [27]:
train = merged[~mask]
train['NHL'].value_counts()

1    493
0    425
Name: NHL, dtype: int64

# ==============NHL Corpus==============

In [1]:
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
import random

In [2]:
nhl = CategorizedPlaintextCorpusReader(root='../data/NHLcorpus/',fileids=r'.*\.txt',cat_pattern='(\w+)/*')

In [3]:
documents = [(list(nhl.words(fileid)),category)
             for category in nhl.categories()
             for fileid in nhl.fileids(category)]
random.shuffle(documents)

In [4]:
all_words = nltk.FreqDist(w.lower() for w in nhl.words())
all_words.most_common(1000)

[('.', 4755),
 ('and', 3597),
 ('the', 3332),
 (',', 2812),
 ('a', 2410),
 ('to', 2085),
 ('he', 1842),
 ('his', 1762),
 ('in', 1451),
 ('is', 1236),
 ('with', 1058),
 ('-', 1025),
 ('has', 939),
 ('of', 907),
 ('who', 888),
 ('on', 804),
 ('good', 780),
 ('but', 573),
 ('puck', 567),
 ('as', 561),
 ('strong', 455),
 ('an', 414),
 ('at', 410),
 ('for', 402),
 ('that', 401),
 ('game', 400),
 ('player', 391),
 ('him', 369),
 ('can', 364),
 ('be', 347),
 ('will', 334),
 ('plays', 331),
 ('well', 301),
 ('up', 299),
 ('offensive', 297),
 ('play', 295),
 ('all', 289),
 ('big', 279),
 ("'", 268),
 ('excellent', 257),
 ('shot', 248),
 ('very', 248),
 ('needs', 240),
 ('size', 235),
 ('more', 230),
 ('was', 229),
 ('get', 224),
 ('t', 219),
 ('speed', 219),
 ('team', 215),
 ('it', 215),
 ('not', 215),
 ('zone', 207),
 ('when', 205),
 ('ice', 201),
 ('skills', 201),
 ('quick', 198),
 ('work', 194),
 ('defensive', 190),
 ('this', 188),
 ('long', 186),
 ('feet', 182),
 ('one', 181),
 ('hands', 18

In [5]:
word_features = [w[0] for w in all_words.most_common(1000)]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = word in document_words
    return features

In [6]:
document_features(nhl.words(nhl.fileids('false')[86]))

{'contains(.)': True,
 'contains(and)': True,
 'contains(the)': True,
 'contains(,)': True,
 'contains(a)': True,
 'contains(to)': True,
 'contains(he)': True,
 'contains(his)': True,
 'contains(in)': True,
 'contains(is)': True,
 'contains(with)': False,
 'contains(-)': True,
 'contains(has)': True,
 'contains(of)': True,
 'contains(who)': True,
 'contains(on)': False,
 'contains(good)': False,
 'contains(but)': True,
 'contains(puck)': False,
 'contains(as)': False,
 'contains(strong)': False,
 'contains(an)': True,
 'contains(at)': True,
 'contains(for)': True,
 'contains(that)': False,
 'contains(game)': False,
 'contains(player)': True,
 'contains(him)': False,
 'contains(can)': True,
 'contains(be)': False,
 'contains(will)': False,
 'contains(plays)': True,
 'contains(well)': False,
 'contains(up)': False,
 'contains(offensive)': False,
 'contains(play)': False,
 'contains(all)': False,
 'contains(big)': False,
 "contains(')": False,
 'contains(excellent)': False,
 'contains(sho

In [7]:
random.shuffle(documents)
featuresets = [(document_features(d),c) for (d,c) in documents]
train_set,test_set = featuresets[818:],featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier,test_set)

0.54

In [8]:
classifier.show_most_informative_features(1000)

Most Informative Features
          contains(will) = True             true : false  =      5.8 : 1.0
         contains(front) = True             true : false  =      5.0 : 1.0
        contains(season) = True             true : false  =      5.0 : 1.0
      contains(attacker) = True             true : false  =      5.0 : 1.0
         contains(solid) = True             true : false  =      5.0 : 1.0
          contains(open) = True             true : false  =      4.3 : 1.0
         contains(tools) = True            false : true   =      4.3 : 1.0
       contains(himself) = True            false : true   =      4.3 : 1.0
          contains(have) = True             true : false  =      4.1 : 1.0
        contains(scorer) = True             true : false  =      3.8 : 1.0
         contains(seems) = True             true : false  =      3.7 : 1.0
         contains(doesn) = True             true : false  =      3.7 : 1.0
      contains(progress) = True            false : true   =      3.7 : 1.0

# ==========Filtering Words NHL==========

In [9]:
from nltk.corpus import stopwords
import pickle
import re

In [10]:
# Filter stop words
# Convert to lower case for case insensitive matching
# Remove punctuation
stop_words = set(stopwords.words('english'))

documents_filtered = [([re.sub(r'\W+','',w.lower()) for  w in nhl.words(fileid) if w.lower() not in stop_words],category)
                      for category in nhl.categories()
                      for fileid in nhl.fileids(category)]

In [11]:
words_filtered = nltk.FreqDist(re.sub(r'\W+','',w.lower()) for w in nhl.words() if w.lower() not in stop_words)
words_filtered.most_common(1000)

[('', 9279),
 ('good', 780),
 ('puck', 567),
 ('strong', 455),
 ('game', 400),
 ('player', 391),
 ('plays', 331),
 ('well', 301),
 ('offensive', 297),
 ('play', 295),
 ('big', 279),
 ('excellent', 257),
 ('shot', 248),
 ('needs', 240),
 ('size', 235),
 ('get', 224),
 ('speed', 219),
 ('team', 215),
 ('zone', 207),
 ('ice', 201),
 ('skills', 201),
 ('quick', 198),
 ('work', 194),
 ('defensive', 190),
 ('long', 186),
 ('feet', 182),
 ('one', 181),
 ('hands', 180),
 ('skating', 178),
 ('defender', 176),
 ('junior', 176),
 ('ability', 175),
 ('may', 171),
 ('high', 171),
 ('end', 164),
 ('displays', 162),
 ('great', 161),
 ('wing', 157),
 ('attack', 157),
 ('stick', 152),
 ('hard', 147),
 ('world', 147),
 ('prospect', 146),
 ('way', 139),
 ('top', 135),
 ('two', 133),
 ('solid', 131),
 ('point', 131),
 ('first', 130),
 ('time', 130),
 ('term', 129),
 ('vision', 128),
 ('upside', 128),
 ('strength', 121),
 ('power', 120),
 ('make', 119),
 ('wall', 119),
 ('man', 118),
 ('nhl', 117),
 ('year

In [12]:
word_features = [w[0] for w in words_filtered.most_common(1000)]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = word in document_words
    return features

In [13]:
random.shuffle(documents_filtered)
featuresets_filtered = [(document_features(d),c) for (d,c) in documents_filtered]
train_filtered,test_filtered = featuresets_filtered[818:],featuresets_filtered[:100]
classifier_filtered = nltk.NaiveBayesClassifier.train(train_filtered)
nltk.classify.accuracy(classifier_filtered,test_filtered)

0.58

In [14]:
classifier_filtered.show_most_informative_features(100)

Most Informative Features
           contains(get) = True             true : false  =      6.0 : 1.0
          contains(high) = True             true : false  =      5.2 : 1.0
         contains(shows) = True             true : false  =      5.2 : 1.0
          contains(like) = True             true : false  =      4.7 : 1.0
     contains(defenders) = True             true : false  =      4.7 : 1.0
        contains(upside) = True             true : false  =      4.3 : 1.0
          contains(much) = True             true : false  =      3.9 : 1.0
          contains(jump) = True             true : false  =      3.9 : 1.0
         contains(stick) = True             true : false  =      3.8 : 1.0
         contains(hands) = True             true : false  =      3.8 : 1.0
         contains(named) = True            false : true   =      3.8 : 1.0
       contains(package) = True            false : true   =      3.8 : 1.0
    contains(tremendous) = True            false : true   =      3.8 : 1.0

In [15]:
# Overall most important features
nb_filtered = nltk.NaiveBayesClassifier.train(featuresets_filtered)
nb_filtered.show_most_informative_features(1000)

Most Informative Features
       contains(juniors) = True             true : false  =      6.6 : 1.0
       contains(without) = True             true : false  =      6.0 : 1.0
       contains(dynamic) = True             true : false  =      6.0 : 1.0
      contains(dominant) = True             true : false  =      5.7 : 1.0
           contains(say) = True             true : false  =      5.5 : 1.0
        contains(reacts) = True             true : false  =      5.5 : 1.0
          contains(turn) = True             true : false  =      5.5 : 1.0
        contains(finish) = True             true : false  =      5.5 : 1.0
          contains(pace) = True             true : false  =      5.1 : 1.0
         contains(panic) = True             true : false  =      4.9 : 1.0
       contains(options) = True             true : false  =      4.9 : 1.0
         contains(rough) = True             true : false  =      4.9 : 1.0
    contains(defensemen) = True             true : false  =      4.3 : 1.0

In [16]:
nltk.classify.accuracy(nb_filtered,featuresets_filtered)

0.7145969498910676

In [48]:
save_classifier = open('..\classifiers\\nltk_nb_filter.pickle','wb')
pickle.dump(nb_filtered,save_classifier)
save_classifier.close()

# ==========Sklearn Classifiers==========

In [20]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.svm import LinearSVC

In [30]:
SVC_classifier = SklearnClassifier(LinearSVC())
random.shuffle(documents_filtered)
featuresets_filtered = [(document_features(d),c) for (d,c) in documents_filtered]
train_filtered,test_filtered = featuresets_filtered[818:],featuresets_filtered[:100]
SVC_classifier.train(train_filtered)
nltk.classify.accuracy(SVC_classifier,featuresets_filtered)

0.5904139433551199

In [31]:
MN_classifier = SklearnClassifier(MultinomialNB())
random.shuffle(documents_filtered)
featuresets_filtered = [(document_features(d),c) for (d,c) in documents_filtered]
train_filtered,test_filtered = featuresets_filtered[818:],featuresets_filtered[:100]
MN_classifier.train(train_filtered)
nltk.classify.accuracy(MN_classifier,featuresets_filtered)

0.6176470588235294

In [32]:
BN_classifier = SklearnClassifier(BernoulliNB())
random.shuffle(documents_filtered)
featuresets_filtered = [(document_features(d),c) for (d,c) in documents_filtered]
train_filtered,test_filtered = featuresets_filtered[818:],featuresets_filtered[:100]
BN_classifier.train(train_filtered)
nltk.classify.accuracy(BN_classifier,featuresets_filtered)

0.5958605664488017