In [19]:
from sklearn import datasets
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.cross_validation import train_test_split
import pandas as pd

#iris = datasets.load_iris()
#X, y = iris.data[:, 1:3], iris.target
df = pd.read_csv("Matchup_KP.csv")
features = ['FieldGoalAvg', 'ThreePointAvg', 'FreeThrowAvg', 'ReboundAvg', 'AssistAvg', 'TurnOverAvg', 
            'StealAvg', 'BlockAvg', 'PersonalFoulAvg', 'Pyth', 'Rank', 'AdjustO', 'AdjustO Rank', 'AdjustD', 
            'AdjustD Rank', 'AdjustT', 'AdjustT Rank', 'Luck', 'Luck Rank']
features2 = ['Pyth', 'Rank', 'AdjustO', 'AdjustO Rank', 'AdjustD', 
            'AdjustD Rank', 'AdjustT', 'AdjustT Rank', 'Luck', 'Luck Rank']

Submission = pd.read_csv("2016Submission.csv")
submission_test = Submission[features2]

X = df[features2]
y = df.WinLoss

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
     scores = cross_validation.cross_val_score(clf, X, y, cv=5, scoring='accuracy')
     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

#Accuracy: 0.90 (+/- 0.05) [Logistic Regression]
#Accuracy: 0.93 (+/- 0.05) [Random Forest]
#Accuracy: 0.91 (+/- 0.04) [naive Bayes]
#Accuracy: 0.95 (+/- 0.05) [Ensemble]

Accuracy: 0.79 (+/- 0.03) [Logistic Regression]
Accuracy: 0.75 (+/- 0.04) [Random Forest]
Accuracy: 0.79 (+/- 0.03) [naive Bayes]
Accuracy: 0.79 (+/- 0.03) [Ensemble]


# Logistic Regression

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
clf1.fit(X_train, y_train)
clf1.score(X_test, y_test)

0.78842676311030746

In [21]:
predictions = clf1.predict_proba(submission_test)
Submission.Pred = predictions[:,1]
LogisticRegressionSubmission = Submission[['Id','Pred']].sort_values(by = 'Id')
LogisticRegressionSubmission.to_csv("LogisticRegressionSubmission2.csv",index = False)

# Random Forest

In [22]:
clf2.fit(X_train, y_train)
clf2.score(X_test, y_test)

0.72694394213381552

In [23]:
predictions = clf2.predict_proba(submission_test)
Submission.Pred = predictions[:,1]
RandomForestSubmission = Submission[['Id','Pred']].sort_values(by = 'Id')
RandomForestSubmission.to_csv("RandomForestSubmission2.csv",index = False)

# Naive Bayes

In [24]:
clf3.fit(X_train, y_train)
clf3.score(X_test, y_test)

0.75587703435804698

In [25]:
predictions = clf3.predict_proba(submission_test)
Submission.Pred = predictions[:,1]
NaiveBayesSubmission = Submission[['Id','Pred']].sort_values(by = 'Id')
NaiveBayesSubmission.to_csv("NaiveBayesSubmission2.csv",index = False)

# Ensemble

In [26]:
eclf.fit(X_train, y_train)
eclf.score(X_test, y_test)

0.759493670886076

In [27]:
predictions = clf3.predict_proba(submission_test)
Submission.Pred = predictions[:,1]
EnsembleSubmission = Submission[['Id','Pred']].sort_values(by = 'Id')
EnsembleSubmission.to_csv("EnsembleSubmission2.csv",index = False)

# Using All Features
#### Logistic Regression Place = 258
#### Random Forest Place = 567
#### Naive Bayes Place = 557
#### Ensemble = 557

# With Only Ken Pom:
#### Logistic Regression = 133
#### Random Forest = 584
#### Naive Bayes = 489
#### Ensemble = 489