In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, average_precision_score, \
precision_recall_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Load the raw tables used throughout the notebook, one CSV per frame,
# in the same order as the names on the left-hand side.
(master_df, awards_df, batting_df,
 field_df, allstar_df, salary_df) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'core/master.csv',
        'core/AwardsPlayers.csv',
        'core/Batting.csv',
        'core/Fielding.csv',
        'core/AllStarFull.csv',
        'core/Salaries.csv',
    )
)

In [None]:
# Keep only the award rows whose awardID is exactly 'Gold Glove'.
is_gold_glove = awards_df['awardID'] == 'Gold Glove'
gg_df = awards_df[is_gold_glove]

In [None]:
# Quick look at the first rows of the fielding 'POS' column.
field_df.loc[:, ['POS']].head(n=5)

In [None]:
# Quick look at the first rows of the batting '2B' column.
batting_df.loc[:, ['2B']].head(n=5)

In [None]:
# Inner-join batting and fielding rows that share a player and season.
# Columns present in both frames get the default '_x' (batting) / '_y' (fielding) suffixes.
# NOTE(review): the join keys do not include stint or position, so a player with
# several batting/fielding rows in one season yields every pairing — confirm this is intended.
stats_df = batting_df.merge(field_df, on=['playerID', 'yearID'])

In [None]:
# Preview the merged batting + fielding frame.
stats_df.head(n=5)

In [None]:
# Inspect the season column of the merged frame.
stats_df.loc[:, 'yearID']

In [None]:
# Keep only seasons after 1956.
# NOTE(review): presumably chosen because the Gold Glove data starts in 1957 — confirm.
# .copy() detaches the result from the unfiltered frame so the column assignments
# made in later cells (salary, allstar, GG, previous) write to an independent frame
# instead of triggering SettingWithCopyWarning.
stats_df = stats_df[stats_df['yearID'] > 1956].copy()

In [None]:
# Preview the frame after the season filter.
stats_df.head(n=5)

In [None]:
# Preview the Gold Glove award rows.
gg_df.head(n=5)

In [None]:
# Preview the All-Star table.
allstar_df.head(n=5)

In [None]:
# Preview the salary table.
salary_df.head(n=5)

In [None]:
# Lookup table: (yearID, playerID) -> salary.
# When a pair occurs more than once, the last row in salary_df wins.
combi_salary = {
    (season, player_id): amount
    for season, player_id, amount in zip(
        salary_df['yearID'], salary_df['playerID'], salary_df['salary']
    )
}

In [None]:
def get_salary(row):
    """Return the salary stored for the row's (yearID, playerID) pair.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['yearID']`` and ``row['playerID']``
        (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    The value in ``combi_salary`` for that pair, or ``np.nan`` when the
    pair has no entry.

    Raises
    ------
    KeyError
        If ``row`` lacks 'yearID' or 'playerID'. The previous try/except
        swallowed this too and silently returned NaN for every row.
    """
    # dict.get restricts the NaN fallback to "pair not in the table";
    # a missing column now surfaces instead of being masked.
    return combi_salary.get((row['yearID'], row['playerID']), np.nan)

# Row-wise lookup: store get_salary's result for every row in a new 'salary' column.
stats_df['salary'] = stats_df.apply(get_salary, axis = 1)

In [None]:
# Spot-check ten random rows; the fixed seed keeps the displayed sample
# identical across Restart & Run All.
stats_df.sample(10, random_state=0)

In [None]:
# Lookup table: (yearID, playerID) -> 1 when the All-Star startingPos is 7, 8 or 9, else 0.
# NOTE(review): 7/8/9 are presumably the outfield positions — confirm against the data dictionary.
# When a pair occurs more than once, the last row in allstar_df wins.
combi_allstar = {
    (season, player_id): int(start_pos in (7, 8, 9))
    for season, player_id, start_pos in zip(
        allstar_df['yearID'], allstar_df['playerID'], allstar_df['startingPos']
    )
}

In [None]:
def get_allstar(row):
    """Return the All-Star flag stored for the row's (yearID, playerID) pair.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['yearID']`` and ``row['playerID']``.

    Returns
    -------
    The value in ``combi_allstar`` for that pair, or ``0`` when the pair
    has no entry.

    Raises
    ------
    KeyError
        If ``row`` lacks 'yearID' or 'playerID'. The previous try/except
        swallowed this too and silently returned 0 for every row.
    """
    # dict.get restricts the 0 fallback to "pair not in the table".
    return combi_allstar.get((row['yearID'], row['playerID']), 0)

In [None]:
# Row-wise lookup: store get_allstar's 0/1 result for every row in a new 'allstar' column.
stats_df['allstar'] = stats_df.apply(get_allstar, axis = 1)

In [None]:
# Lookup table: (yearID, playerID) -> Gold Glove position taken from the 'notes' column.
# 'LF', 'RF' and 'CF' are collapsed into the single label 'OF'; any other value is kept as is.
combi = {
    (season, player_id): ('OF' if award_note in ('LF', 'RF', 'CF') else award_note)
    for season, player_id, award_note in zip(
        gg_df['yearID'], gg_df['playerID'], gg_df['notes']
    )
}

In [None]:
def get_position(row):
    """Return the Gold Glove position stored for the row's (yearID, playerID) pair.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['yearID']`` and ``row['playerID']``.

    Returns
    -------
    The value in ``combi`` for that pair, or the string ``'false'`` when
    the pair has no entry.

    Raises
    ------
    KeyError
        If ``row`` lacks 'yearID' or 'playerID'. The previous try/except
        swallowed this too and silently returned 'false' for every row.
    """
    # dict.get restricts the 'false' fallback to "pair not in the table".
    return combi.get((row['yearID'], row['playerID']), 'false')


In [None]:
# Row-wise lookup: store get_position's result (a position label or 'false') in a new 'GG' column.
stats_df['GG'] = stats_df.apply(get_position, axis = 1)

In [None]:
def get_previous(row):
    """Count outfield awards in the unbroken run of award years before this season.

    Walks backwards from ``row['yearID'] - 1`` one year at a time for as
    long as ``combi`` holds a truthy entry for that (year, playerID) pair.
    Each entry equal to 'LF', 'RF', 'CF' or 'OF' adds one to the count;
    other entries keep the run going but are not counted. The walk stops
    at the first year without an entry.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['yearID']`` and ``row['playerID']``.

    Returns
    -------
    int
        Number of counted entries in that run (0 when the previous year
        has no entry).

    Raises
    ------
    KeyError
        If ``row`` lacks 'yearID' or 'playerID'. The previous try/except
        could only ever catch this case and masked it by returning 0.
    """
    # Read the row once; these values do not change inside the loop.
    year = row['yearID']
    player = row['playerID']

    num = 0
    i = 1
    while True:
        # Single lookup per step; a missing or falsy entry ends the run,
        # matching the original `while combi.get(..., False)` condition.
        pos = combi.get((year - i, player), False)
        if not pos:
            break
        if pos in ('LF', 'RF', 'CF', 'OF'):
            num += 1
        i += 1

    return num

In [None]:
# Row-wise lookup: store get_previous's count for every row in a new 'previous' column.
stats_df['previous'] = stats_df.apply(get_previous, axis = 1)


In [None]:
# Distribution of the 'previous' feature.
stats_df['previous'].value_counts()

In [None]:
# How many rows each fielding position contributes.
stats_df['POS'].value_counts()

In [None]:
# Spot-check 100 random rows of the '2B', '3B' and 'POS' columns; the fixed seed
# keeps the displayed sample identical across Restart & Run All.
stats_df[['2B', '3B', 'POS']].sample(100, random_state=0)

In [None]:
def _fill_missing(col):
    """Fill NaNs in one column: column mean for numeric dtypes, '.' otherwise."""
    # dtype.kind codes: b=bool, i=int, u=unsigned int, f=float, c=complex.
    if col.dtype.kind in 'biufc':
        return col.fillna(col.mean())
    return col.fillna('.')


# Column-wise imputation producing a frame without missing values.
stats_wona_df = stats_df.apply(_fill_missing)

In [None]:
# Preview the imputed frame.
stats_wona_df.head(n=5)

In [None]:
# List every column name in the imputed frame (as a NumPy array).
stats_wona_df.columns.values

In [None]:
# Preview the imputed frame once more before subsetting by position.
stats_wona_df.head(n=5)

In [None]:
# Restrict to rows fielded at 'OF'. .copy() makes this an independent frame so the
# target assignment below does not write into a slice of stats_wona_df
# (which raised SettingWithCopyWarning).
stats_OF_df = stats_wona_df[stats_wona_df['POS'] == 'OF'].copy()

# Binary target: 1 when the row's Gold Glove label is 'OF', 0 for anything else.
# One vectorised comparison replaces the two sequential .loc writes and yields a
# proper integer column instead of an object column holding mixed values.
stats_OF_df['GG'] = (stats_OF_df['GG'] == 'OF').astype(int)

In [None]:
# Only the last expression of a cell is rendered, so the head() call is
# evaluated but not shown; the column names are what gets displayed.
stats_OF_df.head(n=5)
stats_OF_df.columns.values

In [None]:
# Drop the league, team and player identifier columns before modelling.
id_columns = ['lgID_x', 'lgID_y', 'playerID', 'teamID_x', 'teamID_y']
stats_pre_OF_df = stats_OF_df.drop(id_columns, axis=1)

In [None]:
# Preview the modelling frame.
stats_pre_OF_df.head(n=5)

In [None]:
# Features: every remaining column except the target ('GG') and 'POS',
# which is constant ('OF') after the earlier filter.
X = stats_pre_OF_df.drop(['GG', 'POS'], axis=1)
y = list(stats_pre_OF_df['GG'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Baseline model. The tree is seeded so that the fitted model, the metrics below,
# and the feature importances read from `model` later are reproducible across runs.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)

# Held-out performance.
predicted = model.predict(X_test)
print(confusion_matrix(y_test, predicted))

# Training-set confusion matrix, printed for comparison with the test one.
predicted_train = model.predict(X_train)
print(confusion_matrix(y_train, predicted_train))

target_names = (['0','1'])
print(classification_report(y_test,
                            predicted,
                            target_names=target_names))
print(accuracy_score(y_test, predicted))

In [None]:
# Seeded so the grid search and the selected estimator are reproducible.
rf_model = RandomForestClassifier(random_state=0)

# 'auto' was removed from max_features in scikit-learn 1.3 and, for
# RandomForestClassifier, was only an alias of 'sqrt' (so the old grid tried
# the same setting twice). 'log2' is searched as a genuinely different option.
tuned_parameters = [{'n_estimators': [50,100,300], 'min_samples_leaf': [1, 5, 20],
                     'max_features': ['sqrt', 'log2']}]

# Cross-validated search over the grid; clf refits the best estimator on X_train.
clf = GridSearchCV(rf_model, tuned_parameters)
clf.fit(X_train, y_train)

# Held-out performance.
predicted = clf.predict(X_test)
print(confusion_matrix(y_test, predicted))

# Training-set confusion matrix, printed for comparison with the test one.
predicted_train = clf.predict(X_train)
print(confusion_matrix(y_train, predicted_train))

target_names = (['0','1'])
print(classification_report(y_test,
                            predicted,
                            target_names=target_names))
print(accuracy_score(y_test, predicted))
print(clf.best_estimator_)

In [None]:
# Seeded so the grid search and the selected estimator are reproducible.
gb_model = GradientBoostingClassifier(random_state=0)

# 'auto' was removed from max_features in scikit-learn 1.3; for
# GradientBoostingClassifier it meant sqrt(n_features), so 'sqrt' keeps
# the same search space on current versions.
tuned_parameters2 = [{'n_estimators': [50,100,300], 'min_samples_leaf': [1, 5, 20],
                     'max_features': ['sqrt', 'log2']}]

# Cross-validated search over the grid; clf2 refits the best estimator on X_train.
clf2 = GridSearchCV(gb_model, tuned_parameters2)
clf2.fit(X_train, y_train)

# Held-out performance.
predicted = clf2.predict(X_test)
print(confusion_matrix(y_test, predicted))

# Training-set confusion matrix, printed for comparison with the test one.
predicted_train = clf2.predict(X_train)
print(confusion_matrix(y_train, predicted_train))

target_names = (['0','1'])
print(classification_report(y_test,
                            predicted,
                            target_names=target_names))
print(accuracy_score(y_test, predicted))
print(clf2.best_estimator_)

In [None]:
# Class-probability matrix for the test set: one row per sample, one column per class.
y_pred_probs = clf2.predict_proba(X_test)
print('Some example probabilities:', y_pred_probs[:10])

# Locate the column that belongs to class 1 and pull it out as a plain list;
# the probabilities of class 0 are not needed downstream.
high_index = clf2.best_estimator_.classes_.tolist().index(1)
y_high_probs = list(y_pred_probs[:, high_index])

print("Just the '1' probabilities:", y_high_probs[:10])

In [None]:
# Precision/recall trade-off of the class-1 probabilities on the test set.
precisions, recalls, _ = precision_recall_curve(y_test, y_high_probs)
average_precision = average_precision_score(y_test, y_high_probs)
print('Average precision is', average_precision)

# Recall on the x-axis, precision on the y-axis.
plt.plot(recalls, precisions)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

In [None]:
# Ten most important features of the decision tree, highest importance first.
features_importances = sorted(
    zip(X.columns, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
features_importances[:10]

In [None]:
# Ten most important features of the best random forest, highest importance first.
features_importances = sorted(
    zip(X.columns, clf.best_estimator_.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
features_importances[:10]

In [None]:
# Ten most important features of the best gradient-boosting model, highest importance first.
features_importances = sorted(
    zip(X.columns, clf2.best_estimator_.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
features_importances[:10]

boxplots  

barplots  
features heatmap  
zone rating with year

how to capture (visualize) the features that distinguish winners from losers  
clustering algorithm  
threshold

In [None]:
sns.set()

# Scatter of 'InnOuts' against 'PO' with one fitted regression line per 'GG' class.
# `height` is the current name of the former `size` argument of lmplot.
g = sns.lmplot(x="PO", y="InnOuts", hue="GG",
               truncate=True, height=5, data=stats_pre_OF_df)

# Use more informative axis labels than are provided by default
g.set_axis_labels("PO", "InnOuts")

# seaborn no longer re-exports pyplot as `sns.plt`; use the notebook's own `plt` import.
plt.show()

correlation  
factorplot kind=count  
countplot

In [None]:
# Pairwise correlation matrix of the feature columns, kept for the heatmap cell.
corr_df = X.corr()

# Only the last expression of the cell is rendered: the first ten feature rows.
X.head(n=10)

In [None]:
sns.set()

# Draw the feature-correlation heatmap and save it next to the notebook.
my_plot = sns.heatmap(corr_df)
my_plot.get_figure().savefig('heatmap.png')

# seaborn no longer re-exports pyplot as `sns.plt`; use the notebook's own `plt` import.
plt.show()

In [None]:
# Count of rows per 'previous' value, split by the 'GG' class.
# `factorplot` was renamed to `catplot` in seaborn 0.9 and later removed.
g = sns.catplot(x='previous', hue="GG",
                data=stats_pre_OF_df,
                kind="count")

# seaborn no longer re-exports pyplot as `sns.plt`; use the notebook's own `plt` import.
plt.show()