In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from __future__ import print_function
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

In [28]:
# Load the labelled training data. The path is a raw string so that the
# backslashes of the Windows path are never interpreted as escape sequences.
train = pd.read_csv(r'C:\Pedro\workspace\Acc-challenge\data_v2.csv')


def _category_codes(column):
    """Return a dict mapping each distinct value of `column` to an integer code.

    Codes are the 0-based positions of the values in sorted order.
    """
    return {value: code for code, value in enumerate(sorted(column.unique()))}


# Value -> integer code for every categorical column.
pname_mapping = _category_codes(train['PlayerName'])
aname_mapping = _category_codes(train['ActionName'])
spos_mapping = _category_codes(train['StartingPositionDesc'])
epos_mapping = _category_codes(train['EndPositionDesc'])
summary_mapping = _category_codes(train['Summary'])

# Append the encoded "<column>_Val" columns. The order of this tuple fixes the
# order in which the new columns are added to the frame.
for source_col, mapping in (('PlayerName', pname_mapping),
                            ('ActionName', aname_mapping),
                            ('StartingPositionDesc', spos_mapping),
                            ('EndPositionDesc', epos_mapping),
                            ('Summary', summary_mapping)):
    train[source_col + '_Val'] = train[source_col].map(mapping).astype(int)

# Shuffle the rows once, reproducibly.
np.random.seed(0)
train = train.reindex(np.random.permutation(train.index))

# Target: the encoded action name, in the shuffled row order.
X_target = train['ActionName_Val'].values

# Features: everything except the target, the raw text columns, the record id
# and the excluded Summary_Val / EndPositionDesc_Val / End_Y columns.
X_train = train.drop(['ActionName_Val','ActionName','PlayerName','StartingPositionDesc',
                      'EndPositionDesc','Summary','RecordID','Summary_Val','EndPositionDesc_Val','End_Y'], axis=1)

In [22]:
# Hold out 30% of the rows for evaluation; the remaining 70% is the
# development set used by the grid search.
# NOTE(review): this rebinds `X_train` to the development split while
# `X_target` keeps its full length, so later cells that need the complete
# feature frame should not rely on `X_train` after this point.
split_parts = train_test_split(X_train, X_target, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Hyper-parameter grid explored by cross-validation, and the metrics to tune for.
tuned_parameters = [dict(max_features=[2, 4, 8, 10, 12, 13],
                         n_estimators=[1, 10, 100, 1000])]
scores = ['precision', 'recall']

In [23]:
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold grid search over `tuned_parameters`, ranked by the macro-averaged
    # variant of the current metric.
    clf = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=tuned_parameters,
        cv=5,
        scoring='%s_macro' % score,
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # One line per candidate: mean CV score, twice its standard deviation,
    # and the parameter combination.
    cv_table = clf.cv_results_
    means = cv_table['mean_test_score']
    stds = cv_table['std_test_score']
    for row, params in enumerate(cv_table['params']):
        print("%0.3f (+/-%0.03f) for %r" % (means[row], stds[row] * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Score the search's predictions on the held-out evaluation rows.
    y_pred = clf.predict(X_test)
    y_true = y_test
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'max_features': 8, 'n_estimators': 10}

Grid scores on development set:

0.585 (+/-0.032) for {'max_features': 2, 'n_estimators': 1}
0.569 (+/-0.101) for {'max_features': 2, 'n_estimators': 10}
0.633 (+/-0.048) for {'max_features': 2, 'n_estimators': 100}
0.626 (+/-0.068) for {'max_features': 2, 'n_estimators': 1000}
0.561 (+/-0.042) for {'max_features': 4, 'n_estimators': 1}
0.593 (+/-0.051) for {'max_features': 4, 'n_estimators': 10}
0.610 (+/-0.086) for {'max_features': 4, 'n_estimators': 100}
0.623 (+/-0.078) for {'max_features': 4, 'n_estimators': 1000}
0.559 (+/-0.122) for {'max_features': 8, 'n_estimators': 1}
0.649 (+/-0.113) for {'max_features': 8, 'n_estimators': 10}
0.605 (+/-0.112) for {'max_features': 8, 'n_estimators': 100}
0.610 (+/-0.081) for {'max_features': 8, 'n_estimators': 1000}
0.576 (+/-0.077) for {'max_features': 10, 'n_estimators': 1}
0.589 (+/-0.122) for {'max_features': 1

In [12]:
# Build a reproducibly shuffled (X, y) pair; the feature-importance cell fits
# its forest on these.
np.random.seed(0)

# Take the feature columns for *every* labelled row as a plain array:
#  * `X_train` is a DataFrame, so `X_train[indices]` would look the integers up
#    as column labels instead of selecting rows;
#  * `X_train` may have been rebound to the 70% split by the hold-out cell and
#    would then be shorter than `X_target`.
X = train[X_train.columns].values
y = X_target

indices = np.arange(y.shape[0])
np.random.shuffle(indices)
X, y = X[indices], y[indices]

In [30]:
# Validation curve: accuracy of a 10-tree random forest as `max_features` varies.
param_range = [1, 2, 4, 8, 12, 13]

# Score against the complete training frame. `X_train` may have been rebound to
# the 70% split by the hold-out cell, which no longer lines up with the
# full-length `X_target`; selecting its columns from `train` yields the same
# features for every labelled row.
features_all = train[X_train.columns]
train_scores, test_scores = validation_curve(
    RandomForestClassifier(n_estimators=10), features_all, X_target,
    param_name="max_features", param_range=param_range,
    cv=5, scoring="accuracy", n_jobs=1)

# Rows are parameter values, columns are CV folds: aggregate across folds.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

print("Train scores \n\n %s \n" % train_scores)
print("Valid scores \n\n %s" % test_scores)

# Title and axis label now describe what is actually plotted (a random forest
# swept over max_features).
plt.title("Validation Curve with Random Forest")
plt.xlabel("max_features")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

Train scores 

 [[ 0.98740157  0.98740157  0.99212598  0.98582677  0.99056604]
 [ 0.98740157  0.98267717  0.99212598  0.99212598  0.99371069]
 [ 0.98267717  0.98897638  0.99527559  0.98425197  0.99213836]
 [ 0.98897638  0.99055118  0.98267717  0.98267717  0.98584906]
 [ 0.98110236  0.99212598  0.99370079  0.98267717  0.99056604]
 [ 0.98110236  0.98267717  0.98582677  0.99212598  0.98584906]] 

Valid scores 

 [[ 0.77987421  0.6918239   0.69811321  0.64779874  0.70253165]
 [ 0.70440252  0.71069182  0.59119497  0.72955975  0.74050633]
 [ 0.77358491  0.72955975  0.69811321  0.69811321  0.71518987]
 [ 0.70440252  0.71698113  0.71069182  0.66037736  0.75316456]
 [ 0.79245283  0.71069182  0.67295597  0.68553459  0.75316456]
 [ 0.74842767  0.69811321  0.67295597  0.71698113  0.73417722]]


In [26]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw training and cross-validation learning curves on a new figure.

    The scores come from ``learning_curve``; for each training-set size the
    mean over the cross-validation splits is plotted as a line and one
    standard deviation either side of it as a shaded band.

    Parameters
    ----------
    estimator : object
        Model handed to ``learning_curve``.

    title : string
        Title for the chart.

    X : array-like
        Feature data handed to ``learning_curve``.

    y : array-like
        Target data handed to ``learning_curve``.

    ylim : tuple (ymin, ymax), optional
        Limits of the y axis. When None the limits are left untouched.

    cv : optional
        Cross-validation specification, forwarded unchanged to
        ``learning_curve``.

    n_jobs : integer, optional
        Forwarded unchanged to ``learning_curve`` (default 1).

    train_sizes : array-like, optional
        Forwarded unchanged to ``learning_curve``
        (default ``np.linspace(.1, 1.0, 5)``).

    Returns
    -------
    The ``plt`` module, so the caller can go on to show or save the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One (mean, std, colour, legend label) tuple per curve; the statistics
    # are taken across the CV splits (axis 1).
    curves = [
        (np.mean(split_scores, axis=1), np.std(split_scores, axis=1), colour, caption)
        for split_scores, colour, caption in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]
    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, caption in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=caption)

    plt.legend(loc="best")
    return plt


# Learning curves for the random forest (10 trees, max_features=10). Each
# ShuffleSplit repetition holds out a random 20% of the rows for validation.
# The curve is drawn twice: with 100 repetitions for smoother mean/std
# estimates, and with 10 repetitions as a cheaper run.

# Use the complete training frame: `X_train` may have been rebound to the 70%
# split by the hold-out cell and would then not line up with `X_target`.
learning_features = train[X_train.columns]

for n_splits in (100, 10):
    title = "Learning Curves (RFC feat = 10, est=10, %d shuffle splits)" % n_splits
    cv = ShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=0)
    estimator = RandomForestClassifier(n_estimators=10, max_features=10)
    plot_learning_curve(estimator, title, learning_features, X_target,
                        ylim=(0.7, 1.01), cv=cv, n_jobs=4)

plt.show()

In [31]:
# Load the unlabelled test data. Raw string: the backslashes of the Windows
# path are kept literally (no need to escape the one before "test_data").
test = pd.read_csv(r'C:\Pedro\workspace\Acc-challenge\test_data_v2.csv')


def _encode_with_training_codes(column, training_codes):
    """Encode `column` with an existing value -> integer code mapping.

    Values that have no entry in `training_codes` are encoded as -1.
    """
    return column.map(training_codes).fillna(-1).astype(int)


# Reuse the mappings built from the training data. Rebuilding them from the
# test set's own sorted unique values would give the same value a different
# code whenever the two files do not contain exactly the same set of values,
# and the model would then read the encoded columns incorrectly.
test['PlayerName_Val'] = _encode_with_training_codes(test['PlayerName'], pname_mapping)
test['StartingPositionDesc_Val'] = _encode_with_training_codes(test['StartingPositionDesc'], spos_mapping)
test['EndPositionDesc_Val'] = _encode_with_training_codes(test['EndPositionDesc'], epos_mapping)
test['Summary_Val'] = _encode_with_training_codes(test['Summary'], summary_mapping)

# Drop the raw text columns and the same excluded columns as for training;
# RecordID is kept for now so predictions can be matched to records.
Test_predict = test.drop(['PlayerName','StartingPositionDesc','EndPositionDesc','Summary',
                             'Summary_Val','EndPositionDesc_Val','End_Y'], axis=1)

# Reproducible shuffle of the test rows; RecordID travels with each row.
np.random.seed(0)
Test_sorted = Test_predict.reindex(np.random.permutation(Test_predict.index))
X_test = Test_sorted.drop(['RecordID'], axis=1).values

In [33]:
# Final model, fitted on every labelled row. The features are re-selected from
# `train` because `X_train` may have been rebound to the 70% split by the
# hold-out cell, which would not match the full-length `X_target`.
clf = RandomForestClassifier(max_features=10, n_estimators=10)
clf.fit(train[X_train.columns], X_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [34]:
# Predict the encoded action class for each row of the test feature matrix.
pred_y = clf.predict(X_test)
# Bare expression: displayed as the cell's output.
pred_y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1])

In [35]:
# Pair each (shuffled) RecordID with its prediction and export the table.
# The predictions are assigned positionally, in the row order of Test_sorted.
result = Test_sorted[['RecordID']].assign(ActionName=pred_y.T)
result.to_excel('PedroCastanha_v5.xlsx', index=False)

In [4]:
# Rank the features by importance using an extra-trees forest fitted on (X, y).
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(X, y)

importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Take the feature count from the fitted forest itself: the forest was trained
# on `X`, which need not have the same number of columns as `X_train`.
n_features = importances.shape[0]

# Print the feature ranking
print("Feature ranking:")

for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(n_features), indices)
plt.xlim([-1, n_features])
plt.show()

Feature ranking:
1. feature 1 (0.126497)
2. feature 0 (0.121862)
3. feature 13 (0.081340)
4. feature 12 (0.073890)
5. feature 8 (0.070335)
6. feature 10 (0.068315)
7. feature 6 (0.065590)
8. feature 9 (0.060387)
9. feature 7 (0.057086)
10. feature 4 (0.045432)
11. feature 11 (0.043139)
12. feature 5 (0.042171)
13. feature 2 (0.040435)
14. feature 15 (0.038373)
15. feature 3 (0.037797)
16. feature 14 (0.027351)
