In [59]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from __future__ import print_function
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

In [87]:
train = pd.read_csv('C:\Pedro\workspace\Acc-challenge\data.csv')
train = train.sample(frac=1).reset_index(drop=True)

timezone_mapping = dict(zip(sorted(train['Timezone'].unique()), range(0, len(sorted(train['Timezone'].unique())) + 1)))
fixture_mapping = dict(zip(sorted(train['FixtureDate'].unique()), range(0, len(sorted(train['FixtureDate'].unique())) + 1)))
hteam_mapping = dict(zip(sorted(train['HomeTeam'].unique()), range(0, len(sorted(train['HomeTeam'].unique())) + 1)))
ateam_mapping = dict(zip(sorted(train['AwayTeam'].unique()), range(0, len(sorted(train['AwayTeam'].unique())) + 1)))
team_mapping = dict(zip(sorted(train['Team'].unique()), range(0, len(sorted(train['Team'].unique())) + 1)))
atype_mapping = dict(zip(sorted(train['ActionTypeDesc'].unique()), range(0, len(sorted(train['ActionTypeDesc'].unique())) + 1)))
res_mapping = dict(zip(sorted(train['ActionResultDesc'].unique()), range(0, len(sorted(train['ActionResultDesc'].unique())) + 1)))
start_mapping = dict(zip(sorted(train['StartingPositionDesc'].unique()), range(0, len(sorted(train['StartingPositionDesc'].unique())) + 1)))
end_mapping = dict(zip(sorted(train['EndPositionDesc'].unique()), range(0, len(sorted(train['EndPositionDesc'].unique())) + 1)))

train['Timezone_Val'] = train['Timezone'].map(timezone_mapping).astype(int)
train['Fixture_Val'] = train['FixtureDate'].map(fixture_mapping).astype(int)
train['Hteam_Val'] = train['HomeTeam'].map(hteam_mapping).astype(int)
train['Ateam_Val'] = train['AwayTeam'].map(ateam_mapping).astype(int)
train['Team_Val'] = train['Team'].map(team_mapping).astype(int)
train['Atype_Val'] = train['ActionTypeDesc'].map(atype_mapping).astype(int)
train['Start_Val'] = train['StartingPositionDesc'].map(start_mapping).astype(int)
train['End_Val'] = train['EndPositionDesc'].map(end_mapping).astype(int)
train['Res_Val'] = train['ActionResultDesc'].map(res_mapping).astype(int)

train_test = train.drop(['LineOutAttackers','FixtureDate','StadiumName','KickOffTime_GMT','Timezone','KickOffTime_Local','HomeTeam',
                        'AwayTeam','Team','ActionTypeDesc','ActionResultDesc','StartingPositionDesc',
                        'EndPositionDesc','PlayDirection','City','Icon','Summary','ZoneDescription','CloudCover','Pressure'],
                        axis=1)

#train_features = train_test.values[:, 1:48]
#train_features = train_test.iloc[:,[10,11,17,12,35,34,6,8,5,9]].values
train_features = train_test.iloc[:,[10,11,15,34,6]].values
train_target = train_test['Res_Val'].values[:]

In [3]:
X_train, X_test, y_train, y_test = train_test_split(train_features, train_target, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, 
                        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

In [4]:
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'kernel': 'linear', 'C': 1}

Grid scores on development set:

0.383 (+/-0.004) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.535 (+/-0.403) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.383 (+/-0.004) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.467 (+/-0.159) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.383 (+/-0.004) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.467 (+/-0.159) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.383 (+/-0.004) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.467 (+/-0.159) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
1.000 (+/-0.000) for {'kernel': 'linear', 'C': 1}
1.000 (+/-0.000) for {'kernel': 'linear', 'C': 10}
1.000 (+/-0.000) for {'kernel': 'linear', 'C': 100}
1.000 (+/-0.000) for {'kernel': 'linear', 'C': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    r

In [88]:
np.random.seed(0)
X, y = train_features, train_target
indices = np.arange(y.shape[0])
np.random.shuffle(indices)
X, y = X[indices], y[indices]

In [6]:
param_range = np.logspace(-6, -1, 15)
train_scores, test_scores = validation_curve(SVC(), X, y, param_name="gamma", param_range=param_range,
                                             cv=10, scoring="accuracy", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

print("Train scores \n\n %s \n" % train_scores)
print("Valid scores \n\n %s" % test_scores)

plt.title("Validation Curve with SVM")
plt.xlabel("$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

Train scores 

 [[ 0.74789916  0.74789916  0.74789916  0.75490196  0.74825175  0.74825175
   0.74825175  0.74825175  0.74825175  0.74825175]
 [ 0.75210084  0.74789916  0.74929972  0.75490196  0.74825175  0.74825175
   0.74825175  0.75244755  0.74825175  0.75384615]
 [ 0.76610644  0.76890756  0.76470588  0.76330532  0.74965035  0.75664336
   0.75944056  0.76083916  0.75664336  0.76363636]
 [ 0.78291317  0.77731092  0.78291317  0.78851541  0.77762238  0.78321678
   0.78321678  0.77902098  0.77762238  0.78181818]
 [ 0.81092437  0.83333333  0.80952381  0.82773109  0.81258741  0.81818182
   0.81538462  0.82377622  0.81118881  0.81118881]
 [ 0.8767507   0.88795518  0.87955182  0.89355742  0.88811189  0.88111888
   0.88811189  0.8951049   0.88111888  0.88391608]
 [ 0.96078431  0.95518207  0.95238095  0.95938375  0.95524476  0.95664336
   0.95664336  0.96223776  0.96083916  0.95664336]
 [ 0.99019608  0.9929972   0.99159664  0.9929972   0.99020979  0.99020979
   0.99020979  0.99300699  0.988811

In [41]:
#Testing cv set with normalized data
scaler = StandardScaler()
scaler.fit(X)
X_normalized = scaler.transform(X)
#X_test = scaler.transform(X)  # apply same transformation to test data

param_range = np.logspace(-6, -1, 15)
train_scores, test_scores = validation_curve(SVC(C=1), X_normalized, y, param_name="gamma", param_range=param_range,
                                             cv=10, scoring="accuracy", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

print("Train scores \n\n %s \n" % train_scores)
print("Valid scores \n\n %s" % test_scores)

plt.title("Validation Curve with SVM")
plt.xlabel("$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

Train scores 

 [[ 0.74789916  0.74789916  0.74789916  0.74789916  0.74825175  0.74825175
   0.74825175  0.74825175  0.74825175  0.74825175]
 [ 0.74789916  0.74789916  0.74789916  0.74789916  0.74825175  0.74825175
   0.74825175  0.74825175  0.74825175  0.74825175]
 [ 0.74789916  0.74789916  0.74789916  0.74789916  0.74825175  0.74825175
   0.74825175  0.74825175  0.74825175  0.74825175]
 [ 0.74789916  0.74789916  0.74789916  0.74789916  0.74825175  0.74825175
   0.74825175  0.74825175  0.74825175  0.74825175]
 [ 0.74789916  0.74789916  0.74789916  0.74789916  0.74825175  0.74825175
   0.74825175  0.74825175  0.74825175  0.74825175]
 [ 0.74789916  0.74789916  0.74789916  0.74789916  0.74825175  0.74825175
   0.74825175  0.74825175  0.74825175  0.74825175]
 [ 0.74789916  0.74789916  0.74789916  0.74789916  0.74825175  0.74825175
   0.74825175  0.74825175  0.74825175  0.74825175]
 [ 0.74789916  0.74789916  0.74789916  0.74789916  0.74825175  0.74825175
   0.74825175  0.74825175  0.748251

In [23]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


title = "Learning Curves (Naive Bayes)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

estimator = SVC(C=1, kernel='linear')
plot_learning_curve(estimator, title, X_normalized, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

title = "Learning Curves (SVC, Linear kernel, $\gamma=0.0041$)"
# SVC is more expensive so we do a lower number of CV iterations:
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = SVC(C=1, kernel='linear')
plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)

plt.show()

In [89]:
test = pd.read_csv('C:\Pedro\workspace\Acc-challenge\\test_data.csv')
test = test.sample(frac=1).reset_index(drop=True)

t_timezone_mapping = dict(zip(sorted(test['Timezone'].unique()), range(0, len(sorted(test['Timezone'].unique())) + 1)))
t_fixture_mapping = dict(zip(sorted(test['FixtureDate'].unique()), range(0, len(sorted(test['FixtureDate'].unique())) + 1)))
t_hteam_mapping = dict(zip(sorted(test['HomeTeam'].unique()), range(0, len(sorted(test['HomeTeam'].unique())) + 1)))
t_ateam_mapping = dict(zip(sorted(test['AwayTeam'].unique()), range(0, len(sorted(test['AwayTeam'].unique())) + 1)))
t_team_mapping = dict(zip(sorted(test['Team'].unique()), range(0, len(sorted(test['Team'].unique())) + 1)))
t_atype_mapping = dict(zip(sorted(test['ActionTypeDesc'].unique()), range(0, len(sorted(test['ActionTypeDesc'].unique())) + 1)))
t_res_mapping = dict(zip(sorted(test['ActionResultDesc'].unique()), range(0, len(sorted(test['ActionResultDesc'].unique())) + 1)))
t_start_mapping = dict(zip(sorted(test['StartingPositionDesc'].unique()), range(0, len(sorted(test['StartingPositionDesc'].unique())) + 1)))
t_end_mapping = dict(zip(sorted(test['EndPositionDesc'].unique()), range(0, len(sorted(test['EndPositionDesc'].unique())) + 1)))

test['Timezone_Val'] = test['Timezone'].map(t_timezone_mapping).astype(int)
test['Fixture_Val'] = test['FixtureDate'].map(t_fixture_mapping).astype(int)
test['Hteam_Val'] = test['HomeTeam'].map(t_hteam_mapping).astype(int)
test['Ateam_Val'] = test['AwayTeam'].map(t_ateam_mapping).astype(int)
test['Team_Val'] = test['Team'].map(t_team_mapping).astype(int)
test['Atype_Val'] = test['ActionTypeDesc'].map(t_atype_mapping).astype(int)
test['Start_Val'] = test['StartingPositionDesc'].map(t_start_mapping).astype(int)
test['End_Val'] = test['EndPositionDesc'].map(t_end_mapping).astype(int)
test['Res_Val'] = test['ActionResultDesc'].map(t_res_mapping).astype(int)

test_set = test.drop(['FixtureDate','StadiumName','KickOffTime_GMT','Timezone','KickOffTime_Local','HomeTeam',
                        'AwayTeam','Team','ActionTypeDesc','ActionResultDesc','StartingPositionDesc','EndPositionDesc',
                        'PlayDirection','City','Icon','Summary','ZoneDescription','CloudCover','Pressure','Visibility'], axis=1)

#test_features = test_set.iloc[:,[10,11,17,12,35,34,6,8,5,9]].values
test_features = test_set.iloc[:,[10,11,15,34,6]].values
#test_features = test_set.values[:,1:48]
#X_test = scaler.transform(test_features)  # apply same transformation to test data

In [90]:
clf = RandomForestClassifier(max_features=2, n_estimators=100)
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [91]:
pred_y = clf.predict(test_features)
pred_y

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [92]:
result = pd.DataFrame({'RecordID': test_set['RecordID']})
result['ActionName'] = pred_y.T
result
result.to_excel('PedroCastanha.xlsx', index=False)