In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
from copy import deepcopy
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss,accuracy_score
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as  plt
from sklearn import tree
import plotly
import plotly.offline as pyoff
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
%matplotlib inline

MENS = 'mens'
WOMENS = 'womens'

In [2]:
init_notebook_mode(connected=True)

In [3]:
data = pd.read_csv("train-1542197608821.csv")
test_data = pd.read_csv("test-1542197608821.csv")

In [None]:
data.describe()

In [None]:
test_data.describe()

###### Data Types of the column

In [None]:
data.dtypes

In [None]:
test_data.dtypes

 * There are 0 NA's in the column

In [None]:
#data.info()
data.isnull().sum()

In [None]:
test_data.isnull().sum()

In [None]:
data.head()

In [None]:
test_data.head()

#outcome column not present in test_data

In [4]:
# Columns in Data With Categorical Values- Must LabelEncode them
categorical_cols = ['hitpoint', 'outside.sideline', 'outside.baseline', 'same.side', 
                    'previous.hitpoint', 'server.is.impact.player','outcome','gender']
print(categorical_cols)

['hitpoint', 'outside.sideline', 'outside.baseline', 'same.side', 'previous.hitpoint', 'server.is.impact.player', 'outcome', 'gender']


In [5]:
# Columns in the Data That Should Be Scaled
scaled_data = ['serve','rally', 'speed', 'net.clearance', 'distance.from.sideline', 'depth', 'player.distance.travelled', 
               'player.impact.depth', 'player.impact.distance.from.center', 'player.depth', 
               'player.distance.from.center', 'previous.speed', 'previous.net.clearance', 
               'previous.distance.from.sideline', 'previous.depth', 'opponent.depth', 
               'opponent.distance.from.center', 'previous.time.to.net']
print(scaled_data)

['serve', 'rally', 'speed', 'net.clearance', 'distance.from.sideline', 'depth', 'player.distance.travelled', 'player.impact.depth', 'player.impact.distance.from.center', 'player.depth', 'player.distance.from.center', 'previous.speed', 'previous.net.clearance', 'previous.distance.from.sideline', 'previous.depth', 'opponent.depth', 'opponent.distance.from.center', 'previous.time.to.net']


#### Implementing One hot Encoder on tain and test data

In [6]:
def encode(train, test):
    # Retain All LabelEncoder as a dictionary
    d = defaultdict(LabelEncoder)
    print("\n D:",d)

    # Encode all the columns
    train[categorical_cols] = train[categorical_cols].apply(lambda x: d[x.name].fit_transform(x))
    print("\ntrain[categorical_cols]:\n",train[categorical_cols])
#     test[categorical_cols] = train[categorical_cols].apply(lambda x: d[x.name].fit_transform(x))
    test_ids = test['ID']
    # Inverse the encoding
    # data.apply(lambda x: d[x.name].inverse_transform(x))
    
    # Using dictionary d to label future data
    temp = deepcopy(categorical_cols)
    print("\ntemp_Outcome:",temp)
    temp.remove('outcome')
    e = deepcopy(d)
    del e['outcome']
    print("\n E:",e)
#     for key in e.keys():
#         print(key)
#         print(e[key].classes_)
#     print(temp)
    
    test[temp] = test[temp].apply(lambda x: e[x.name].transform(x))
    print("\n test[temp]:",test[temp])
    # #     print(d['hitpoint'].classes_)
    train = train.drop(['ID'], axis=1)
    test = test.drop(['ID'], axis=1)
    
    #print("\n train:{}, test:{}, test_ids:{}, d:{}", train, test, test_ids, d)
    return train, test, test_ids, d

#outcome = data['outcome']
#print("\nOutcome:",outcome)
data, test_data, test_ids , d = encode(data, test_data)


 D: defaultdict(<class 'sklearn.preprocessing.label.LabelEncoder'>, {})

train[categorical_cols]:
       hitpoint  outside.sideline  outside.baseline  same.side  \
0            0                 0                 0          1   
1            0                 0                 1          0   
2            0                 0                 0          0   
3            1                 1                 0          1   
4            0                 0                 0          0   
5            0                 1                 0          1   
6            0                 0                 1          0   
7            1                 0                 0          0   
8            1                 0                 1          0   
9            1                 0                 0          1   
10           1                 1                 1          0   
11           0                 0                 1          0   
12           0                 1                 0     

In [7]:
np.random.seed(0)
#X_train,X_val,y_train,y_val = train_test_split(data test_size=0.3)
#print(X_train.shape, X_val.shape,y_train.shape,y_val.shape)
train_X, val_X = train_test_split(data, test_size=0.2)
X_train = train_X.loc[:, train_X.columns != 'outcome']
y_train = train_X['outcome']
X_val = val_X.loc[:, val_X.columns != 'outcome']
y_val = val_X['outcome']

In [73]:
classifiers = [
    KNeighborsClassifier(7),
    SVC(kernel="poly", C=0.025, probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_val)
    acc = accuracy_score(y_val, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_val)
    ll = log_loss(y_val, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

KNeighborsClassifier
****Results****
Accuracy: 69.8042%
Log Loss: 2.0148601913129367
SVC
****Results****
Accuracy: 84.2982%
Log Loss: 0.443930107407962
DecisionTreeClassifier
****Results****
Accuracy: 80.1749%
Log Loss: 6.847337594326316
RandomForestClassifier
****Results****
Accuracy: 83.9234%
Log Loss: 0.9162891174631701
AdaBoostClassifier
****Results****
Accuracy: 83.4236%
Log Loss: 1.0481995504516184
GradientBoostingClassifier
****Results****
Accuracy: 86.8805%
Log Loss: 0.344588862558353
GaussianNB
****Results****
Accuracy: 74.5106%
Log Loss: 1.3165871232414625


##### Gradient Boosting 

In [8]:
gb_model_m = GradientBoostingClassifier(n_jobs=4,n_estimators=150, learning_rate=0.1, max_depth=4, min_samples_leaf=6)  #with n_estimators set to default Accuracy: 87.6952% Log Loss: 0.33349268494880946
#Grid search result (learning_rate = 0.1, max_depth= 6, min_samples_leaf= 20, n_estimators= 200)
gb_model_m.fit(X_train, y_train.ravel())
val_prob_prediction = gb_model_m.predict_proba(X_val)
val_prediction = gb_model_m.predict(X_val)
acc = accuracy_score(y_val, val_prediction)
print("Accuracy: {:.4%}".format(acc))
ll = log_loss(y_val, val_prob_prediction)
print("Log Loss: {}".format(ll))

Accuracy: 87.4453%
Log Loss: 0.3558452983366544


##### SVM Model 

In [190]:
svc_model = SVC(kernel="poly", C=0.01, probability=True)
svc_model.fit(X_train,y_train)
svc_prob_prediction = svc_model.predict_proba(X_val)
svc_val_prediction = svc_model.predict(X_val)
svc_acc = accuracy_score(y_val, svc_val_prediction)
print("Accuracy: {:.4%}".format(svc_acc))
svc_loss = log_loss(y_val, svc_prob_prediction)
print("Log Loss: {}".format(svc_loss))

Accuracy: 84.6731%
Log Loss: 0.4284354495592927


##### XGBOOST 

In [116]:
xgbc_model = XGBClassifier(n_estimators=250, learning_rate=0.1, max_depth=7, min_samples_leaf=20)
xgbc_model.fit(X_train, y_train)
xgbc_prob_pred = xgbc_model.predict_proba(X_val)
xgbc_pred = xgbc_model.predict(X_val)
xgbc_loss = log_loss(y_val, xgbc_prob_pred)
xgbc_acc = accuracy_score(y_val, xgbc_pred)
print("Accuracy: {:.4%}".format(xgbc_acc))
print("Log Loss: {}".format(xgbc_loss))

Accuracy: 87.5885%
Log Loss: 0.3738272755654149



The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.



In [176]:
rfclf_model = RandomForestClassifier(n_jobs=-1,n_estimators=500, random_state=0) 
rfclf_model.fit(X_train, y_train)
rfclf_prob_pred = rfclf_model.predict_proba(X_val)
rfclf_pred = rfclf_model.predict(X_val)
rfclf_loss = log_loss(y_val, rfclf_prob_pred)
rfclf_acc = accuracy_score(y_val, rfclf_pred)
print("Accuracy: {:.4%}".format(rfclf_acc))
print("Log Loss: {}".format(rfclf_loss))

Accuracy: 86.7139%
Log Loss: 0.3778797520867165


* Random Forrest test data predictions 

In [179]:
rfclf_prob_prediction = rfclf_model.predict_proba(test_data)
rfclf_val_prediction = rfclf_model.predict(test_data)
print(rfclf_val_prediction)

[2 1 0 ... 0 1 2]


In [180]:
print(rfclf_val_prediction.size)

1999


In [181]:
rfclf_test_val = rfclf_val_prediction
outcome = []
#test_val_prediction = test_val_prediction.astype(np.char)
for i in range(0,len(rfclf_val_prediction)):
    if rfclf_val_prediction[i] == 0:
        outcome.append('FE')
    elif rfclf_val_prediction[i] == 1:
        outcome.append('UE')
    else :
        outcome.append('W')
print(outcome)

['W', 'UE', 'FE', 'W', 'UE', 'FE', 'UE', 'FE', 'W', 'UE', 'UE', 'W', 'W', 'W', 'UE', 'UE', 'UE', 'W', 'UE', 'FE', 'W', 'UE', 'UE', 'W', 'FE', 'W', 'W', 'W', 'UE', 'W', 'W', 'W', 'W', 'FE', 'UE', 'FE', 'UE', 'W', 'W', 'W', 'FE', 'FE', 'UE', 'UE', 'FE', 'W', 'W', 'W', 'W', 'W', 'UE', 'UE', 'W', 'UE', 'W', 'W', 'UE', 'UE', 'UE', 'W', 'W', 'W', 'W', 'W', 'W', 'FE', 'W', 'UE', 'W', 'W', 'W', 'FE', 'FE', 'UE', 'FE', 'UE', 'UE', 'FE', 'W', 'W', 'UE', 'W', 'UE', 'W', 'UE', 'FE', 'W', 'W', 'UE', 'FE', 'W', 'FE', 'FE', 'UE', 'UE', 'W', 'UE', 'W', 'W', 'UE', 'UE', 'UE', 'UE', 'W', 'FE', 'FE', 'FE', 'UE', 'FE', 'UE', 'W', 'FE', 'W', 'UE', 'W', 'UE', 'W', 'UE', 'W', 'FE', 'FE', 'UE', 'W', 'W', 'FE', 'W', 'UE', 'W', 'W', 'W', 'W', 'UE', 'UE', 'W', 'FE', 'W', 'UE', 'UE', 'W', 'W', 'W', 'FE', 'UE', 'FE', 'W', 'W', 'FE', 'FE', 'UE', 'W', 'UE', 'UE', 'UE', 'UE', 'FE', 'UE', 'W', 'UE', 'UE', 'UE', 'W', 'W', 'UE', 'UE', 'UE', 'FE', 'UE', 'FE', 'UE', 'FE', 'FE', 'FE', 'FE', 'FE', 'W', 'UE', 'W', 'W', 'W', 

In [144]:
test_prob_prediction = gb_model_m.predict_proba(test_data)
test_val_prediction = gb_model_m.predict(test_data)
print(test_val_prediction)

[2 1 0 ... 1 1 2]


In [32]:
print(test_ids)

0       1921
1        486
2       5177
3       4632
4       2735
5       6843
6       7756
7       3053
8       9605
9       5662
10       930
11       313
12      6070
13      6207
14      7468
15      7814
16      1039
17      8476
18      5931
19      5280
20      4575
21      6082
22      6234
23      3410
24      6918
25      5034
26      3379
27      2425
28      1240
29      8575
        ... 
1969    5668
1970     885
1971    2821
1972    3853
1973    8946
1974    4293
1975    8786
1976    9204
1977    5174
1978    9580
1979    5073
1980    1638
1981    5074
1982    8677
1983    3248
1984    1650
1985    9945
1986    4711
1987    6561
1988    3945
1989    4017
1990    8969
1991     681
1992    2023
1993    9321
1994    9081
1995    1281
1996    1169
1997    2374
1998     311
Name: ID, Length: 1999, dtype: int64


In [145]:
test_val = test_val_prediction
outcome = []
#test_val_prediction = test_val_prediction.astype(np.char)
for i in range(0,len(test_val_prediction)):
    if test_val_prediction[i] == 0:
        outcome.append('FE')
    elif test_val_prediction[i] == 1:
        outcome.append('UE')
    else :
        outcome.append('W')
print(outcome)

['W', 'UE', 'FE', 'W', 'UE', 'FE', 'UE', 'FE', 'W', 'UE', 'UE', 'W', 'W', 'W', 'UE', 'UE', 'UE', 'W', 'UE', 'FE', 'W', 'UE', 'UE', 'W', 'FE', 'W', 'W', 'W', 'UE', 'W', 'W', 'W', 'W', 'FE', 'UE', 'FE', 'UE', 'W', 'W', 'W', 'FE', 'FE', 'UE', 'UE', 'FE', 'W', 'W', 'W', 'W', 'W', 'UE', 'UE', 'W', 'UE', 'W', 'W', 'UE', 'UE', 'UE', 'W', 'W', 'W', 'W', 'W', 'W', 'FE', 'W', 'UE', 'W', 'W', 'W', 'FE', 'FE', 'UE', 'FE', 'UE', 'UE', 'FE', 'W', 'W', 'UE', 'W', 'UE', 'W', 'UE', 'FE', 'W', 'W', 'UE', 'FE', 'W', 'FE', 'FE', 'UE', 'UE', 'W', 'UE', 'W', 'W', 'UE', 'UE', 'UE', 'UE', 'W', 'FE', 'FE', 'FE', 'UE', 'FE', 'UE', 'W', 'FE', 'W', 'UE', 'W', 'UE', 'W', 'UE', 'W', 'FE', 'FE', 'UE', 'W', 'W', 'FE', 'W', 'UE', 'W', 'W', 'W', 'W', 'UE', 'UE', 'W', 'FE', 'W', 'UE', 'UE', 'W', 'W', 'W', 'FE', 'UE', 'FE', 'W', 'W', 'FE', 'FE', 'UE', 'W', 'UE', 'UE', 'UE', 'UE', 'FE', 'UE', 'W', 'UE', 'UE', 'UE', 'W', 'W', 'UE', 'UE', 'UE', 'FE', 'UE', 'FE', 'UE', 'FE', 'FE', 'FE', 'FE', 'FE', 'W', 'UE', 'W', 'W', 'W', 

In [182]:
submission = pd.DataFrame(outcome, index=test_ids)
submission.columns = ['outcome']
print(submission)
submission.to_csv("final_submission_6.csv")

     outcome
ID          
1921       W
486       UE
5177      FE
4632       W
2735      UE
6843      FE
7756      UE
3053      FE
9605       W
5662      UE
930       UE
313        W
6070       W
6207       W
7468      UE
7814      UE
1039      UE
8476       W
5931      UE
5280      FE
4575       W
6082      UE
6234      UE
3410       W
6918      FE
5034       W
3379       W
2425       W
1240      UE
8575       W
...      ...
5668       W
885       UE
2821      UE
3853      UE
8946       W
4293      UE
8786      UE
9204      FE
5174       W
9580      UE
5073      UE
1638       W
5074      UE
8677      UE
3248      UE
1650      FE
9945       W
4711       W
6561      UE
3945      UE
4017       W
8969      UE
681       UE
2023      FE
9321      UE
9081       W
1281      UE
1169      UE
2374      UE
311        W

[1999 rows x 1 columns]
