In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('./data.csv', index_col=0)

In [None]:
df.shape

In [3]:
testM = df.loc[(df.is_goal.isna()) & (~df.shot_id_number.isna()),:]
train = df.loc[~df.is_goal.isna(),:]

In [None]:
print("Train shape: ", train.shape)
print("Test shape:", test.shape)

In [None]:
train.head()

## Cleaning

In [None]:
train.columns

In [4]:
train = train.drop(['match_event_id','shot_id_number','match_id','team_id','remaining_min.1','remaining_sec.1','knockout_match.1','power_of_shot.1','distance_of_shot.1','team_name'],axis=1)

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
meanImputer = SimpleImputer(np.nan, strategy='mean')
medianImputer = SimpleImputer(np.nan, strategy='median')
frequentImputer = SimpleImputer(np.nan, strategy='most_frequent')

In [7]:
train['is_shot'] = train.type_of_shot.apply(lambda x: 0 if x is np.nan else 1)
train['is_combined_shot'] = train.type_of_combined_shot.apply(lambda x: 0 if x is np.nan else 1)

train.type_of_combined_shot = train.type_of_combined_shot.fillna("000")
train.type_of_shot = train.type_of_shot.fillna("00")

train.type_of_combined_shot = train.type_of_combined_shot.apply(lambda x: int(str(x)[-1]))
train.type_of_shot = train.type_of_shot.apply(lambda x: int(str(x)[-1]))

train['shot_type'] = train.type_of_combined_shot + train.type_of_shot

train['lat'] = train['lat/lng'].apply(lambda x: float(str(x).split(", ")[0]))
train['long'] = train['lat/lng'].apply(lambda x: float(str(x).split(", ")[-1]))

train = train.drop(['type_of_shot','type_of_combined_shot','lat/lng'], axis=1)

In [8]:
medianColumns = ['remaining_min','knockout_match','lat','long', 'power_of_shot']
meanColumns = ['location_x','location_y','remaining_sec','distance_of_shot']
frequentColumns = ['area_of_shot','shot_basics','range_of_shot','date_of_game','home/away','game_season']
print(len(medianColumns + meanColumns + frequentColumns))

15


In [9]:
meanImputer.fit(train.loc[:,meanColumns])
medianImputer.fit(train.loc[:,medianColumns])
frequentImputer.fit(train.loc[:,frequentColumns])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='most_frequent', verbose=0)

In [10]:
trainImputedMean = meanImputer.transform(train.loc[:,meanColumns])
trainImputedMedian = medianImputer.transform(train.loc[:,medianColumns])
trainImputedFrequent = frequentImputer.transform(train.loc[:, frequentColumns])

In [11]:
yTrain = train['is_goal']

In [None]:
train.columns

In [12]:
prefinalTrain = pd.DataFrame(np.concatenate([trainImputedMean, trainImputedMedian, train[['is_shot','is_combined_shot','shot_type']].values], axis=1))

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [40]:
for i in range(5):
    trainImputedFrequent[:,i] = le.fit_transform(trainImputedFrequent[:,i])

finalTrain = pd.concat([prefinalTrain, pd.DataFrame(trainImputedFrequent)], axis=1)

In [41]:
finalTrain.columns = list(range(finalTrain.shape[1]))

In [47]:
finalTrain['year'] = finalTrain[17].apply(lambda x: int(str(x).split("-")[0]))

In [48]:
finalTrain['month'] = finalTrain[17].apply(lambda x: int(str(x).split("-")[-1]))

In [49]:
finalTrain

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,year,month
0,-157.000000,0.00000,22.000000,35.000000,10.0,0.0,45.539131,-122.651648,1.0,1.0,0.0,5.0,2,4,2,308,28,2000-01,2000,1
1,-101.000000,135.00000,45.000000,36.000000,7.0,0.0,45.539131,-122.651648,1.0,1.0,0.0,5.0,1,4,0,308,30,2000-01,2000,1
2,138.000000,175.00000,52.000000,42.000000,6.0,0.0,45.539131,-122.651648,1.0,0.0,1.0,3.0,4,4,0,308,28,2000-01,2000,1
3,0.000000,0.00000,19.000000,20.000000,5.0,0.0,45.539131,-122.651648,2.0,0.0,1.0,1.0,0,0,4,308,28,2000-01,2000,1
4,-145.000000,-11.00000,32.000000,34.000000,9.0,0.0,45.539131,-122.651648,3.0,1.0,0.0,7.0,2,4,2,308,28,2005-06,2005,6
5,0.000000,0.00000,52.000000,20.000000,8.0,0.0,45.539131,-122.651648,3.0,0.0,1.0,4.0,0,0,4,308,28,2000-01,2000,1
6,-65.000000,91.46118,12.000000,32.000000,6.0,0.0,45.539131,-122.651648,3.0,1.0,0.0,6.0,2,1,2,308,28,2000-01,2000,1
7,-33.000000,91.46118,36.000000,32.000000,3.0,0.0,45.539131,-122.651648,3.0,1.0,0.0,4.0,0,1,2,308,28,2000-01,2000,1
8,-94.000000,238.00000,56.000000,45.000000,1.0,0.0,45.539131,-122.651648,3.0,1.0,0.0,7.0,1,5,1,308,30,2000-01,2000,1
9,121.000000,127.00000,0.000000,37.000000,11.0,0.0,42.982923,-71.446094,1.0,0.0,1.0,3.0,4,4,0,309,71,2000-01,2000,1


## Modelling

In [51]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import xgboost

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [52]:
X_train, X_test, Y_train, Y_test = train_test_split (finalTrain.drop(17, axis=1), yTrain, test_size = 0.20, random_state=42)


In [53]:
pipelines = []
pipelines.append(('ScaledSVC', Pipeline([('Scaler', StandardScaler()),('SVC', svm.SVC())])))
pipelines.append(('ScaledDT', Pipeline([('Scaler', StandardScaler()),('DT', tree.DecisionTreeClassifier())])))
pipelines.append(('ScaledXGB', Pipeline([('Scaler', StandardScaler()),('XGB',xgboost.XGBClassifier())])))
pipelines.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()),('RF', RandomForestClassifier())])))



results = []
names = []
for name, model in pipelines:
    kfold = KFold(n_splits=5, random_state=21)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='neg_mean_absolute_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

ScaledSVC: -0.386685 (0.008372)
ScaledDT: -0.451773 (0.006643)
ScaledXGB: -0.381774 (0.005124)
ScaledRF: -0.413805 (0.006037)


## Hyperparameter Tuning

In [54]:
from sklearn.model_selection import GridSearchCV

scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = dict(n_estimators=np.array([50,100,200,300,400]), max_depth=np.array([5,9,14]))
model = xgboost.XGBClassifier(random_state=21)
kfold = KFold(n_splits=5, random_state=21)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)

In [55]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

-0.378908 (0.003253) with: {'max_depth': 5, 'n_estimators': 50}
-0.381006 (0.003962) with: {'max_depth': 5, 'n_estimators': 100}
-0.386788 (0.004681) with: {'max_depth': 5, 'n_estimators': 200}
-0.392059 (0.005553) with: {'max_depth': 5, 'n_estimators': 300}
-0.398097 (0.007548) with: {'max_depth': 5, 'n_estimators': 400}
-0.387351 (0.009536) with: {'max_depth': 9, 'n_estimators': 50}
-0.397022 (0.009720) with: {'max_depth': 9, 'n_estimators': 100}
-0.401013 (0.009567) with: {'max_depth': 9, 'n_estimators': 200}
-0.407409 (0.011781) with: {'max_depth': 9, 'n_estimators': 300}
-0.410940 (0.008340) with: {'max_depth': 9, 'n_estimators': 400}
-0.401883 (0.004108) with: {'max_depth': 14, 'n_estimators': 50}
-0.403827 (0.003823) with: {'max_depth': 14, 'n_estimators': 100}
-0.406386 (0.003190) with: {'max_depth': 14, 'n_estimators': 200}
-0.407614 (0.004170) with: {'max_depth': 14, 'n_estimators': 300}
-0.408279 (0.004823) with: {'max_depth': 14, 'n_estimators': 400}
Best: -0.378908 using {

## Making predictions

In [None]:
yTest = testM['is_goal']
test = test.drop(['match_event_id','game_season','shot_id_number','match_id','team_id','remaining_min.1','remaining_sec.1','knockout_match.1','power_of_shot.1','distance_of_shot.1','team_name','is_goal'],axis=1)
test['is_shot'] = test.type_of_shot.apply(lambda x: 0 if x is np.nan else 1)
test['is_combined_shot'] = test.type_of_combined_shot.apply(lambda x: 0 if x is np.nan else 1)

test.type_of_combined_shot = test.type_of_combined_shot.fillna("000")
test.type_of_shot = test.type_of_shot.fillna("00")

test.type_of_combined_shot = test.type_of_combined_shot.apply(lambda x: int(str(x)[-1]))

test.type_of_shot = test.type_of_shot.apply(lambda x: int(str(x)[-1]))

test['shot_type'] = test.type_of_combined_shot + test.type_of_shot

test['lat'] = test['lat/lng'].apply(lambda x: float(str(x).split(", ")[0]))
test['long'] = test['lat/lng'].apply(lambda x: float(str(x).split(", ")[-1]))
test.drop(['lat/lng','type_of_shot','type_of_combined_shot'], axis=1, inplace=True)

In [None]:
testImputedMean = meanImputer.transform(test.loc[:,meanColumns])
testImputedMedian = medianImputer.transform(test.loc[:,medianColumns])
testImputedFrequent = frequentImputer.transform(test.loc[:,frequentColumns])

In [None]:
prefinalTest = pd.DataFrame(np.concatenate([testImputedMean, testImputedMedian, test[['is_shot','is_combined_shot','shot_type']].values], axis=1))

In [None]:
for i in range(5):
    testImputedFrequent[:,i] = le.fit_transform(testImputedFrequent[:,i])

finalTest = pd.concat([prefinalTest, pd.DataFrame(testImputedFrequent)], axis=1)

In [None]:
finalTrain.shape

In [None]:
from sklearn.metrics import mean_absolute_error

scaler = StandardScaler().fit(finalTrain)
rescaled_final = scaler.transform(finalTrain)
model = xgboost.XGBClassifier(random_state=21, n_estimators=50, max_depth=5)
model.fit(rescaled_final, yTrain)

# transform the validation dataset
rescaled_X_test = scaler.transform(finalTest)
predictions = model.predict_proba(rescaled_X_test)[:,1]

In [None]:
submission = pd.read_csv('sample_submission.csv')

In [None]:
idx = testM.loc[:,'shot_id_number']

In [None]:
pred = pd.Series(predictions)

In [None]:
pred.index = idx.index

In [None]:
sub = pd.concat([idx, pred], axis=1, ignore_index=True)
sub.columns = ['shot_id_number','is_goal']
sub.shot_id_number = sub.shot_id_number.astype('int64')

In [None]:
sub.to_csv('ayush_nair_082898_prediction_1.csv', index=False)