## IMPORTING LIBRARIES

In [597]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## IMPORTING DATA

In [598]:
train_data = pd.read_csv('./train_data/Train.csv', delimiter=',')
test_data = pd.read_csv('./test_data/Test.csv', delimiter=',')
print(train_data.shape)

(2508, 10)


In [599]:
train_data.head()

Unnamed: 0,Team1,Team2,Stadium,HostCountry,Team1_Venue,Team2_Venue,Team1_Innings,Team2_Innings,MonthOfMatch,MatchWinner
0,5,4,37,4,Home,Away,Second,First,Dec,4
1,1,14,84,7,Neutral,Neutral,First,Second,Sep,1
2,9,15,47,9,Home,Away,First,Second,Feb,9
3,7,2,102,6,Home,Away,First,Second,Aug,2
4,6,8,46,5,Home,Away,First,Second,Aug,6


In [600]:
test_data.head()

Unnamed: 0,Team1,Team2,Stadium,HostCountry,Team1_Venue,Team2_Venue,Team1_Innings,Team2_Innings,MonthOfMatch
0,2,4,34,1,Home,Away,First,Second,Oct
1,14,1,19,15,Home,Away,First,Second,Mar
2,9,10,130,14,Neutral,Neutral,Second,First,Dec
3,9,10,8,9,Home,Away,First,Second,Dec
4,5,15,130,14,Neutral,Neutral,First,Second,Oct


In [624]:
train_data.groupby("HostCountry")["HostCountry"].count()

HostCountry
0     407
1     132
2      23
3     267
4     275
5      37
6      50
7       5
8      25
9     220
10    128
11     21
12    218
13    180
14    189
15    196
16    135
Name: HostCountry, dtype: int64

## EDA

In [602]:
print(train_data.loc[(train_data.MatchWinner == train_data.Team1) & (train_data.Team1_Venue == 'Home')].count())
print(train_data.loc[(train_data.Team1_Innings == "First") & (train_data.MatchWinner == train_data.Team1)].count())
print(train_data[(train_data.Team1_Innings == "First")].count())
print(train_data[(train_data.Team2_Innings == "First")].count())
print(train_data.loc[(train_data.Team1_Innings == "First") & (train_data.MatchWinner == train_data.Team1)].count())
print(train_data.loc[(train_data.Team1_Innings == "Second") & (train_data.MatchWinner == train_data.Team1)].count())
print(train_data.loc[(train_data.Team2_Innings == "Second") & (train_data.MatchWinner == train_data.Team2)].count())
print(train_data.groupby(["Stadium"])["MatchWinner"].count().sort_values(ascending=False))
print(train_data.groupby(["Stadium","Team1"])["MatchWinner"].count().sort_values(ascending=False))
print(train_data.groupby(["MonthOfMatch", "Team1"])['MatchWinner'].count().sort_values(ascending=False))
train_data.MonthOfMatch.nunique()
train_data.Team2.unique()

Team1            1253
Team2            1253
Stadium          1253
HostCountry      1253
Team1_Venue      1253
Team2_Venue      1253
Team1_Innings    1253
Team2_Innings    1253
MonthOfMatch     1253
MatchWinner      1253
dtype: int64
Team1            1255
Team2            1255
Stadium          1255
HostCountry      1255
Team1_Venue      1255
Team2_Venue      1255
Team1_Innings    1255
Team2_Innings    1255
MonthOfMatch     1255
MatchWinner      1255
dtype: int64
Team1            697
Team2            697
Stadium          697
HostCountry      697
Team1_Venue      697
Team2_Venue      697
Team1_Innings    697
Team2_Innings    697
MonthOfMatch     697
MatchWinner      697
dtype: int64
Team1            737
Team2            737
Stadium          737
HostCountry      737
Team1_Venue      737
Team2_Venue      737
Team1_Innings    737
Team2_Innings    737
MonthOfMatch     737
MatchWinner      737
dtype: int64
Team1            556
Team2            556
Stadium          556
HostCountry      556
Team

array([ 4, 14, 15,  2,  8, 13, 12, 10,  5,  1,  9,  6,  7,  0,  3, 11])

In [603]:
train_data.groupby("Team1_Venue")["Team1_Venue"].count()

Team1_Venue
Away          1
Home       1797
Neutral     710
Name: Team1_Venue, dtype: int64

In [604]:
train_data["Team2_Venue"].unique()

array(['Away', 'Neutral', 'Home'], dtype=object)

## FEATURE EXTRACTION

- Preparing new feaures based on venue to make it a binary variable.

In [605]:
venue = {"Home":1, "Neutral":2, "Away":3}
innings = {"Second":2, "First":1}
month = {"Jan":1, "Feb":2, "Mar":3, "Apr":4, "May":5, "Jun":6, "Jul":7, "Aug":8, "Sep":9, "Oct":10, "Nov":11, "Dec":12}
train_data["Team1_Venue"] = train_data["Team1_Venue"].map(venue)
train_data["Team2_Venue"] = train_data["Team2_Venue"].map(venue)
train_data["Team2_Innings"] = train_data["Team2_Innings"].map(innings)
train_data["Team1_Innings"] = train_data["Team1_Innings"].map(innings)
train_data["MonthOfMatch"] = train_data["MonthOfMatch"].map(month)
train_data.head()
venue = {"Home":1, "Neutral":2, "Away":3}
innings = {"Second":2, "First":1}
month = {"Jan":1, "Feb":2, "Mar":3, "Apr":4, "May":5, "Jun":6, "Jul":7, "Aug":8, "Sep":9, "Oct":10, "Nov":11, "Dec":12}
test_data["Team1_Venue"] = test_data["Team1_Venue"].map(venue)
test_data["Team2_Venue"] = test_data["Team2_Venue"].map(venue)
test_data["Team2_Innings"] = test_data["Team2_Innings"].map(innings)
test_data["Team1_Innings"] = test_data["Team1_Innings"].map(innings)
test_data["MonthOfMatch"] = test_data["MonthOfMatch"].map(month)

In [606]:
train_data["Team1_Home"] = 0
train_data["Team1_Away"] = 0
train_data["Team1_Neutral"] = 0
train_data["Team2_Home"] = 0
train_data["Team2_Away"] = 0
train_data["Team2_Neutral"] = 0
venue = ["Team1_Home", "Team1_Neutral", "Team1_Away", "Team2_Home", "Team2_Neutral", "Team2_Away"]
for i in range(0, 3):
    train_data.loc[train_data.Team1_Venue == i+1, venue[i]] = 1
    train_data.loc[train_data.Team2_Venue == i+4, venue[i+3]] = 1

In [607]:
test_data["Team1_Home"] = 0
test_data["Team1_Away"] = 0
test_data["Team1_Neutral"] = 0
test_data["Team2_Home"] = 0
test_data["Team2_Away"] = 0
test_data["Team2_Neutral"] = 0
venue = ["Team1_Home", "Team1_Neutral", "Team1_Away", "Team2_Home", "Team2_Neutral", "Team2_Away"]
for i in range(0, 3):
    test_data.loc[test_data.Team1_Venue == i+1, venue[i]] = 1
    test_data.loc[test_data.Team2_Venue == i+4, venue[i+3]] = 1

In [609]:
train_data.loc[train_data.Team1_Home == 1].count()

Team1            1797
Team2            1797
Stadium          1797
HostCountry      1797
Team1_Venue      1797
Team2_Venue      1797
Team1_Innings    1797
Team2_Innings    1797
MonthOfMatch     1797
MatchWinner      1797
Team1_Home       1797
Team1_Away       1797
Team1_Neutral    1797
Team2_Home       1797
Team2_Away       1797
Team2_Neutral    1797
dtype: int64

In [625]:
features_to_use = ["Team1", "Team2", "Team1_Innings", "Team2_Innings"]
features_to_use += venue

In [626]:
features_to_use

['Team1',
 'Team2',
 'Team1_Innings',
 'Team2_Innings',
 'Team1_Home',
 'Team1_Neutral',
 'Team1_Away',
 'Team2_Home',
 'Team2_Neutral',
 'Team2_Away']

In [627]:
X = train_data[features_to_use]
y = train_data['MatchWinner']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.70, random_state=42)

In [628]:
test_data = test_data[features_to_use]
test_data.head()

Unnamed: 0,Team1,Team2,Team1_Innings,Team2_Innings,Team1_Home,Team1_Neutral,Team1_Away,Team2_Home,Team2_Neutral,Team2_Away
0,2,4,1,2,1,0,0,0,0,0
1,14,1,1,2,1,0,0,0,0,0
2,9,10,2,1,0,1,0,0,0,0
3,9,10,1,2,1,0,0,0,0,0
4,5,15,1,2,0,1,0,0,0,0


## TRAINING

## RANDOM FOREST CLASSIFIER

In [629]:
rf = RandomForestClassifier(n_estimators=500, max_depth=15, min_samples_split=5,criterion='gini')
rf.fit(X_train, y_train)
score = rf.score(X_train, y_train)
score2 = rf.score(X_test, y_test)
print("Training set accuracy: ", '%.3f'%(score)) 
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.785
Test set accuracy:  0.568


## LOGISTIC REGRESSION 

In [630]:
model = LogisticRegression()
model.fit(X_train, y_train)
score = model.score(X_train, y_train)
score2 = model.score(X_test, y_test)
print("Training set accuracy: ", '%.3f'%(score)) 
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.267
Test set accuracy:  0.238


In [631]:
pred = rf.predict_proba(test_data)

In [632]:
# pred = rf.predict_proba(X_test)
predictions = pd.DataFrame(pred)

In [633]:
predictions.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.000333,0.039373,0.272529,0.0,0.074633,0.576291,0.0028,0.000857,0.012705,0.017179,0.0033,0.0,0.0,0.0,0.0,0.0
1,0.001986,0.535176,0.019957,0.0,0.042729,0.003705,0.000571,0.0,0.0,0.0,0.0,0.0,0.01931,0.062466,0.312967,0.001133
2,0.0,0.0,0.0,0.0,0.0,0.026742,0.0,0.038393,0.025855,0.079731,0.632041,0.001186,0.158113,0.037939,0.0,0.0
3,0.000444,0.002071,0.000333,0.0,0.049326,0.030445,0.02851,0.032024,0.001667,0.551508,0.074485,0.0,0.202798,0.025489,0.0009,0.0
4,0.027649,0.031624,0.0,0.0,0.0,0.462243,0.0,0.0,0.0,0.067276,0.065237,0.0,0.0008,0.029445,0.019029,0.296696


## OUTPUT FILE

In [634]:
if not os.path.exists('scores'):
    os.makedirs('scores')
predictions.to_excel('./scores/scores.xlsx')