In [1]:
import pandas as pd

In [2]:
world_cup_years = ["2014","2010","2006","2002","1998","1994"]

world_cup_df = pd.DataFrame()
for year in world_cup_years:
    df = pd.read_csv("../01 Data Collection/data/final/full_" + year + ".csv")
    world_cup_df = pd.concat([world_cup_df,df], axis=0)
world_cup_df = world_cup_df.reset_index(drop=True)

In [3]:
world_cup_df.columns

Index(['Date', 'Team1', 'Team1_score', 'Team2', 'Team2_score', 'team1_fouls',
       'team2_fouls', 'team1_yellow_cards', 'team2_yellow_cards',
       'team1_red_cards', 'team2_red_cards', 'team1_offsides',
       'team2_offsides', 'team1_won_corners', 'team2_won_corners',
       'team1_saves', 'team2_saves', 'team1_possession', 'team2_possession',
       'team1_shots', 'team2_shots', 'team1_avg_wins', 'team1_avg_draws',
       'team1_avg_gd', 'team2_avg_wins', 'team2_avg_draws', 'team2_avg_gd',
       'team1_lineup', 'team2_lineup', 'year', 'avg_age_t1', 'avg_gpa_t1',
       'avg_apa_t1', 'avg_ycpa_t1', 'total_minutes_t1', 'avg_age_t2',
       'avg_gpa_t2', 'avg_apa_t2', 'avg_ycpa_t2', 'total_minutes_t2'],
      dtype='object')

In [4]:
world_cup_df = world_cup_df.drop(['Date', 'Team1', 'Team2', 'team1_fouls',
       'team2_fouls', 'team1_yellow_cards', 'team2_yellow_cards',
       'team1_red_cards', 'team2_red_cards', 'team1_offsides',
       'team2_offsides', 'team1_won_corners', 'team2_won_corners',
       'team1_saves', 'team2_saves', 'team1_possession', 'team2_possession',
       'team1_shots', 'team2_shots','team1_lineup', 'team2_lineup', "year"], axis = 1)

In [5]:
# find columns that have NAN
world_cup_df.loc[:, world_cup_df.isna().any()].head()

Unnamed: 0,avg_gpa_t1,avg_apa_t1,avg_ycpa_t1,avg_gpa_t2,avg_apa_t2,avg_ycpa_t2
0,0.174733,0.101189,0.188081,0.119909,0.17957,0.171065
1,0.0928,0.065788,0.208215,0.068904,0.017072,0.155783
2,0.117637,0.185847,0.201451,0.164861,0.106784,0.128909
3,0.167072,0.182059,0.23835,0.118354,0.043348,0.164867
4,0.130392,0.118742,0.245432,0.134188,0.119994,0.217186


In [6]:
# safe to fillna across the dataframe with zeros
world_cup_df = world_cup_df.fillna(0)

In [7]:
def get_result(team1_score, team2_score):
    if team1_score > team2_score:
        return 0
    elif team1_score < team2_score:
        return 1
    else:
        return 2

In [8]:
world_cup_df["result"] = world_cup_df.apply(lambda row: get_result(row['Team1_score'], row["Team2_score"]), axis=1)

In [9]:
# world_cup_df["Team1_won"] = (world_cup_df.Team1_score > world_cup_df.Team2_score).apply(int)
# world_cup_df["Team2_won"] = (world_cup_df.Team2_score > world_cup_df.Team1_score).apply(int)
# world_cup_df["Draw"] = (world_cup_df.Team2_score == world_cup_df.Team1_score).apply(int)
# world_cup_df = world_cup_df.drop(["Team1_score", "Team2_score"], axis = 1)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [11]:
world_cup_df.columns

Index(['Team1_score', 'Team2_score', 'team1_avg_wins', 'team1_avg_draws',
       'team1_avg_gd', 'team2_avg_wins', 'team2_avg_draws', 'team2_avg_gd',
       'avg_age_t1', 'avg_gpa_t1', 'avg_apa_t1', 'avg_ycpa_t1',
       'total_minutes_t1', 'avg_age_t2', 'avg_gpa_t2', 'avg_apa_t2',
       'avg_ycpa_t2', 'total_minutes_t2', 'result'],
      dtype='object')

In [12]:
y = world_cup_df.result
X = world_cup_df.loc[:, ['team1_avg_wins', 'team1_avg_draws', 'team1_avg_gd', 'avg_age_t1', 'avg_gpa_t1',
        'avg_apa_t1', 'avg_ycpa_t1', 'total_minutes_t1',
                             'team2_avg_wins', 'team2_avg_draws', 'team2_avg_gd', 'avg_age_t2', 'avg_gpa_t2', 
        'avg_apa_t2', 'avg_ycpa_t2', 'total_minutes_t2']]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
lr = LogisticRegression(multi_class="multinomial", solver="lbfgs")

In [15]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
yhat = lr.predict(X_test)

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [18]:
accuracy_score(y_test, yhat)

0.5145631067961165

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

In [20]:
clf = GradientBoostingClassifier(n_estimators=10, learning_rate=0.04,max_depth=3, random_state=0).fit(X_train, y_train)

In [21]:
clf.score(X_test, y_test)  

0.49514563106796117

### "Data Augmentation"

In [22]:
# make a copy
world_cup_df2 = world_cup_df.copy()

In [23]:
cols = world_cup_df2.columns.tolist()

In [24]:
print(cols)

['Team1_score', 'Team2_score', 'team1_avg_wins', 'team1_avg_draws', 'team1_avg_gd', 'team2_avg_wins', 'team2_avg_draws', 'team2_avg_gd', 'avg_age_t1', 'avg_gpa_t1', 'avg_apa_t1', 'avg_ycpa_t1', 'total_minutes_t1', 'avg_age_t2', 'avg_gpa_t2', 'avg_apa_t2', 'avg_ycpa_t2', 'total_minutes_t2', 'result']


In [25]:
# rename columns
new_cols = ['Team2_score', 'Team1_score', 'team2_avg_wins', 'team2_avg_draws', 'team2_avg_gd', 'team1_avg_wins', 'team1_avg_draws', 'team1_avg_gd', 'avg_age_t2', 'avg_gpa_t2', 'avg_apa_t2', 'avg_ycpa_t2', 'total_minutes_t2', 'avg_age_t1', 'avg_gpa_t1', 'avg_apa_t1', 'avg_ycpa_t1', 'total_minutes_t1', 'result']

In [26]:
# rearrange columns
world_cup_df2 = world_cup_df2[new_cols]

In [27]:
# rename columns again
world_cup_df2.columns = cols

In [28]:
world_cup_df2["result"] = world_cup_df2.apply(lambda row: get_result(row['Team1_score'], row["Team2_score"]), axis=1)

In [29]:
world_cup_df2.head()

Unnamed: 0,Team1_score,Team2_score,team1_avg_wins,team1_avg_draws,team1_avg_gd,team2_avg_wins,team2_avg_draws,team2_avg_gd,avg_age_t1,avg_gpa_t1,avg_apa_t1,avg_ycpa_t1,total_minutes_t1,avg_age_t2,avg_gpa_t2,avg_apa_t2,avg_ycpa_t2,total_minutes_t2,result
0,1,3,0.6,0.4,0.8,1.0,0.0,3.2,25.888889,0.119909,0.17957,0.171065,26076.0,25.555556,0.174733,0.101189,0.188081,23067.0,1
1,0,1,0.4,0.2,-0.4,0.4,0.2,0.6,25.142857,0.068904,0.017072,0.155783,16241.0,27.5,0.0928,0.065788,0.208215,21688.0,1
2,5,1,0.4,0.4,0.2,0.8,0.0,1.0,25.75,0.164861,0.106784,0.128909,23874.0,26.8,0.117637,0.185847,0.201451,35719.0,0
3,1,3,0.4,0.2,0.4,0.6,0.0,1.0,26.125,0.118354,0.043348,0.164867,18023.0,25.444444,0.167072,0.182059,0.23835,21598.0,1
4,0,3,0.2,0.6,-0.2,0.4,0.6,1.0,26.666667,0.134188,0.119994,0.217186,9285.0,27.25,0.130392,0.118742,0.245432,18565.0,1


In [30]:
# concatenate both dataframes
aug_world_cup_df = pd.concat([world_cup_df,world_cup_df2], axis=0)

### Training with augmented data

In [31]:
X = aug_world_cup_df.loc[:, ['team1_avg_wins', 'team1_avg_draws', 'team1_avg_gd', 'avg_age_t1', 'avg_gpa_t1',
        'avg_apa_t1', 'avg_ycpa_t1', 'total_minutes_t1',
                             'team2_avg_wins', 'team2_avg_draws', 'team2_avg_gd', 'avg_age_t2', 'avg_gpa_t2', 
        'avg_apa_t2', 'avg_ycpa_t2', 'total_minutes_t2']]
y = aug_world_cup_df.result

In [32]:
X_train, X_test_dev, y_train, y_test_dev = train_test_split(X, y, test_size=0.3, random_state=42)

In [33]:
X_dev, X_test, y_dev, y_test = train_test_split(X_test_dev, y_test_dev, test_size=0.5, random_state=42)

In [34]:
lr = LogisticRegression(multi_class="multinomial", solver="lbfgs")

In [35]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [36]:
yhat = lr.predict(X_dev)

In [37]:
accuracy_score(y_dev, yhat)

0.45098039215686275

In [38]:
clf = GradientBoostingClassifier(n_estimators=18, learning_rate=0.01,max_depth=3, random_state=0, subsample=0.5).fit(X_train, y_train)

In [39]:
clf.score(X_dev, y_dev)

0.46078431372549017

In [40]:
# gridsearch cv
from sklearn.model_selection import GridSearchCV

In [41]:
parameters = {'n_estimators':[1, 5, 10, 15, 20, 25, 30, 35], 
              'learning_rate':[0.01, 0.02, 0.03,0.04,0.05,0.06], 
              'max_depth':[1,2,3,4]}
gbc = GradientBoostingClassifier()
clf = GridSearchCV(gbc, parameters)
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1, 5, 10, 15, 20, 25, 30, 35], 'max_depth': [1, 2, 3, 4], 'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [42]:
print(clf.best_params_)

{'learning_rate': 0.04, 'max_depth': 2, 'n_estimators': 25}


In [43]:
X_train_dev = pd.concat([X_train, X_dev], axis = 0)
y_train_dev = pd.concat([y_train, y_dev], axis = 0)

In [44]:
clf_final = GradientBoostingClassifier(n_estimators=25, learning_rate=0.04,max_depth=2, random_state=0).fit(X_train_dev, y_train_dev)

In [45]:
clf_final.score(X_test, y_test)

0.4854368932038835

In [46]:
clf_final = GradientBoostingClassifier(n_estimators=25, learning_rate=0.04,max_depth=2, random_state=0).fit(X, y)

In [47]:
from sklearn.externals import joblib
joblib.dump(clf, 'match_prediction_model.pkl') 
# and later you can load it
# clf = joblib.load('match_prediction_model.pkl')

['match_prediction_model.pkl']