<a href="https://colab.research.google.com/github/Melvinmcrn/Project_ML_Soccer-world-cup-2018/blob/master/Soccer%20World%20Cup%202018%20Winner%20Melvin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sb
import numpy as np
%matplotlib inline

# **Data**

There are three datasets here.<br>
1. **rankings**: FIFA rankings and points for each team; the ranking is updated monthly and has previously been shown to be a decent predictor of team performance<br>
2. **matches**: used to find out how much the difference in points, the difference in rank, and the teams' current ranks affect the outcome of a match<br>
3. **world_cup**: upcoming matches

In [0]:
# Load the three raw CSVs straight from the project's GitHub repository.
rankings, matches, world_cup = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/Melvinmcrn/Project_ML_Soccer-world-cup-2018/master/data/fifa_ranking.csv',
        'https://raw.githubusercontent.com/Melvinmcrn/Project_ML_Soccer-world-cup-2018/master/data/results.csv',
        'https://raw.githubusercontent.com/Melvinmcrn/Project_ML_Soccer-world-cup-2018/master/data/World%20Cup%202018%20Dataset.csv',
    )
)

### **Prepare 'rankings'**

In [0]:
# Preview the first five rows of the raw rankings table.
rankings.head(n=5)

In [0]:
# Keep only the columns used later, normalise Iran's name, add the summed
# weighted points and parse the ranking date.
rankings = (
    rankings[['rank', 'country_full', 'country_abrv', 'cur_year_avg_weighted',
              'rank_date', 'two_year_ago_weighted', 'three_year_ago_weighted']]
    .replace({"IR Iran": "Iran"})
    .assign(
        weighted_points=lambda df: (df['cur_year_avg_weighted']
                                    + df['two_year_ago_weighted']
                                    + df['three_year_ago_weighted']),
        rank_date=lambda df: pd.to_datetime(df['rank_date']),
    )
)
rankings.head()

### **Prepare 'matches'**

In [0]:
# Preview the first five rows of the raw match results.
matches.head(n=5)

In [0]:
# Rename two teams in the match results and parse the match date.
matches = (
    matches
    .replace({'Germany DR': 'Germany', 'China': 'China PR'})
    .assign(date=lambda df: pd.to_datetime(df['date']))
)
matches.head()

### **Prepare 'world_cup'**

In [0]:
# Preview the first five rows of the raw World Cup fixture table.
world_cup.head(n=5)

In [0]:
# Keep the team, group and three group-stage opponents, drop fully empty rows,
# fix misspelt team names and index the table by team.
world_cup = (
    world_cup
    .loc[:, ['Team', 'Group', 'First match \nagainst', 'Second match\n against', 'Third match\n against']]
    .dropna(how='all')
    .replace({
        "IRAN": "Iran",
        "Costarica": "Costa Rica",
        "Porugal": "Portugal",
        "Columbia": "Colombia",
        "Korea": "Korea Republic",
    })
    .set_index('Team')
)
world_cup.head(10)

# **Feature extraction**

In [0]:
# I want to have the ranks for every day:
# resample each country's rankings to daily frequency and carry the last
# published values forward so every match date can be joined on directly.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`; the result is the same.
# NOTE(review): the forward fill runs over the concatenated frame, not per
# country; this is only safe if each country's first row has no missing
# values -- confirm against the data.
rankings = (
    rankings.set_index(['rank_date'])
    .groupby(['country_full'], group_keys=False)
    .resample('D').first()
    .ffill()
    .reset_index()
)

In [0]:
# Preview the first five rows of the daily rankings.
rankings.head(n=5)

In [0]:
# Attach the home team's ranking on the match date (inner join, so matches
# without a ranking row for that team and day are dropped).
matches_rank = pd.merge(
    matches,
    rankings,
    left_on=['date', 'home_team'],
    right_on=['rank_date', 'country_full'],
)
matches_rank.head()

In [0]:
# Attach the away team's ranking the same way; overlapping ranking columns
# get the suffixes '_home' (existing) and '_away' (newly joined).
matches_rank = pd.merge(
    matches_rank,
    rankings,
    left_on=['date', 'away_team'],
    right_on=['rank_date', 'country_full'],
    suffixes=('_home', '_away'),
)
matches_rank.head()

In [0]:
# Feature generation: derive the model inputs and the target from the joined
# home/away columns. Later keyword arguments may use columns created by
# earlier ones (is_won reads score_difference).
matches_rank = matches_rank.assign(
    rank_difference=lambda df: df['rank_home'] - df['rank_away'],
    average_rank=lambda df: (df['rank_home'] + df['rank_away']) / 2,
    point_difference=lambda df: df['weighted_points_home'] - df['weighted_points_away'],
    score_difference=lambda df: df['home_score'] - df['away_score'],
    is_won=lambda df: df['score_difference'] > 0,  # take draw as lost
    is_stake=lambda df: df['tournament'] != 'Friendly',
)
matches_rank.head()

In [0]:
# I tried earlier rest days but it did not turn to be useful
# Days since the same team's previous *home* match in this frame, clipped to
# [0, max_rest]; a team's first home match (no predecessor) gets max_rest.
# The 'date' column is selected BEFORE diff(): diffing the whole grouped frame
# also tries to subtract the string columns, which raises on current pandas.
# NOTE(review): this assumes rows are in chronological order -- confirm.
max_rest = 30
matches_rank['rest_days'] = (
    matches_rank.groupby('home_team')['date']
    .diff()
    .dt.days
    .clip(0, max_rest)
    .fillna(max_rest)
)

In [0]:
# I tried earlier the team as well but that did not make a difference either
# Label each match with its home team if that team plays in the World Cup,
# otherwise 'Other', then one-hot encode the label.
# `where` replaces the original `str * bool` multiplication trick, which
# depends on object-dtype string arithmetic.
is_wc_home_team = matches_rank['home_team'].isin(world_cup.index.tolist())
matches_rank['wc_participant'] = matches_rank['home_team'].where(is_wc_home_team, 'Other')

wc_dummies = pd.get_dummies(matches_rank['wc_participant'])
# Drop dummy columns left by a previous run of this cell so that re-running it
# does not fail with overlapping column names; on a first run nothing is dropped.
matches_rank = (
    matches_rank
    .drop(columns=wc_dummies.columns, errors='ignore')
    .join(wc_dummies)
)

In [0]:
# Preview the first five rows of the feature table, including the dummy columns.
matches_rank.head(n=5)

In [0]:
# Print column names, dtypes and non-null counts of the feature table.
matches_rank.info()

# **Visualize**

In [0]:
# Use the dark-grid seaborn style for every following figure, then count
# friendlies (is_stake == False) versus all other matches.
sb.set(style='darkgrid')
sb.countplot(data=matches_rank, x='is_stake')

In [0]:
# Class balance of the target: home wins versus draws and losses.
sb.countplot(data=matches_rank, x='is_won')

In [0]:
# Wide figure so every score-difference value gets its own readable bar.
fig, ax0 = plt.subplots(figsize=(20, 5))
sb.countplot(data=matches_rank, x='score_difference', ax=ax0)

In [0]:
# find outlier: box plot of the score difference
fig, ax = plt.subplots(figsize=(10, 5))
sb.boxplot(data=matches_rank, x='score_difference', ax=ax)

In [0]:
# find outlier: box plot of the rank difference
fig, ax = plt.subplots(figsize=(10, 5))
sb.boxplot(data=matches_rank, x='rank_difference', ax=ax)

In [0]:
# remove outlier: keep matches whose score difference is strictly
# between -6 and 7, then re-draw the box plot on the filtered data.
matches_rank_clean = matches_rank.query('score_difference < 7 and score_difference > -6')

fig, ax = plt.subplots(figsize=(10, 5))
sb.boxplot(data=matches_rank_clean, x='score_difference', ax=ax)

In [0]:
# remove outlier: additionally keep only matches whose rank difference is
# strictly between -124 and 118, then re-draw the box plot.
matches_rank_clean = matches_rank_clean.query('rank_difference < 118 and rank_difference > -124')

fig, ax = plt.subplots(figsize=(10, 5))
sb.boxplot(data=matches_rank_clean, x='rank_difference', ax=ax)

# **Modeling**

Binary classifier: each model only predicts whether the home team wins; a draw is counted as a loss.

## **Logistic regression**

In [0]:
from sklearn import linear_model
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

### evaluate model

In [0]:
def evaluateModel(model, X_train, X_test, y_train, y_test):
  """Visualise a fitted binary classifier and profile its test-set errors.

  Draws one figure with three panels (test ROC curve with the AUC in the
  title, test confusion matrix, training confusion matrix), then one KDE
  figure per rank/point feature comparing the misclassified test rows with
  all rows, and finally prints the share of each 'is_stake' value among the
  wrong predictions and among all rows.

  Parameters
  ----------
  model : fitted estimator
      Must provide ``predict`` and ``predict_proba``.
  X_train, X_test : pandas.DataFrame
      Feature frames containing the columns 'average_rank',
      'rank_difference', 'point_difference' and 'is_stake'.
  y_train, y_test : pandas.Series
      True labels; ``y_test`` must share its index with ``X_test`` because
      the error mask built from it is used to select rows of ``X_test``.

  Returns
  -------
  None
  """
  # The "all rows" baseline is train + test. This used to be read from a
  # notebook-global ``X``, so the function silently depended on whichever
  # frame the calling cell had assigned last.
  X_all = pd.concat([X_train, X_test])

  # Predict once and reuse for the ROC curve, AUC, confusion matrix and errors.
  test_scores = model.predict_proba(X_test)[:,1]
  test_pred = model.predict(X_test)

  # figures
  fpr, tpr, _ = roc_curve(y_test, test_scores)
  plt.figure(figsize=(15,5))
  ax = plt.subplot(1,3,1)
  ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
  ax.plot(fpr, tpr)
  ax.set_title('AUC score is {0:0.2}'.format(roc_auc_score(y_test, test_scores)))
  ax.set_aspect(1)

  ax = plt.subplot(1,3,2)
  cm = confusion_matrix(y_test, test_pred)
  ax.imshow(cm, cmap='Blues', clim = (0, cm.max()))
  ax.set_xlabel('Predicted label')
  ax.set_ylabel('True label')
  ax.set_title('Performance on the Test set')

  ax = plt.subplot(1,3,3)
  cm = confusion_matrix(y_train, model.predict(X_train))
  ax.imshow(cm, cmap='Blues', clim = (0, cm.max()))
  ax.set_xlabel('Predicted label')
  ax.set_ylabel('True label')
  ax.set_title('Performance on the Training set')

  features = ['average_rank', 'rank_difference', 'point_difference']
  # Boolean mask over the test rows the model got wrong.
  wrongs = y_test != test_pred

  for feature in features:
      plt.figure()
      plt.title(feature)
      X_test.loc[wrongs, feature].plot.kde()
      X_all.loc[:, feature].plot.kde()
      plt.legend(['wrongs', 'all'])

  print("Stakes distribution in the wrong predictions")
  print(X_test.loc[wrongs, 'is_stake'].value_counts() / wrongs.sum())
  print("Stakes distribution overall")
  print(X_all['is_stake'].value_counts() / X_all.shape[0])

### normal data

In [0]:
# Logistic regression on degree-2 polynomial features, all matches.
feature_cols = ['average_rank', 'rank_difference', 'point_difference', 'is_stake']
X = matches_rank.loc[:, feature_cols]
y = matches_rank['is_won']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

features = PolynomialFeatures(degree=2)
logreg = linear_model.LogisticRegression(C=1e-5)
model = Pipeline(steps=[
    ('polynomial_features', features),
    ('logistic_regression', logreg),
])
model.fit(X_train, y_train)

evaluateModel(model, X_train, X_test, y_train, y_test)

### cleaned data

In [0]:
# Logistic regression on degree-2 polynomial features, outliers removed.
feature_cols = ['average_rank', 'rank_difference', 'point_difference', 'is_stake']
X = matches_rank_clean.loc[:, feature_cols]
y = matches_rank_clean['is_won']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

features = PolynomialFeatures(degree=2)
logreg = linear_model.LogisticRegression(C=1e-5)
model = Pipeline(steps=[
    ('polynomial_features', features),
    ('logistic_regression', logreg),
])
model.fit(X_train, y_train)

evaluateModel(model, X_train, X_test, y_train, y_test)

### normal data but add 'rest_days' to feature

In [0]:
# Logistic regression on degree-2 polynomial features, all matches, with 'rest_days'.
feature_cols = ['average_rank', 'rank_difference', 'point_difference', 'is_stake', 'rest_days']
X = matches_rank.loc[:, feature_cols]
y = matches_rank['is_won']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

features = PolynomialFeatures(degree=2)
logreg = linear_model.LogisticRegression(C=1e-5)
model = Pipeline(steps=[
    ('polynomial_features', features),
    ('logistic_regression', logreg),
])
model.fit(X_train, y_train)

evaluateModel(model, X_train, X_test, y_train, y_test)

### cleaned data but add 'rest_days' to feature

In [0]:
# Logistic regression on degree-2 polynomial features, outliers removed, with 'rest_days'.
feature_cols = ['average_rank', 'rank_difference', 'point_difference', 'is_stake', 'rest_days']
X = matches_rank_clean.loc[:, feature_cols]
y = matches_rank_clean['is_won']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

features = PolynomialFeatures(degree=2)
logreg = linear_model.LogisticRegression(C=1e-5)
model = Pipeline(steps=[
    ('polynomial_features', features),
    ('logistic_regression', logreg),
])
model.fit(X_train, y_train)

evaluateModel(model, X_train, X_test, y_train, y_test)

## **Decision Tree**

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,classification_report

In [0]:
def DecisionTree(X, y):
    """Grid-search a decision tree on an 80/20 split and print its test metrics.

    The features are standardised, ``max_depth`` (1..19) and the split
    criterion are tuned with ``GridSearchCV``, and the best estimator, the
    test accuracy and the classification report are printed.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Target labels.

    Returns
    -------
    None
    """
    # The body previously mixed 4-space and 2-space indentation, which is an
    # IndentationError; it is now indented consistently.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only so that no statistics of the
    # held-out rows leak into training; the test split is only transformed.
    scale = StandardScaler()
    X_train = scale.fit_transform(X_train)
    X_test = scale.transform(X_test)

    param_grid = {'max_depth': np.arange(1, 20),
                  'criterion': ['entropy', 'gini']}
    # Seed the tree so that repeated runs select the same model.
    tree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid)
    tree.fit(X_train, y_train)
    print(tree.best_estimator_)

    y_pred = tree.predict(X_test)

    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

### normal data

In [0]:
# Decision tree on all matches.
X = matches_rank.loc[:, ['average_rank', 'rank_difference', 'point_difference', 'is_stake']]
y = matches_rank['is_won']
DecisionTree(X, y)

### cleaned data

In [0]:
# Decision tree on the outlier-free matches.
X = matches_rank_clean.loc[:, ['average_rank', 'rank_difference', 'point_difference', 'is_stake']]
y = matches_rank_clean['is_won']
DecisionTree(X, y)

### normal data with feature 'rest_days'

In [0]:
# Decision tree on all matches, with 'rest_days' as an extra feature.
X = matches_rank.loc[:, ['average_rank', 'rank_difference', 'point_difference', 'is_stake', 'rest_days']]
y = matches_rank['is_won']
DecisionTree(X, y)

### cleaned data with feature 'rest_days'

In [0]:
# Decision tree on the outlier-free matches, with 'rest_days' as an extra feature.
X = matches_rank_clean.loc[:, ['average_rank', 'rank_difference', 'point_difference', 'is_stake', 'rest_days']]
y = matches_rank_clean['is_won']
DecisionTree(X, y)

## **SVM**

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [0]:
def SVM(X, y):
  """Grid-search an SVC on an 80/20 split and print its test metrics.

  The features are standardised, ``C`` (1 to 10.5 in steps of 0.5) and
  ``gamma`` ('auto' or 'scale') are tuned with ``GridSearchCV``, and the best
  estimator, the test accuracy and the classification report are printed.

  Parameters
  ----------
  X : array-like of shape (n_samples, n_features)
      Feature matrix.
  y : array-like of shape (n_samples,)
      Target labels.

  Returns
  -------
  None
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=42)

  # Fit the scaler on the training split only so that no statistics of the
  # held-out rows leak into training; the test split is only transformed.
  scale = StandardScaler()
  X_train = scale.fit_transform(X_train)
  X_test = scale.transform(X_test)

  parameters = {'C': np.arange(1, 11, 0.5), 'gamma': ['auto', 'scale']}
  # Named `search` rather than `SVM` so the local no longer shadows this function.
  search = GridSearchCV(svm.SVC(), parameters)
  search.fit(X_train, y_train)
  print(search.best_estimator_)

  y_pred = search.predict(X_test)

  print(accuracy_score(y_test, y_pred))
  print(classification_report(y_test, y_pred))

### normal data

In [0]:
# SVM on all matches.
X = matches_rank.loc[:, ['average_rank', 'rank_difference', 'point_difference', 'is_stake']]
y = matches_rank['is_won']
SVM(X, y)

### cleaned data

In [0]:
# SVM on the outlier-free matches.
X = matches_rank_clean.loc[:, ['average_rank', 'rank_difference', 'point_difference', 'is_stake']]
y = matches_rank_clean['is_won']
SVM(X, y)

### normal data with feature 'rest_days'

In [0]:
# SVM on all matches, with 'rest_days' as an extra feature.
X = matches_rank.loc[:, ['average_rank', 'rank_difference', 'point_difference', 'is_stake', 'rest_days']]
y = matches_rank['is_won']
SVM(X, y)

### cleaned data with feature 'rest_days'

In [0]:
# SVM on the outlier-free matches, with 'rest_days' as an extra feature.
X = matches_rank_clean.loc[:, ['average_rank', 'rank_difference', 'point_difference', 'is_stake', 'rest_days']]
y = matches_rank_clean['is_won']
SVM(X, y)

In [0]:
# **Random Forest**

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
# scikit-learn metrics module, used for the accuracy calculation
from sklearn import metrics

# Build the train/test split explicitly instead of reusing the X_train/y_train
# variables left behind by an earlier cell. On a top-to-bottom run those
# leftovers came from the last logistic-regression cell (outlier-free data
# with 'rest_days', test_size=0.2, random_state=42), so this is the same split.
X = matches_rank_clean.loc[:, ['average_rank', 'rank_difference', 'point_difference', 'is_stake', 'rest_days']]
y = matches_rank_clean['is_won']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Create the random forest classifier (seeded so the result is reproducible)
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training set
clf.fit(X_train, y_train)

# Prediction on the test set
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))