# Modeling and Evaluation

This notebook handles classification and the evaluation of its results. Along the way you will see a few useful techniques, such as grid search and cross-validation, that help make sure the results are valid and awesome.

In [1]:
import pymongo
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, make_scorer, accuracy_score, f1_score
from sklearn.utils import shuffle
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from hyperparameters import params as all_params

## Data Preparation
Retrieve the data from the database and split it into the model's input and output. Note that the id-type fields are of no use here, so they are left out. Beyond that, we must hide from the model any information that comes from the future we are trying to predict. Finally, we set aside a portion of the data right away and reserve it for the final testing of our model.

In [8]:
# Connect with pymongo's default connection settings and select the project database.
client = pymongo.MongoClient()
db = client['soccerdb']

# Projection that excludes the identifier fields from every returned document.
excluded_id_fields = {'_id': 0, 'teamId': 0, 'matchId': 0}

# Read every document of the `results` collection into a DataFrame.
cursor = db.results.find({}, excluded_id_fields)
data = pd.DataFrame(list(cursor))
data.head()

Unnamed: 0,didScoreInSecondHalf,finalScore,isHome,isWinner,meanPlayerOverall,meanPlayerPotential,meanPrevScore,meanPrevScoreET,meanPrevScoreHT,meanPrevScoreP,...,numOffside,numOthersOnTheBall,numPass,numRedCard,numSaveAttempt,numSecondHalfGoals,numSecondYellowCard,numShot,numYellowCard,rateAccuratePass
0,1,3,True,1,58.217391,59.478261,1.0,0.0,0.666667,0.0,...,0,14,294,0,1,1,0,13,0,0.880952
1,1,2,True,1,71.086957,72.217391,0.0,0.0,0.0,0.0,...,2,35,265,0,1,2,0,8,0,0.849057
2,0,1,False,1,38.173913,38.913043,1.333333,0.0,0.666667,0.0,...,1,29,178,0,0,0,0,1,0,0.814607
3,0,0,True,0,38.173913,38.913043,1.0,0.0,1.0,0.0,...,4,13,286,0,0,0,0,12,2,0.846154
4,0,0,True,0,30.608696,31.304348,0.0,0.0,0.0,0.0,...,1,21,172,0,3,0,1,4,1,0.831395


In [3]:
# Outcome columns: one of them is chosen as the prediction target and all of
# them are passed as `hidden_info` so they are removed from the model inputs.
targets = [
    'didScoreInSecondHalf',
    'finalScore',
    'isWinner',
    'numSecondHalfGoals',
]

def get_input_output(data, target_col, hidden_info):
    """Split a DataFrame into model inputs ``X`` and the target ``y``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the target column and every column named in
        ``hidden_info``. It is not modified.
    target_col : str
        Name of the column to predict.
    hidden_info : iterable of str, str or None
        Columns that must be removed from the inputs in addition to
        ``target_col``. Any iterable (list, tuple, set, ...) is accepted, a
        single column name may be given as a plain string, and ``None``
        means "nothing extra to hide". It may or may not contain
        ``target_col`` itself.

    Returns
    -------
    (X, y) : tuple
        ``X`` is ``data`` without the hidden columns and the target column;
        ``y`` is ``data[target_col]``.

    Raises
    ------
    KeyError
        If ``target_col`` or one of the hidden columns is not in ``data``.
    """
    y = data[target_col]

    if hidden_info is None:
        hidden_cols = []
    elif isinstance(hidden_info, str):
        # A bare string would otherwise be iterated character by character.
        hidden_cols = [hidden_info]
    else:
        hidden_cols = list(hidden_info)

    # The target is usually listed in hidden_info as well; dict.fromkeys
    # removes such duplicates while keeping the original order.
    cols_to_drop = list(dict.fromkeys(hidden_cols + [target_col]))
    X = data.drop(columns=cols_to_drop)
    return X, y

# Predict the winner; every column in `targets` is withheld from the inputs.
X, y = get_input_output(data, target_col='isWinner', hidden_info=targets)

In [4]:
# Shuffle with a fixed seed, then hold out 10% of the rows for final testing.
# The split is seeded too: without random_state, train_test_split draws a
# different test set on every run, so the reported scores are not reproducible.
X, y = shuffle(X, y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0)

In [5]:
def train_and_validate(algorithm, params, reporting_scores,
                       fold=5, data_split=None):
    """Grid-search one algorithm with cross-validation and score it on held-out data.

    Parameters
    ----------
    algorithm : str
        Key of the estimator to use: 'svc', 'svr', 'dtc' or 'dtr'.
    params : dict or list of dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    reporting_scores : iterable of str
        Metric names to report ('f1' and/or 'accuracy'). The first one is
        also the metric the grid search optimises. If empty, the search uses
        the estimator's default scorer and no metric is reported.
    fold : int, optional
        Value passed as ``cv`` to ``GridSearchCV`` (default 5).
    data_split : tuple, optional
        ``(X_train, X_test, y_train, y_test)`` in the order returned by
        ``train_test_split``. When omitted, the notebook-level variables of
        those names are used.

    Returns
    -------
    dict
        ``{'algorithm': algorithm, **best_params, **{metric: value}}`` where
        every metric value is computed on the test part of the split.

    Raises
    ------
    KeyError
        If ``algorithm`` or one of ``reporting_scores`` is not supported.
    """
    algorithms = {
        'svc': SVC,
        'svr': SVR,
        'dtc': DecisionTreeClassifier,
        'dtr': DecisionTreeRegressor,
    }

    scores = {
        'f1': f1_score,
        'accuracy': accuracy_score,
    }

    # Materialise once: the names are iterated several times below.
    reporting_scores = list(reporting_scores)

    # Same exception type as a failed dict lookup, but with an actionable message.
    unknown_scores = [name for name in reporting_scores if name not in scores]
    if unknown_scores:
        raise KeyError('unsupported score(s) %r; expected any of %r'
                       % (unknown_scores, sorted(scores)))
    if algorithm not in algorithms:
        raise KeyError('unsupported algorithm %r; expected one of %r'
                       % (algorithm, sorted(algorithms)))

    if data_split is None:
        # Backward-compatible default: use the split created earlier in the notebook.
        data_split = (X_train, X_test, y_train, y_test)
    train_inputs, test_inputs, train_target, test_target = data_split

    # Map each metric name to a scorer callable(estimator, X, y).
    scorers = {name: make_scorer(scores[name]) for name in reporting_scores}

    # The first requested metric drives model selection; with no metric at all
    # GridSearchCV falls back to the estimator's own score method.
    selection_scorer = scorers[reporting_scores[0]] if reporting_scores else None

    results = {'algorithm': algorithm}
    search = GridSearchCV(algorithms[algorithm](), params, cv=fold,
                          scoring=selection_scorer)
    search.fit(train_inputs, train_target)

    results.update(search.best_params_)

    # Score the fitted search object on the held-out data for every metric.
    for name in reporting_scores:
        results[name] = scorers[name](search, test_inputs, test_target)
    return results


In [6]:
# Metrics to report; the first one is also what the grid search optimises.
metrics = ['accuracy', 'f1']

# One result row per algorithm configured in the hyperparameters module.
all_results = [
    train_and_validate(name, grid, metrics)
    for name, grid in all_params.items()
]

df = pd.DataFrame(all_results)



In [7]:
# Summary table: one row per algorithm with its best hyper-parameters and test-set scores.
df

Unnamed: 0,C,accuracy,algorithm,f1,gamma,kernel
0,1,0.545455,svc,0.0,0.0001,rbf
