# Modeling and Evaluation

This notebook is mainly responsible for training the models (classification or regression, depending on the chosen target) and evaluating their results. Along the way you will see a few useful tricks, such as grid search and cross-validation, which are used to tune the hyper-parameters and to make sure the reported results are valid.

In [2]:
%load_ext autoreload
%autoreload 2

import warnings
import pymongo
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.utils import shuffle
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neural_network import MLPClassifier

from hyperparameters import params as all_params

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Suppress every warning category for the remainder of the kernel session.
warnings.simplefilter('ignore')

## Data Preparation
Retrieve the data from the database and split it into the model's inputs and output. Notice that the identifier fields (`_id`, `teamId`, `matchId`) are of no use to the model, so they are excluded. Beyond that, every candidate target column describes the outcome we are trying to predict, so all of them must be hidden from the model's inputs. Finally, we set aside a portion of the data right away and reserve it for the final testing of our model.

In [4]:
# No connection arguments are given, so pymongo's defaults apply.
client = pymongo.MongoClient()
db = client.soccerdb

# Fetch every document of the `results` collection into a DataFrame.
# The projection excludes the three identifier fields from each document.
data = pd.DataFrame(
    list(
        db['results'].find(
            {},
            {'_id': 0, 'teamId': 0, 'matchId': 0},
        )
    )
)

In [11]:
# Candidate prediction targets. 'type' is compared against the 'type' field of
# each hyper-parameter entry to decide which algorithms are run for a target.
targets = [
    {'name': target_name, 'type': target_type}
    for target_name, target_type in (
        ('didScoreInSecondHalf', 'categorical'),
        ('finalScore', 'numerical'),
        ('isWinner', 'categorical'),
        ('numSecondHalfGoals', 'numerical'),
    )
]

def get_input_output(data, target_col, hidden_info):
    """Separate a frame into model inputs and the column to predict.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the features and the target column.
    target_col : str
        Name of the column returned as the output.
    hidden_info : list of str
        Columns that must not reach the model; they are removed from the
        inputs together with ``target_col``.

    Returns
    -------
    tuple
        ``(inputs, output)``: ``data`` without the hidden/target columns,
        and the ``target_col`` column. ``data`` itself is not modified.
    """
    output = data[target_col]
    excluded = hidden_info + [target_col]
    inputs = data.drop(columns=excluded)
    return inputs, output

# Show the numbered menu of candidate targets.
print('Which target do you have in mind?\n')
target_names = [x['name'] for x in targets]
for i, itm in enumerate(target_names):
    print('{}. {}'.format(i, itm.title()))

# Read the menu choice. A negative number would otherwise index from the end
# of the list and silently select the wrong target, so reject anything that is
# not one of the printed menu numbers.
target_num = int(input('>>>'))
if not 0 <= target_num < len(targets):
    raise ValueError(
        'Target number must be between 0 and {}, got {}'.format(
            len(targets) - 1, target_num))
desired_target = target_names[target_num]
operation_type = targets[target_num]['type']

# All target columns are passed as hidden info, so none of them ends up in the
# model inputs; only the chosen one is returned as y.
X, y = get_input_output(data, desired_target, target_names)
# NOTE(review): the scaler is fitted on every row here, before the later
# train/test split, so hold-out rows influence the scaling statistics.
X = StandardScaler().fit_transform(X)

Which target do you have in mind?

0. Didscoreinsecondhalf
1. Finalscore
2. Iswinner
3. Numsecondhalfgoals


>>> 3


In [12]:
# Shuffle with a fixed seed, then hold out 10% of the rows for final testing.
# train_test_split shuffles again by default, so it needs its own seed;
# without it the hold-out set differs on every run despite the seeded shuffle.
X, y = shuffle(X, y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0)

In [13]:
def train_and_validate(algorithm, params, reporting_scores,
                       fold=5):
    """Grid-search one estimator with cross-validation and score it on the hold-out set.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables, so the train/test split cell must have been run
    before this function is called.

    Parameters
    ----------
    algorithm : str
        Short estimator name: ``'svc'``, ``'svr'``, ``'dtc'``, ``'dtr'`` or
        ``'mlp'``.
    params : object
        Hyper-parameter grid, passed to ``GridSearchCV`` unchanged.
    reporting_scores : iterable of str
        Metric names, each ``'f1'`` or ``'accuracy'``. The first one drives
        the grid search's model selection; every one is evaluated on the
        hold-out set and reported.
    fold : int, default 5
        Passed to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    dict
        ``'algorithm'``, the best hyper-parameters found by the search, and
        one entry per requested metric holding its hold-out score.

    Raises
    ------
    ValueError
        If ``algorithm`` or any metric name is unknown, or if
        ``reporting_scores`` is empty.
    """
    algorithms = {
        'svc': SVC,
        'svr': SVR,
        'dtc': DecisionTreeClassifier,
        'dtr': DecisionTreeRegressor,
        'mlp': MLPClassifier,
    }

    scores = {
        'f1': f1_score,
        'accuracy': accuracy_score,
    }

    # Fail early with a readable message instead of a bare KeyError/IndexError
    # surfacing from the lookups below.
    if algorithm not in algorithms:
        raise ValueError(
            'Unknown algorithm {!r}; expected one of {}'.format(
                algorithm, sorted(algorithms)))

    metric_names = list(reporting_scores)
    if not metric_names:
        raise ValueError('reporting_scores must name at least one metric')
    unknown_metrics = [name for name in metric_names if name not in scores]
    if unknown_metrics:
        raise ValueError(
            'Unknown metric(s) {}; expected a subset of {}'.format(
                unknown_metrics, sorted(scores)))

    # Map each metric name to its scorer; dicts keep insertion order, so the
    # report columns follow the order the caller asked for.
    scorers = {name: make_scorer(scores[name]) for name in metric_names}

    # Only the first requested metric is used to pick the best parameters.
    clf = GridSearchCV(algorithms[algorithm](), params, cv=fold,
                       scoring=scorers[metric_names[0]])
    clf.fit(X_train, y_train)

    results = {'algorithm': algorithm}
    results.update(clf.best_params_)

    # Report every requested metric on the hold-out set.
    for name, scorer in scorers.items():
        results[name] = scorer(clf, X_test, y_test)
    return results


In [14]:
# NOTE(review): accuracy and f1 are classification metrics, yet this same list
# is passed on when operation_type is 'numerical' - confirm that is intended.
metrics = ['accuracy', 'f1']

all_results = []
for alg in all_params:
    spec = all_params[alg]
    # Only run the algorithms whose declared type matches the chosen target,
    # i.e. no classifiers for a numerical target and vice versa.
    if spec['type'] == operation_type:
        params = spec['params']
        all_results.append(train_and_validate(alg, params, metrics))

# One row per algorithm that was run.
df = pd.DataFrame(all_results)

In [15]:
# Display the results table: one row per algorithm with its best
# hyper-parameters and its scores on the hold-out set.
df