In [1]:
import sqlite3
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn import metrics
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVR
import math
import joblib
import warnings
warnings.filterwarnings('ignore')

def drop_players_columns(match_df):
    match_df = match_df[match_df.columns.drop(list(match_df.filter(regex='player')))]
    return match_df

def keep_certain_bookeepers(match_df):
    # In this function we will keep only the Bet365 and Bwin collumns
    # because they are the most popular bookeepers
    
    col_list = match_df.columns.tolist()
    cols_to_use = col_list[:len(col_list)-24]
    match_df = match_df[cols_to_use]
    return match_df

def create_class_label(match_df):
    #Home Win = 1
    #Home Defeat = -1
    #Draw = 0
    conditions = [(match_df['home_team_goal'] > match_df['away_team_goal']),
                  (match_df['home_team_goal'] < match_df['away_team_goal'])]

    choices = [1, -1]

    match_df['result'] = np.select(conditions, choices, default=0)
    return match_df

def calculate_total_goals(match_df):
    #This function calculates total gols fromthe sum
    #of home and away goals. We do that because currently
    #the goals are in xml format which is not helpful
    
    match_df = match_df.drop(['goal'],axis=1)
    match_df['total_goals'] = match_df['home_team_goal'] + match_df['away_team_goal']
    return match_df

def parse_xml_columns(match_df):
    match_df['home_shoton'] = 0
    match_df['away_shoton'] = 0
    match_df['home_shotoff'] = 0
    match_df['away_shotoff'] = 0
    match_df['home_fouls'] = 0
    match_df['away_fouls'] = 0
    match_df['home_corner'] = 0
    match_df['away_corner'] = 0
    match_df['home_ycard'] = 0
    match_df['away_ycard'] = 0
    match_df['home_rcard'] = 0
    match_df['away_rcard'] = 0
    match_df['home_cross'] = 0
    match_df['away_cross'] = 0
    match_df['home_possession'] = 0.0
    match_df['away_possession'] = 0.0
    
    for index, row in match_df.iterrows():
        home_team = row['home_team_api_id']
        away_team = row['away_team_api_id'] 
        
        root_shoton = ET.fromstring(row['shoton'])
        root_shotoff = ET.fromstring(row['shotoff'])
        root_foulcommit = ET.fromstring(row['foulcommit'])
        root_corner = ET.fromstring(row['corner'])
        root_card = ET.fromstring(row['card'])
        root_cross = ET.fromstring(row['cross'])
        root_possession = ET.fromstring(row['possession'])
        
        for value in root_shoton.findall('value'):
            team = value.find('team')
            if team is None:
                continue
            if (team.text == str(home_team)):
                match_df.at[index,'home_shoton'] = match_df.at[index,'home_shoton']+1
            if (team.text == str(away_team)):
                match_df.at[index,'away_shoton'] = match_df.at[index,'away_shoton']+1
                
        for value in root_shotoff.findall('value'):
            team = value.find('team')
            if team is None:
                continue
            if (team.text == str(home_team)):
                match_df.at[index,'home_shotoff'] = match_df.at[index,'home_shotoff']+1
            if (team.text == str(away_team)):
                match_df.at[index,'away_shotoff'] = match_df.at[index,'away_shotoff']+1
                
        for value in root_foulcommit.findall('value'):
            team = value.find('team')
            if team is None:
                continue
            if (team.text == str(home_team)):
                match_df.at[index,'home_fouls'] = match_df.at[index,'home_fouls']+1
            if (team.text == str(away_team)):
                match_df.at[index,'away_fouls'] = match_df.at[index,'away_fouls']+1
                
        for value in root_corner.findall('value'):
            team = value.find('team')
            if team is None:
                continue
            if (team.text == str(home_team)):
                match_df.at[index,'home_corner'] = match_df.at[index,'home_corner']+1
            if (team.text == str(away_team)):
                match_df.at[index,'away_corner'] = match_df.at[index,'away_corner']+1
                
        if root_card.findall('value'):           
            for value in root_card.findall('value'):
                card_type = value.find('comment').text
                team = value.find('team')
                if team is None:
                    continue
                if (team.text == str(home_team)):
                    if card_type in ['r', 'y2']:
                        match_df.at[index,'home_rcard'] = match_df.at[index,'home_rcard']+1
                    if card_type == 'y':
                        match_df.at[index,'home_ycard'] = match_df.at[index,'home_ycard']+1
                if (team.text == str(away_team)):
                    if card_type in ['r', 'y2']:
                        match_df.at[index,'away_rcard'] = match_df.at[index,'away_rcard']+1
                    if card_type == 'y':
                        match_df.at[index,'away_ycard'] = match_df.at[index,'away_ycard']+1
                        
        for value in root_cross.findall('value'):
            cross_type = value.find('type').text
            team = value.find('team')
            if team is None:
                continue
            if cross_type != 'cross':
                continue
            if (team.text == str(home_team)):
                match_df.at[index,'home_cross'] = match_df.at[index,'home_cross']+1
            if (team.text == str(away_team)):
                match_df.at[index,'away_cross'] = match_df.at[index,'away_cross']+1
        value_list_len = float(len(root_possession.findall('value')))
        if value_list_len != 0:
            home_pos_sum = 0
            away_pos_sum = 0
            for value in root_possession.findall('value'):
                homepos = value.find('homepos')
                awaypos = value.find('awaypos')
                if (homepos is None) or (awaypos is None):
                    value_list_len = value_list_len - 1
                    continue
                home_pos_sum = home_pos_sum + int(homepos.text)
                away_pos_sum = away_pos_sum + int(awaypos.text)
            match_df.at[index,'home_possession'] = round((home_pos_sum/value_list_len),2)
            match_df.at[index,'away_possession'] = round((away_pos_sum/value_list_len),2)
        else:
            match_df.at[index,'home_possession'] = 50
            match_df.at[index,'away_possession'] = 50
    match_df = match_df.drop(['shoton','shotoff','foulcommit','card','cross','corner','possession'], axis = 1)
            
    return match_df

def calculate_ma_ca_elo(match_df,teams_list):
    team_df_dict = dict()
    for item in teams_list:
        team_df = match_df.loc[(match_df['home_team_api_id'] == item) | (match_df['away_team_api_id'] == item)]
        team_df['goals'] = 0
        team_df['shotons'] = 0
        team_df['shotoffs'] = 0
        team_df['fouls'] = 0
        team_df['corners'] = 0
        team_df['ycards'] = 0
        team_df['rcards'] = 0
        team_df['crosses'] = 0
        team_df['possessions'] = 0.0
        team_df['elo_rate'] = 0
        for index, row in team_df.iterrows():
            if row['home_team_api_id'] == item:
                team_df.at[index,'goals'] = row['home_team_goal']
                team_df.at[index,'shotons'] = row['home_shoton']
                team_df.at[index,'shotoffs'] = row['home_shotoff']
                team_df.at[index,'fouls'] = row['home_fouls']
                team_df.at[index,'corners'] = row['home_corner']
                team_df.at[index,'ycards'] = row['home_ycard']
                team_df.at[index,'rcards'] = row['home_rcard']
                team_df.at[index,'crosses'] = row['home_cross']
                team_df.at[index,'possessions'] = row['home_possession']
                team_df.at[index,'elo_rate'] = row['home_elo']
            if row['away_team_api_id'] == item:
                team_df.at[index,'goals'] = row['away_team_goal']
                team_df.at[index,'shotons'] = row['away_shoton']
                team_df.at[index,'shotoffs'] = row['away_shotoff']
                team_df.at[index,'fouls'] = row['away_fouls']
                team_df.at[index,'corners'] = row['away_corner']
                team_df.at[index,'ycards'] = row['away_ycard']
                team_df.at[index,'rcards'] = row['away_rcard']
                team_df.at[index,'crosses'] = row['away_cross']
                team_df.at[index,'possessions'] = row['away_possession']
                team_df.at[index,'elo_rate'] = row['away_elo']
        #print(team_df[['home_team_api_id','home_team_goal','away_team_goal','goals']])
        for col in ['goals','shotons','shotoffs', 'fouls', 'corners','ycards','rcards','crosses','possessions']:
            #prepei na ta shiftarw ola kata ena wste to ma kai ca olwn ton prwhgoumenwn agwnwn na erthei ston shmerino
            team_df['ma_'+col] = team_df[col].rolling(window=5).mean()
            team_df['ca_'+col] = team_df[col].expanding().mean()
            team_df['ma_pow2_'+col] = np.power((team_df['ma_'+col]),2)
            team_df['ca_pow2_'+col] = np.power((team_df['ca_'+col]),2)
            team_df['ma_sqrt_'+col]=team_df['ma_'+col]**(1/2)
            team_df['ca_sqrt_'+col]=team_df['ca_'+col]**(1/2)
            
        team_df_dict.update({str(item): team_df})
        
    return team_df_dict


def shift_ma_ca_elo_columns(team_df_dict):
    ca_ma_col_list = list()
    for team in team_df_dict:
        for col in team_df_dict[team].columns:
            if ('ca_' in col) or ('ma_' in col) or ('elo_rate' in col):
                if col not in ca_ma_col_list:
                    ca_ma_col_list.append(col)
                team_df_dict[team][col] = team_df_dict[team][col].shift(1)
    
    return team_df_dict, ca_ma_col_list

def add_ca_ma_elo_col_to_main_df(team_shifted_dict, match_df,ca_ma_col_names):
    for team in team_shifted_dict:
        for index, team_row in team_shifted_dict[team].iterrows():
            if team == str(team_row['home_team_api_id']):
                for col in ca_ma_col_names:
                    match_df.at[index,'home_'+col] = team_row[col]
            if team == str(team_row['away_team_api_id']):
                for col in ca_ma_col_names:
                    match_df.at[index,'away_'+col] = team_row[col]

    return match_df

def create_unified_columns(match_df, is_home):

    if is_home:
        match_df = match_df.rename(columns={"home_team_goal": "goals"})
        match_df = match_df.drop(columns=['away_team_goal'])
        match_df['homegame'] = 1
        for col in match_df.columns:
            if ("away_ma" in col) or ("away_ca" in col) or ("away_elo_rate" in col) or ("pr_perc_away_win" in col) or ("away_strength" in col):
                new_col = col.replace("away","opponent")
                match_df = match_df.rename(columns={col: new_col})
            if ("home_ma" in col) or ("home_ca" in col) or ("home_elo_rate" in col) or ("pr_perc_home_win" in col) or ("home_strength" in col):
                new_col = col.replace("home", "team")
                match_df = match_df.rename(columns={col: new_col})
    if not is_home:
        match_df = match_df.rename(columns={"away_team_goal": "goals"})
        match_df = match_df.drop(columns=['home_team_goal'])
        match_df['homegame'] = 0
        match_df['goals_diff'] = - match_df['goals_diff']
        for col in match_df.columns:
            if ("home_ma" in col) or ("home_ca" in col) or ("home_elo_rate" in col) or ("pr_perc_home_win" in col) or ("home_strength" in col):
                new_col = col.replace("home","opponent")
                match_df = match_df.rename(columns={col: new_col})
            if ("away_ma" in col) or ("away_ca" in col) or ("away_elo_rate" in col) or ("pr_perc_away_win" in col) or ("away_strength" in col):
                new_col = col.replace("away", "team")
                match_df = match_df.rename(columns={col: new_col})
    return match_df

#### Calculating elo ####

def calculate_elo_rating(elo_df,team_df):
    k=30
    team_elo_dict = dict()
    elo_df['home_elo'] = 0
    elo_df['away_elo'] = 0
    
    temp_df = pd.merge(team_df,elo_df, left_on='team_api_id',right_on = "away_team_api_id", how='inner')
    team_list = temp_df.groupby(['team_api_id']).mean().reset_index()['team_api_id'].tolist()
    
    for team in team_list:
        team_elo_dict[team]=1000
    
    for index, row in elo_df.iterrows():
        home_team = row['home_team_api_id']
        away_team = row['away_team_api_id']
        match_result = row['result']
        home_rating = team_elo_dict[home_team]
        away_rating = team_elo_dict[away_team]
        
        
        prob_home = 1.0 * 1.0 / (1 + 1.0 * math.pow(10, 1.0 * (away_rating - home_rating) / 400))
        prob_away = 1.0 * 1.0 / (1 + 1.0 * math.pow(10, 1.0 * (home_rating - away_rating) / 400))
        
        if match_result == 1:
            home_result=1
            away_result=0
        elif match_result == -1:
            home_result=0
            away_result=1
        else:
            home_result=0.5
            away_result=0.5
        
        home_rating_new = round(home_rating + k*(home_result - prob_home))
        away_rating_new = round(away_rating + k*(away_result - prob_away))
        
        if (home_rating_new < 0 or home_rating_new < 100):
            home_rating_new = 100
        if (away_rating_new < 0 or away_rating_new < 100):
            away_rating_new = 100
        
        elo_df.at[index,'home_elo'] = home_rating_new
        elo_df.at[index,'away_elo'] = away_rating_new
        
        team_elo_dict[home_team]= home_rating_new
        team_elo_dict[away_team]= away_rating_new
        
    return elo_df


def calculate_previous_games_results_goal_diff(pr_games_df, team_df):
    
    pr_games_df['pr_perc_home_win'] = 0.0
    pr_games_df['pr_perc_draw'] = 0.0
    pr_games_df['pr_perc_away_win'] = 0.0
    pr_games_df['goals_diff'] = 0
    temp_df = pd.merge(team_df,pr_games_df, left_on='team_api_id',right_on = "away_team_api_id", how='inner')
    team_list = temp_df.groupby(['team_api_id']).mean().reset_index()['team_api_id'].tolist()
    
    temp_team_list = team_list.copy()
    for team in team_list:
        temp_team_list.remove(team)
        for opponent in temp_team_list:
            temp_df = pr_games_df.loc[((pr_games_df['home_team_api_id'] == team) & (pr_games_df['away_team_api_id'] == opponent))|
                                      ((pr_games_df['home_team_api_id'] == opponent) & (pr_games_df['away_team_api_id'] == team))]
            
            if temp_df.empty:
                continue
            for index, row in temp_df.iterrows():
                keep_win_dict = {team: 0, opponent: 0}
                keep_goals_dict = {team: 0, opponent: 0}
                team_percent = 0.0
                opponent_percent = 0.0
                draw_percent = 0.0
                subset_temp_df = temp_df.loc[:index].copy()
                subset_temp_df= subset_temp_df.drop(index)
                
                if not subset_temp_df.empty:
                    
                    for sub_index, sub_row in subset_temp_df.iloc[-5:].iterrows():
                        
                        if sub_row['result'] == 1:
                            keep_win_dict[sub_row['home_team_api_id']] = keep_win_dict[sub_row['home_team_api_id']] + 1
                        if sub_row['result'] == -1:
                            keep_win_dict[sub_row['away_team_api_id']] = keep_win_dict[sub_row['away_team_api_id']] + 1
                        ### for the goals ###    
                        keep_goals_dict[sub_row['home_team_api_id']] = keep_goals_dict[sub_row['home_team_api_id']] + sub_row['home_team_goal']
                        keep_goals_dict[sub_row['away_team_api_id']] = keep_goals_dict[sub_row['away_team_api_id']] + sub_row['away_team_goal']
                        #####################
                    team_percent = keep_win_dict[team] / len(subset_temp_df.iloc[-5:])
                    opponent_percent = keep_win_dict[opponent] / len(subset_temp_df.iloc[-5:])
                    draw_percent = 1 - (team_percent + opponent_percent)
                    
                    if team == row['home_team_api_id']:
                        pr_games_df.at[index,'pr_perc_home_win'] = team_percent
                        pr_games_df.at[index,'pr_perc_away_win'] = opponent_percent
                        pr_games_df.at[index,'pr_perc_draw'] = draw_percent
                    
                    if team == row['away_team_api_id']:
                        pr_games_df.at[index,'pr_perc_home_win'] = opponent_percent
                        pr_games_df.at[index,'pr_perc_away_win'] = team_percent
                        pr_games_df.at[index,'pr_perc_draw'] = draw_percent
                        
                    #### for the goals ###
                    if team == row['home_team_api_id']:
                        if keep_goals_dict[team] > keep_goals_dict[opponent]:
                            pr_games_df.at[index,'goals_diff'] = keep_goals_dict[team] - keep_goals_dict[opponent]
                        if keep_goals_dict[team] < keep_goals_dict[opponent]:
                            pr_games_df.at[index,'goals_diff'] = keep_goals_dict[team] - keep_goals_dict[opponent]
                            
                    if team == row['away_team_api_id']:
                        if keep_goals_dict[team] > keep_goals_dict[opponent]:
                            pr_games_df.at[index,'goals_diff'] = keep_goals_dict[opponent] - keep_goals_dict[team]
                        if keep_goals_dict[team] < keep_goals_dict[opponent]:
                            pr_games_df.at[index,'goals_diff'] = keep_goals_dict[opponent] - keep_goals_dict[team]
                    
                        
                               
    return pr_games_df

def add_strength_points_to_df(strength_df,joined_df):
    strength_df['home_strength'] = np.nan
    strength_df['away_strength'] = np.nan
    for index, row in strength_df.iterrows():
        home_team = row['home_team_api_id']
        away_team = row['away_team_api_id']
        current_season = row['season']
        
        for i in range(1,11):
            change_current_season_format="until-"+"-".join([str(int(year)-i) for year in current_season.split("/")])

            try:
                home_strength=int(joined_df.loc[(joined_df['team_api_id'] == home_team) & (joined_df['Season'] == change_current_season_format)]['Strength'])
                break
            except:
                home_strength = 0
                continue
        for i in range(1,11):
            change_current_season_format="until-"+"-".join([str(int(year)-i) for year in current_season.split("/")])
            try:
                away_strength=int(joined_df.loc[(joined_df['team_api_id'] == away_team) & (joined_df['Season'] == change_current_season_format)]['Strength'])
                break
            except:
                away_strength = 0
                continue

        strength_df.at[index,'home_strength'] = home_strength
        strength_df.at[index,'away_strength'] = away_strength
    return strength_df
        

In [2]:
# Create the connection
cnx = sqlite3.connect(r'database.sqlite')
# England id = 1729
# Germany id = 7809
match_df_england_raw = pd.read_sql_query("SELECT * FROM Match WHERE country_id == 1729", cnx)



match_df_england = match_df_england_raw.copy()
team_df = pd.read_sql_query("SELECT * FROM Team", cnx)
match_df_england = drop_players_columns(match_df_england)
match_df_england = keep_certain_bookeepers(match_df_england)

match_df_england = match_df_england.dropna()
match_df_england = create_class_label(match_df_england)
match_df_england = calculate_total_goals(match_df_england)
match_df_england = parse_xml_columns(match_df_england)

#### ELO RATING ####
match_df_for_elo = match_df_england.copy()
elo_df_w_ratings = calculate_elo_rating(match_df_for_elo,team_df)
match_df_england = elo_df_w_ratings
#########################################

###### Add previous game results and difference in goals ##############
match_df_for_previous_games = match_df_england.copy()
previous_games_results_df = calculate_previous_games_results_goal_diff(match_df_for_previous_games, team_df)
match_df_england = previous_games_results_df
#######################################################################

##### Strength Team Dataframe Joined #################
strength_df=pd.read_csv(r'strength_pl.csv')

strength_team_join=pd.merge(team_df,strength_df, left_on='team_long_name',right_on = "TeamName", how='inner')
match_df_for_strength = match_df_england.copy()
match_df_england=add_strength_points_to_df(match_df_for_strength,strength_team_join)
#################################################


team_df_england = pd.merge(team_df,match_df_england, left_on='team_api_id',right_on = "away_team_api_id", how='inner')
team_england_list = team_df_england.groupby(['team_api_id']).mean().reset_index()['team_api_id'].tolist()

team_df_dict = calculate_ma_ca_elo(match_df_england,team_england_list)
team_df_dict_shifted, ca_ma_col_names= shift_ma_ca_elo_columns(team_df_dict)


In [3]:
match_df_england_ca_ma = add_ca_ma_elo_col_to_main_df(team_df_dict_shifted, match_df_england,ca_ma_col_names)

In [4]:
match_df_england_final = match_df_england_ca_ma.dropna()


In [5]:
match_df_england_home = match_df_england_final.copy()
match_df_england_away = match_df_england_final.copy()


In [6]:
match_df_england_home = create_unified_columns(match_df_england_home, is_home = True)
match_df_england_away = create_unified_columns(match_df_england_away, is_home = False)


In [7]:
match_df_england_final_concat = pd.concat([match_df_england_home,match_df_england_away],ignore_index=True, sort=False)


In [8]:
match_df_england_train = match_df_england_final_concat.loc[(match_df_england_final_concat['season'] != '2014/2015') & (match_df_england_final_concat['season'] != '2015/2016')]
match_df_england_test = match_df_england_final_concat.loc[(match_df_england_final_concat['season'] == '2014/2015') | (match_df_england_final_concat['season'] == '2015/2016')]

In [9]:
X_train = match_df_england_train.iloc[:,36:]
Y_train = match_df_england_train['goals']
X_test = match_df_england_test.iloc[:,36:]
Y_test = match_df_england_test['goals']
sc=StandardScaler()
X_train_std=sc.fit_transform(X_train)
X_test_std=sc.transform(X_test)


In [10]:
#Backward Elimination
cols = list(X_train.columns)
pmax = 1
while (len(cols)>0):
    p= []
    X_1 = X_train[cols]
    X_1 = sm.add_constant(X_1)
    model = sm.OLS(Y_train,X_1).fit()
    p = pd.Series(model.pvalues.values[1:],index = cols)      
    pmax = np.max(p)
    feature_with_p_max = p.idxmax()
    if(pmax>0.05):
        cols.remove(feature_with_p_max)
    else:
        break
selected_features_BE = cols
print(selected_features_BE)

['pr_perc_draw', 'opponent_elo_rate', 'opponent_ma_goals', 'opponent_ma_sqrt_goals', 'opponent_ma_sqrt_shotons', 'opponent_ca_sqrt_shotoffs', 'opponent_ca_fouls', 'opponent_ma_pow2_fouls', 'opponent_ma_sqrt_fouls', 'opponent_ca_ycards', 'opponent_ca_sqrt_ycards', 'opponent_ca_crosses', 'opponent_ca_pow2_crosses', 'opponent_ca_sqrt_crosses', 'opponent_ca_pow2_possessions', 'opponent_ca_sqrt_possessions', 'team_elo_rate', 'team_ca_goals', 'team_ca_pow2_goals', 'team_ca_sqrt_goals', 'team_ma_sqrt_shotons', 'team_ma_pow2_shotoffs', 'team_ca_pow2_corners', 'team_ma_sqrt_corners', 'team_ca_possessions', 'homegame']


In [11]:
X_train_std_final=sc.fit_transform(X_train[selected_features_BE])
X_test_std_final=sc.transform(X_test[selected_features_BE])


In [12]:
###### Linear Reg ####

In [12]:
linreg = LinearRegression()
linreg.fit(X_train_std_final, Y_train.values.ravel())

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [13]:
y_pred = linreg.predict(X_test_std_final)

In [14]:
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test.values.ravel(), y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(Y_test.values.ravel(), y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test.values.ravel(), y_pred)))

Mean Absolute Error: 0.9141313077022921
Mean Squared Error: 1.3378917021292767
Root Mean Squared Error: 1.1566726858231229


In [None]:
###### XGBOOST ####

In [15]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
min_child_weight = [1, 5, 10]
gamma =[0.5, 1, 1.5, 2, 5]
subsample = [0.6, 0.8, 1.0]
colsample_bytree = [0.6, 0.8, 1.0]
max_depth = [int(x) for x in np.linspace(7, 100, num = 10)]
objective=['reg:squaredlogerror','reg:squarederror']
random_grid = {
        'gamma': gamma,
        'max_depth': max_depth,
        "min_child_weight":min_child_weight,
        "colsample_bytree": colsample_bytree,
        "n_estimators": n_estimators,
        "objective": objective
        }

xgbr = xgb.XGBRegressor()

random_search = RandomizedSearchCV(estimator=xgbr,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='neg_mean_absolute_error',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit the random search model
random_search.fit(X_train_std_final, Y_train.values.ravel())
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mae of a model with these hyperparameters is:")
print(random_search.best_score_)
print("The best regressor is: ")
print(random_search.best_estimator_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  2.7min finished


The best hyperparameters from Random Search are:
{'objective': 'reg:squaredlogerror', 'n_estimators': 60, 'min_child_weight': 5, 'max_depth': 7, 'gamma': 5, 'colsample_bytree': 0.8}

The mae of a model with these hyperparameters is:
-0.9304522464087004
The best regressor is: 
XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=5, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=7,
             min_child_weight=5, missing=nan, monotone_constraints=None,
             n_estimators=60, n_jobs=0, num_parallel_tree=1,
             objective='reg:squaredlogerror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)


In [16]:
##########Grid Search CV ########
n_estimators = [60]
min_child_weight = [1, 5, 10]
gamma =[ 2, 5,7]
subsample = [0.8, 1.0,1.2]
colsample_bytree = [ 0.8, 1.0,1.2]
max_depth= [4, 5,7,10]
objective = ['reg:squaredlogerror']
        

# Create the param grid
param_grid = {
        'gamma': gamma,
        'max_depth': max_depth,
        "min_child_weight":min_child_weight,
        "colsample_bytree": colsample_bytree,
        "n_estimators": n_estimators,
        "objective": objective
        }

# First create the base model to tune
xgbr = xgb.XGBRegressor()
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

grid_search = GridSearchCV(estimator=xgbr, 
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train_std_final, Y_train.values.ravel())
print("The best hyperparameters from Random Search are:")
print(grid_search.best_params_)
print("")
print("The mae of a model with these hyperparameters is:")
print(grid_search.best_score_)
print("The best regressor is: ")
print(grid_search.best_estimator_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 324 out of 324 | elapsed:  1.6min finished


The best hyperparameters from Random Search are:
{'colsample_bytree': 0.8, 'gamma': 2, 'max_depth': 4, 'min_child_weight': 10, 'n_estimators': 60, 'objective': 'reg:squaredlogerror'}

The mae of a model with these hyperparameters is:
-0.9018850296811808
The best regressor is: 
XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=2, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=4,
             min_child_weight=10, missing=nan, monotone_constraints=None,
             n_estimators=60, n_jobs=0, num_parallel_tree=1,
             objective='reg:squaredlogerror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)


In [17]:
###XGBOOST #####

xg_reg = xgb.XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=2, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=4,
             min_child_weight=10, monotone_constraints=None,
             n_estimators=60, n_jobs=0, num_parallel_tree=1,
             objective='reg:squaredlogerror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)
xg_reg.fit(X_train_std_final, Y_train.values.ravel())

y_pred = xg_reg.predict(X_test_std_final)

In [18]:
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test.values.ravel(), y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(Y_test.values.ravel(), y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test.values.ravel(), y_pred)))

Mean Absolute Error: 0.905198842347149
Mean Squared Error: 1.3808794674716272
Root Mean Squared Error: 1.1751082790413943


In [None]:
### Random Forest Regression ####

In [19]:
###### Random Search #####
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(7, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,4]
# Method of selecting samples for training each tree
bootstrap = [True,False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rfr = RandomForestRegressor(random_state=8)

# Definition of the random search
random_search = RandomizedSearchCV(estimator=rfr,
                                   param_distributions=random_grid,
                                   n_iter=10,
                                   scoring='neg_mean_absolute_error',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit the random search model
random_search.fit(X_train_std_final, Y_train.values.ravel())
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mae of a model with these hyperparameters is:")
print(random_search.best_score_)
print("The best regressor is: ")
print(random_search.best_estimator_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 11.5min finished


The best hyperparameters from Random Search are:
{'n_estimators': 700, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 58, 'bootstrap': True}

The mae of a model with these hyperparameters is:
-0.9316778607128452
The best regressor is: 
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=58, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=700, n_jobs=None, oob_score=False,
                      random_state=8, verbose=0, warm_start=False)


In [20]:
bootstrap = [True]
max_depth = [40, 50, 60,70]
max_features = ['auto']
min_samples_leaf = [4, 6]
min_samples_split = [5, 10, 15]
n_estimators = [700]

param_grid = {
    'bootstrap': bootstrap,
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators
}

# Create a base model
rfr = RandomForestRegressor(random_state=8)
# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rfr, 
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train_std_final, Y_train.values.ravel())
print("The best hyperparameters from Random Search are:")
print(grid_search.best_params_)
print("")
print("The mae of a model with these hyperparameters is:")
print(grid_search.best_score_)
print("The best regressor is: ")
print(grid_search.best_estimator_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 27.2min finished


The best hyperparameters from Random Search are:
{'bootstrap': True, 'max_depth': 40, 'max_features': 'auto', 'min_samples_leaf': 6, 'min_samples_split': 15, 'n_estimators': 700}

The mae of a model with these hyperparameters is:
-0.9180470451022872
The best regressor is: 
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=40, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=6,
                      min_samples_split=15, min_weight_fraction_leaf=0.0,
                      n_estimators=700, n_jobs=None, oob_score=False,
                      random_state=8, verbose=0, warm_start=False)


In [21]:

rfr = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=40, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=6,
                      min_samples_split=15, min_weight_fraction_leaf=0.0,
                      n_estimators=700, n_jobs=None, oob_score=False,
                      random_state=8, verbose=0, warm_start=False)
rfr.fit(X_train_std_final, Y_train.values.ravel())

y_pred = rfr.predict(X_test_std_final)

In [22]:
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test.values.ravel(), y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(Y_test.values.ravel(), y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test.values.ravel(), y_pred)))

Mean Absolute Error: 0.9232996265496296
Mean Squared Error: 1.362433038856878
Root Mean Squared Error: 1.1672330696381412


In [None]:
#### SVR #######

In [23]:
#### Random Search ######

# C
C = [.0001, .001, .01]

# gamma
gamma = [.0001, .001, .01, .1, 1, 10, 100]

# degree
degree = [1, 2, 3, 4, 5]

# kernel
kernel = ['linear', 'rbf', 'poly']

# Create the random grid
random_grid = {'C': C,
              'kernel': kernel,
              'gamma': gamma,
              'degree': degree
             }

svr = SVR()

# Definition of the random search
random_search = RandomizedSearchCV(estimator=svr,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='neg_mean_absolute_error',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit the random search model
random_search.fit(X_train_std_final, Y_train.values.ravel())
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mae of a model with these hyperparameters is:")
print(random_search.best_score_)
print("The best regressor is: ")
print(random_search.best_estimator_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  7.3min finished


The best hyperparameters from Random Search are:
{'kernel': 'linear', 'gamma': 0.0001, 'degree': 3, 'C': 0.01}

The mae of a model with these hyperparameters is:
-0.9125926026337945
The best regressor is: 
SVR(C=0.01, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0001,
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


In [24]:
###### Grid Search ######

# C
C = [.001, .01,1]

# gamma
gamma = [.0001, .001, .01]

# degree
degree = [2, 3, 4]

# Create the random grid
param_grid = [
  {'C': C, 'kernel':['linear']},
  {'C': C, 'kernel':['poly'], 'degree':degree},
  {'C': C, 'kernel':['rbf'], 'gamma':gamma}
]

# Create a base model
svr = SVR()
# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=svr, 
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train_std_final, Y_train.values.ravel())
print("The best hyperparameters from Random Search are:")
print(grid_search.best_params_)
print("")
print("The mae of a model with these hyperparameters is:")
print(grid_search.best_score_)
print("The best regressor is: ")
print(grid_search.best_estimator_)


Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  63 out of  63 | elapsed:   36.3s finished


The best hyperparameters from Random Search are:
{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}

The mae of a model with these hyperparameters is:
-0.8931900689810494
The best regressor is: 
SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.001,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


In [25]:
svr = SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.001,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
svr.fit(X_train_std_final, Y_train.values.ravel())
y_pred = svr.predict(X_test_std_final)

In [26]:
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test.values.ravel(), y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(Y_test.values.ravel(), y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test.values.ravel(), y_pred)))

Mean Absolute Error: 0.9008437659147246
Mean Squared Error: 1.332789928397513
Root Mean Squared Error: 1.154465213160411
