# A Regression of Home Scoring for MLS data 

## Data Wrangling 

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns

%matplotlib inline 
sns.set()


df = pd.read_csv('matches.csv')

list_of_columns = [
 'league',
 'home_score',
 'home_possessionPct',
 'away_possessionPct',
 'home_shotsSummary',
 'away_shotsSummary',
 'home_foulsCommitted',
 'away_foulsCommitted',
 'home_yellowCards',
 'away_yellowCards',
 'home_redCards',
 'away_redCards',
 'home_offsides',
 'away_offsides',
 'home_wonCorners',
 'away_wonCorners',
 'home_saves',
 'away_saves'
]

df = df.loc[:, list_of_columns]

df = df.dropna()

# unpacking shot summaries 
def shots(shot_summary):
    interm_string = shot_summary.strip(')')
    entries = interm_string.split('(')
    return int(entries[0])

def shots_on_goal(shot_summary):
    interm_string = shot_summary.strip(')')
    entries = interm_string.split('(')
    return int(entries[1])

df['home_shots'] = df['home_shotsSummary'].apply(shots)
df['home_shots_on_goal'] = df['home_shotsSummary'].apply(shots_on_goal)
df = df.drop('home_shotsSummary', axis=1)

df['away_shots'] = df['away_shotsSummary'].apply(shots)
df['away_shots_on_goal'] = df['away_shotsSummary'].apply(shots_on_goal)
df = df.drop('away_shotsSummary', axis=1)


# Formatting the percentages columns 
def strip_perc_symbol(perc):
    return float(perc.strip('%'))

df['home_possessionPct'] = df['home_possessionPct'].apply(strip_perc_symbol)
df['away_possessionPct'] = df['away_possessionPct'].apply(strip_perc_symbol)


# Creating train and test sets 

from sklearn.model_selection import StratifiedShuffleSplit

df_restr = df[(df['away_possessionPct'] != 0) & (df['home_possessionPct']!=0)]

df_restr['home_shots_cat'] = pd.cut(df_restr['home_shots'], bins=[-1, 10, 20, np.inf], labels=['low', 'medium', 'high'])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for train_index, test_index in split.split(df_restr, df_restr["home_shots_cat"]):
    strat_train_set = df_restr.iloc[train_index]
    strat_test_set = df_restr.iloc[test_index]

matches = strat_train_set.copy()
matches.drop(['home_shots_cat', 'home_score'], axis=1, inplace=True)




matches_num = matches.drop('league', axis=1)
matches_labels = strat_train_set['home_score'].copy()



# Making Transformers and Pipelines 

def safe_quotient(x,y):
    if y != 0:
        return x/y 
    else:
        return 0 

f_safe_quotient = np.vectorize(safe_quotient)



from sklearn.base import BaseEstimator, TransformerMixin

home_shots_on_goal_ix = -3
home_shots_ix = -4
away_shots_on_goal_ix = -1
away_shots_ix = -2
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_accuracy=True):
        self.add_accuracy = add_accuracy
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        home_accuracy = f_safe_quotient(X[:, home_shots_on_goal_ix], X[:, home_shots_ix])
        away_accuracy = f_safe_quotient(X[:, away_shots_on_goal_ix], X[:, away_shots_ix])
        if self.add_accuracy:
            return np.c_[X, home_accuracy, away_accuracy]
        else:
            return X




from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer 

num_pipeline = Pipeline([
    ('inputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()), 
    ('std_scaler', StandardScaler())
])



from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_attribs = list(matches_num)
cat_attribs = ['league']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs), 
    ('cat', OneHotEncoder(), cat_attribs)
])

matches_prepared = full_pipeline.fit_transform(matches)



  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_restr['home_shots_cat'] = pd.cut(df_restr['home_shots'], bins=[-1, 10, 20, np.inf], labels=['low', 'medium', 'high'])


In [2]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(matches_prepared, matches_labels)

LinearRegression()

In [3]:
from sklearn.metrics import mean_squared_error
matches_predictions = lin_reg.predict(matches_prepared)
lin_mse = mean_squared_error(matches_labels, matches_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.5247322976836689