# Fantasy league characterization

## Data load

In [1]:
import pandas as pd
import matplotlib.pylab as plt
import warnings
import numpy as np


# Notebook-wide display setting: raise pandas' 'display.max_rows' option to 500.
pd.set_option('display.max_rows', 500)

## Data augmentation

In [2]:
# Raw inputs.
stats = pd.read_csv('./Data/stats.csv')
players = pd.read_csv('./Data/players.csv')

# Flag the 'T (A)' / 'T (C)' roles, then fold them into the plain 'A' / 'C' roles.
players['is_T'] = players['ruolo'].isin(['T (A)', 'T (C)'])
players['ruolo'] = players['ruolo'].replace({'T (A)': 'A', 'T (C)': 'C'})

# Season totals: average score multiplied by the number of games played.
# Only the magic-voto variant treats missing values as 0.
players['w_media_magic_voto'] = (
    players['media_magic_voto'].fillna(0).mul(players['partite_giocate'].fillna(0))
)
players['w_media_punti'] = players['media_punti'].mul(players['partite_giocate'])

# Exponential transform of the quotation, scaled by w_factor.
w_factor = 0.10
players['forcast_quotazione'] = np.exp(players['quotazione'] * w_factor)

# Expose the row position as an explicit 'code' column.
players = players.reset_index().rename(columns={'index': 'code'})

# Ordinal season index: 0 for '2014-15' up to 5 for '2019-20'.
players['index_yyss'] = players['yyss'].replace(
    {season: position for position, season in enumerate(
        ['2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20'])}
)

players['giocatore'] = players['giocatore'].str.lower()

# ROI = average points per quotation unit, as a percentage, filled season by season.
players['roi'] = np.nan
yyss = list(players['yyss'].unique())
for ys in yyss:
    indeces = players.index[players['yyss'] == ys]
    players.loc[indeces, 'roi'] = players['media_punti'] / players['quotazione'] * 100
# Shift every ROI by |min ROI| so that the values are non-negative.
players['roi'] = players['roi'].add(abs(players['roi'].min()))

## Best Team, Uncapped

In [3]:
# Best uncapped solution: for one season, pick the top players of each role
# ranked by w_media_magic_voto, with no budget constraint.
ys = '2019-20'
columns = ['squadra', 'partite_giocate', 'giocatore', 'quotazione', 'forcast_quotazione',
           'media_voto', 'media_magic_voto', 'media_punti', 'roi',
           'w_media_magic_voto', 'w_media_punti']
s1920 = players[players['yyss'] == ys]


def _top_by_role(season_df, role, n, cols):
    """Return the ``n`` rows of ``role`` with the highest w_media_magic_voto.

    Rows with a missing w_media_magic_voto are excluded. The result is sorted
    in ascending order, so the best player is the last row.
    """
    eligible = season_df[(season_df['ruolo'] == role)
                         & season_df['w_media_magic_voto'].notna()]
    return eligible[cols].sort_values('w_media_magic_voto').tail(n)


# Squad composition: 3 P, 8 D, 8 C, 6 A.
best_p = _top_by_role(s1920, 'P', 3, columns)
best_d = _top_by_role(s1920, 'D', 8, columns)
best_c = _top_by_role(s1920, 'C', 8, columns)
best_a = _top_by_role(s1920, 'A', 6, columns)

# pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0);
# row order and original index labels are preserved.
best = pd.concat([best_p, best_d, best_c, best_a])
best


Unnamed: 0,squadra,partite_giocate,giocatore,quotazione,forcast_quotazione,media_voto,media_magic_voto,media_punti,roi,w_media_magic_voto,w_media_punti
3018,milan,36.0,donnarumma g.,25.0,12.182494,6.29,5.41,-31.6,2523.6,194.76,-1137.6
2773,udinese,38.0,musso j.,20.0,7.389056,6.38,5.18,-45.6,2422.0,196.84,-1732.8
3117,lazio,38.0,strakosha t.,21.0,8.16617,6.31,5.26,-39.9,2460.0,199.88,-1516.2
3228,atalanta,34.0,djimsiti b.,14.0,4.0552,6.08,6.3,7.5,2703.571429,214.2,255.0
3233,juventus,35.0,bonucci l.,16.0,4.953032,6.18,6.44,9.1,2706.875,225.4,318.5
3218,milan,33.0,hernandez t.,22.0,9.025013,6.3,6.96,21.8,2749.090909,229.68,719.4
3109,lazio,36.0,acerbi f.,18.0,6.049647,6.18,6.43,9.0,2700.0,231.48,324.0
2989,inter,34.0,de vrij s.,24.0,11.023176,6.35,6.83,16.3,2717.916667,232.22,554.2
2839,fiorentina,37.0,milenkovic n.,19.0,6.685894,6.02,6.51,18.2,2745.789474,240.87,673.4
3125,verona,36.0,faraoni d.,20.0,7.389056,6.12,6.75,22.7,2763.5,243.0,817.2


## Cocca Index

`w_media_magic_voto` is the feature obtained by multiplying the average magic voto by the number of games played.

Cocca_index = average point between the gradient of the regressand over 5 years and the gradient of the last two years. The more positive it is, the more the player is prone to improve.

In [4]:
from sklearn.preprocessing import Normalizer
import numpy as np
from sklearn.linear_model import LinearRegression
import seaborn as sns

def loo_prediction(regressor, X, y, forecast_index=6):
    """Leave-last-out check plus two forecasts for one player's history.

    The same ``regressor`` object is refit three times, in this order:

    1. on every observation except the last one, to predict the held-out last
       observation and measure the relative error of that prediction;
    2. on every observation, to forecast the value at ``forecast_index``;
    3. on the last two observations only, to forecast the same point.

    Parameters
    ----------
    regressor : object
        Estimator exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)`` returning a 2-D array-like.
    X : list of [x]
        One single-element list per observation; the last element is the one
        that is held out.
    y : list of [value]
        Targets, same layout and order as ``X``.
    forecast_index : number, optional
        x value at which the two forecasts are evaluated. Defaults to 6, the
        value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(err, past_allys_value, past_2ys_value, predicted_X, predicted_Y)``.
        All five are NaN when ``y`` is a scalar or holds fewer than three
        observations.
    """
    # Guard first: indexing X[-1] / y[-1] on an empty history would raise, and
    # len() is not defined for a scalar.
    if np.isscalar(y) or len(y) < 3:
        return np.nan, np.nan, np.nan, np.nan, np.nan

    held_out_x, held_out_y = X[-1][0], y[-1][0]

    # 1) Fit without the last observation and predict it.
    reg = regressor.fit(X[:-1], y[:-1])
    predicted_Y = reg.predict([[held_out_x]])
    # Relative error; becomes inf/NaN when the prediction is exactly 0.
    err = (abs(held_out_y - predicted_Y) / abs(predicted_Y))[0][0]

    # 2) Forecast from the full history.
    full_regression = regressor.fit(X, y)
    past_allys_value = full_regression.predict([[forecast_index]])[0][0]

    # 3) Forecast from the last two observations. The slice must be [-2:];
    #    [-2:-1] selects a single point, which gives a constant fit, not a trend.
    past_2ys = regressor.fit(X[-2:], y[-2:])
    past_2ys_value = past_2ys.predict([[forecast_index]])[0][0]

    return err, past_allys_value, past_2ys_value, held_out_x, predicted_Y[0][0]


def format_player_history(df):
    """Turn a player's rows into regression inputs ordered by season.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'index_yyss' and 'w_media_magic_voto'.

    Returns
    -------
    X : list of [index_yyss]
        One single-element list per row, in ascending 'index_yyss' order.
    y : list of [w_media_magic_voto]
        Targets aligned with ``X``.
    values_list : numpy.ndarray
        The two selected columns as an array, in the same order.
    """
    # Sort by season so that the last row is the most recent one. The stable
    # sort keeps the input order among rows sharing the same season index.
    history = df[['index_yyss', 'w_media_magic_voto']].sort_values(
        'index_yyss', kind='mergesort')

    values_list = history.values
    X = [[season] for season in history['index_yyss'].values.tolist()]
    y = [[score] for score in history['w_media_magic_voto'].values]

    return X, y, values_list

def plot_scatter_CI(players_df, reg_df, player_name):
    """Plot a player's w_media_magic_voto per season with three regression lines.

    Parameters
    ----------
    players_df : pandas.DataFrame
        Must contain 'giocatore', 'index_yyss' and 'w_media_magic_voto'.
    reg_df : pandas.DataFrame
        Currently unused; kept so that existing calls keep working.
    player_name : str
        Pattern passed to ``str.contains`` on the 'giocatore' column.

    The three fits drawn are: all seasons ('Full Reg'), all seasons but the
    last one ('N-1 Regr') and the last two seasons ('Last 2Ys reg').
    """
    # Keep the rows in season order: head(n-1) and tail(2) below rely on it.
    df = (players_df[players_df.giocatore.str.contains(player_name)]
          .sort_values(by='index_yyss'))

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.regplot(x="index_yyss", y="w_media_magic_voto", data=df, ax=ax, label='Full Reg')
    sns.regplot(x="index_yyss", y="w_media_magic_voto", data=df.head(len(df) - 1), ax=ax,
                label='N-1 Regr')
    sns.regplot(x="index_yyss", y="w_media_magic_voto", data=df.tail(2), ax=ax,
                label='Last 2Ys reg')

    # The former bare ax.scatter() call was dropped: without x and y it raises
    # TypeError, and regplot already draws the data points.
    ax.set_title(player_name)
    ax.legend()

  import pandas.util.testing as tm


In [5]:
# One row per 2019-20 player, indexed by name, plus empty columns for the
# regression outputs.
dataset = s1920.set_index('giocatore')

new_columns = ['err', 'past_allys_value', 'past_2ys_value', 'predicted_X', 'predicted_Y']
for column in new_columns:
    dataset.insert(len(dataset.columns), column, np.nan)

regressor = LinearRegression()
players_list = s1920.giocatore.unique().tolist()
for player_name in players_list:
    # Exact name match: str.contains() treats the name as a regex and matches
    # substrings (the '.' is a wildcard), which mixes different players' rows.
    # Sorting by season makes the last row the most recent season, which is
    # what loo_prediction holds out.
    player_history = (players[players.giocatore == player_name]
                      .sort_values('index_yyss', kind='mergesort'))
    X, y = format_player_history(player_history)[:2]
    dataset.loc[player_name, new_columns] = loo_prediction(regressor, X, y)



  linalg.lstsq(X, y)


In [6]:
# Keep players whose leave-last-out error exists and is below 10e5 (i.e. 1e6),
# and whose two-year forecast is non-zero (it is the CI denominator).
err_is_usable = dataset['err'].notna() & (dataset['err'] < 10e5)
has_baseline = dataset['past_2ys_value'] != 0

column2print = ['code', 'squadra'] + new_columns
valid_pred = dataset[err_is_usable & has_baseline].sort_values('err')[column2print]

# CI: relative gap between the full-history forecast and the two-year forecast.
valid_pred['CI'] = (
    valid_pred['past_allys_value']
    .sub(valid_pred['past_2ys_value'])
    .div(valid_pred['past_2ys_value'])
)

In [7]:
# Load the per-role suggestion files and stack them into one frame.
files = ['attaccanti.txt', 'centrocampisti.txt', 'difensori.txt', 'portieri.txt']
data_path = './Data/Grippi/%s'

frames = []
for file in files:
    temp = pd.read_csv(data_path % file)
    # Role letter = first character of the file name, e.g. 'attaccanti.txt' -> 'A'.
    temp['ruolo'] = file.replace('.txt', '')[0].upper()
    temp['giocatore'] = temp['giocatore'].str.lower()
    frames.append(temp)

# A single pd.concat replaces DataFrame.append in the loop (removed in pandas 2.0
# and quadratic); row order and original index labels are preserved.
consigli_df = pd.concat(frames)
consigli_df.shape

(490, 3)

In [8]:
# Bring 'giocatore' back as a column and drop the last three characters of each
# name before joining on it.
valid_df_mod = valid_pred.reset_index()
valid_df_mod['giocatore'] = valid_df_mod['giocatore'].str[:-3]

# Left join: every suggestion row is kept; CI is NaN where no player matches.
ci_by_player = valid_df_mod[['giocatore', 'CI']]
consigli_df_ci = consigli_df.merge(ci_by_player, on='giocatore', how='left')

consigli_df_ci.to_csv(data_path % 'consigli_df_CI.csv', sep=';', index=False)