# README #

Bu dosya değişik kaynaklardan topladığımız veriyi bir araya getirerek model datasını oluşturur.

Veri kaynakları:
- 1- Premier League maç bazlı verisi -> 2013-2014 yılından 2017-2018 yılına kadar.
- 2- Takım Değerleri -> Transfermarkt'tan toplanılan yine sezon bazlı takım ortalama oyuncu değerleri (Mil Euro bazında)
- 3- Takım FIFA İstatistikleri -> FIFA'daki sezon bazlı takım istatistikleri (ATT -> Atak , DEF -> Defans, MID -> Midfield, OVR -> Overall)


> Bu kaynakları okuttuktan sonra ne yapıyoruz?
> - Sezonları dikkate alarak her şeyi tek bir tabloda birleştiriyoruz.
> - "TARGET" kolonunu yaratıyoruz. Bu kolon ev sahibi takım yendiyse 1, beraberlikse 2, deplasman yendiyse 3 oluyor.
> - Daha sonra kimin kazanacağını gösterebilecek "feature"ları yaratıyoruz. Feature'ları yaratmak için gerekli fonksiyonlar feature_dictionary klasörünün altındaki feature_generation_functions'un içindeki her bir .py dosyası. Bu fonksiyonlar elimizdeki tabloyu kullanarak, örneğin, iki takımın son üç maç ortalama gollerini hesaplıyor. Daha sonra bu "feature"lar aynı şekilde ana tablomuza ekleniyor.

> En son, elimize geçen tabloyu, raw_training_data klasöründe raw_training_data.csv olarak kaydediyoruz. RAW dememin sebebi: elimizdeki tablo şu an her şeyi kapsıyor, takımları, hakemleri vesaire. Modellemede bu kolonlara ihtiyacımız olmayacak. 

Buraya kadar okuduysanız training_test_data_kernel.ipynb dosyasına geçebilirsiniz.

In [1]:
import pandas as pd
import numpy as np
import datetime
import os

In [2]:
# Read game-by-game data for each season
df1 = pd.read_csv('./dataset/game_data/season-1314_csv.csv')
df2 = pd.read_csv('./dataset/game_data/season-1415_csv.csv')
df3 = pd.read_csv('./dataset/game_data/season-1516_csv.csv')
df4 = pd.read_csv('./dataset/game_data/season-1617_csv.csv')
df5 = pd.read_csv('./dataset/game_data/season-1718_csv.csv')

# Read Market Values Data
team_values=pd.read_csv('./dataset/team_values/team_values.csv')
team_values.set_index('Team',inplace=True)

# Read Team Fifa Stats Data
team_fifa_stats = pd.read_csv('./dataset/team_fifa_stats/team_fifa_stats.csv')
team_fifa_stats.set_index('Orj_Team',inplace=True)

# Concatenate data frames and get rid of old ones
main = pd.concat([df1,df2,df3,df4,df5],axis=0)
del df1, df2, df3, df4, df5

In [81]:
# Create TARGET variable which is equal to 1 if Home team wins, 2 if there is a draw, and 3 if Away team wins
main['TARGET'] = (main.FTHG > main.FTAG)*1+((main.FTHG == main.FTAG)*1).replace(1,2)+((main.FTHG < main.FTAG)*1).replace(1,3)
main.reset_index(inplace=True)

In [21]:
# Run all the functions needed for feature generation
file_list = [e for e in os.listdir('./feature_dictionary/fetaure_generation_functions') if e[-3:] == '.py']
for file in file_list:
    exec(open('./feature_dictionary/fetaure_generation_functions/'+file).read())

In [22]:
train_df = main.copy()

In [23]:
train_df['HOME_LG_GOALS_L1W']=train_df.apply(func_HOME_LG_GOALS_L1W,axis=1)
train_df['HOME_LG_GOALS_L2W']=train_df.apply(func_HOME_LG_GOALS_L2W,axis=1)
train_df['HOME_LG_GOALS_L3W']=train_df.apply(func_HOME_LG_GOALS_L3W,axis=1)
train_df['HOME_LG_GOALS_L3W_COMB']=train_df.HOME_LG_GOALS_L1W+train_df.HOME_LG_GOALS_L2W+train_df.HOME_LG_GOALS_L3W

In [24]:
train_df['HOME_LG_GOALS_H_L1W']=train_df.apply(func_HOME_LG_GOALS_H_L1W,axis=1)
train_df['HOME_LG_GOALS_H_L2W']=train_df.apply(func_HOME_LG_GOALS_H_L2W,axis=1)
train_df['HOME_LG_GOALS_H_L3W']=train_df.apply(func_HOME_LG_GOALS_H_L3W,axis=1)
train_df['HOME_LG_GOALS_H_L3W_COMB']=train_df.HOME_LG_GOALS_H_L1W+train_df.HOME_LG_GOALS_H_L2W+train_df.HOME_LG_GOALS_H_L3W

In [25]:
train_df['HOME_LG_GOALS_A_L1W']=train_df.apply(func_HOME_LG_GOALS_A_L1W,axis=1)
train_df['HOME_LG_GOALS_A_L2W']=train_df.apply(func_HOME_LG_GOALS_A_L2W,axis=1)
train_df['HOME_LG_GOALS_A_L3W']=train_df.apply(func_HOME_LG_GOALS_A_L3W,axis=1)
train_df['HOME_LG_GOALS_A_L3W_COMB']=train_df.HOME_LG_GOALS_A_L1W+train_df.HOME_LG_GOALS_A_L2W+train_df.HOME_LG_GOALS_A_L3W

In [26]:
train_df['HOME_LG_SHOTS_L1W']=train_df.apply(func_HOME_LG_SHOTS_L1W,axis=1)
train_df['HOME_LG_SHOTS_L2W']=train_df.apply(func_HOME_LG_SHOTS_L2W,axis=1)
train_df['HOME_LG_SHOTS_L3W']=train_df.apply(func_HOME_LG_SHOTS_L3W,axis=1)
train_df['HOME_LG_SHOTS_L3W_COMB']=train_df.HOME_LG_SHOTS_L1W+train_df.HOME_LG_SHOTS_L2W+train_df.HOME_LG_SHOTS_L3W

In [27]:
train_df['HOME_LG_SHOTS_H_L1W']=train_df.apply(func_HOME_LG_SHOTS_H_L1W,axis=1)
train_df['HOME_LG_SHOTS_H_L2W']=train_df.apply(func_HOME_LG_SHOTS_H_L2W,axis=1)
train_df['HOME_LG_SHOTS_H_L3W']=train_df.apply(func_HOME_LG_SHOTS_H_L3W,axis=1)
train_df['HOME_LG_SHOTS_H_L3W_COMB']=train_df.HOME_LG_SHOTS_H_L1W+train_df.HOME_LG_SHOTS_H_L2W+train_df.HOME_LG_SHOTS_H_L3W

In [28]:
train_df['HOME_LG_SHOTS_A_L1W']=train_df.apply(func_HOME_LG_SHOTS_A_L1W,axis=1)
train_df['HOME_LG_SHOTS_A_L2W']=train_df.apply(func_HOME_LG_SHOTS_A_L2W,axis=1)
train_df['HOME_LG_SHOTS_A_L3W']=train_df.apply(func_HOME_LG_SHOTS_A_L3W,axis=1)
train_df['HOME_LG_SHOTS_A_L3W_COMB']=train_df.HOME_LG_SHOTS_A_L1W+train_df.HOME_LG_SHOTS_A_L2W+train_df.HOME_LG_SHOTS_A_L3W

In [29]:
train_df['HOME_LG_SHOTS_OT_L1W']=train_df.apply(func_HOME_LG_SHOTS_OT_L1W,axis=1)
train_df['HOME_LG_SHOTS_OT_L2W']=train_df.apply(func_HOME_LG_SHOTS_OT_L2W,axis=1)
train_df['HOME_LG_SHOTS_OT_L3W']=train_df.apply(func_HOME_LG_SHOTS_OT_L3W,axis=1)
train_df['HOME_LG_SHOTS_OT_L3W_COMB']=train_df.HOME_LG_SHOTS_OT_L1W+train_df.HOME_LG_SHOTS_OT_L2W+train_df.HOME_LG_SHOTS_OT_L3W

In [30]:
train_df['HOME_LG_SHOTS_OT_H_L1W']=train_df.apply(func_HOME_LG_SHOTS_OT_H_L1W,axis=1)
train_df['HOME_LG_SHOTS_OT_H_L2W']=train_df.apply(func_HOME_LG_SHOTS_OT_H_L2W,axis=1)
train_df['HOME_LG_SHOTS_OT_H_L3W']=train_df.apply(func_HOME_LG_SHOTS_OT_H_L3W,axis=1)
train_df['HOME_LG_SHOTS_OT_H_L3W_COMB']=train_df.HOME_LG_SHOTS_OT_H_L1W+train_df.HOME_LG_SHOTS_OT_H_L2W+train_df.HOME_LG_SHOTS_OT_H_L3W

In [31]:
train_df['HOME_LG_SHOTS_OT_A_L1W']=train_df.apply(func_HOME_LG_SHOTS_OT_A_L1W,axis=1)
train_df['HOME_LG_SHOTS_OT_A_L2W']=train_df.apply(func_HOME_LG_SHOTS_OT_A_L2W,axis=1)
train_df['HOME_LG_SHOTS_OT_A_L3W']=train_df.apply(func_HOME_LG_SHOTS_OT_A_L3W,axis=1)
train_df['HOME_LG_SHOTS_OT_A_L3W_COMB']=train_df.HOME_LG_SHOTS_OT_A_L1W+train_df.HOME_LG_SHOTS_OT_A_L2W+train_df.HOME_LG_SHOTS_OT_A_L3W

In [32]:
train_df['AWAY_LG_GOALS_L1W']=train_df.apply(func_AWAY_LG_GOALS_L1W,axis=1)
train_df['AWAY_LG_GOALS_L2W']=train_df.apply(func_AWAY_LG_GOALS_L2W,axis=1)
train_df['AWAY_LG_GOALS_L3W']=train_df.apply(func_AWAY_LG_GOALS_L3W,axis=1)
train_df['AWAY_LG_GOALS_L3W_COMB']=train_df.AWAY_LG_GOALS_L1W+train_df.AWAY_LG_GOALS_L2W+train_df.AWAY_LG_GOALS_L3W

In [33]:
train_df['AWAY_LG_GOALS_H_L1W']=train_df.apply(func_AWAY_LG_GOALS_H_L1W,axis=1)
train_df['AWAY_LG_GOALS_H_L2W']=train_df.apply(func_AWAY_LG_GOALS_H_L2W,axis=1)
train_df['AWAY_LG_GOALS_H_L3W']=train_df.apply(func_AWAY_LG_GOALS_H_L3W,axis=1)
train_df['AWAY_LG_GOALS_H_L3W_COMB']=train_df.AWAY_LG_GOALS_H_L1W+train_df.AWAY_LG_GOALS_H_L2W+train_df.AWAY_LG_GOALS_H_L3W

In [34]:
train_df['AWAY_LG_GOALS_A_L1W']=train_df.apply(func_AWAY_LG_GOALS_A_L1W,axis=1)
train_df['AWAY_LG_GOALS_A_L2W']=train_df.apply(func_AWAY_LG_GOALS_A_L2W,axis=1)
train_df['AWAY_LG_GOALS_A_L3W']=train_df.apply(func_AWAY_LG_GOALS_A_L3W,axis=1)
train_df['AWAY_LG_GOALS_A_L3W_COMB']=train_df.AWAY_LG_GOALS_A_L1W+train_df.AWAY_LG_GOALS_A_L2W+train_df.AWAY_LG_GOALS_A_L3W

In [35]:
train_df['AWAY_LG_SHOTS_L1W']=train_df.apply(func_AWAY_LG_SHOTS_L1W,axis=1)
train_df['AWAY_LG_SHOTS_L2W']=train_df.apply(func_AWAY_LG_SHOTS_L2W,axis=1)
train_df['AWAY_LG_SHOTS_L3W']=train_df.apply(func_AWAY_LG_SHOTS_L3W,axis=1)
train_df['AWAY_LG_SHOTS_L3W_COMB']=train_df.AWAY_LG_SHOTS_L1W+train_df.AWAY_LG_SHOTS_L2W+train_df.AWAY_LG_SHOTS_L3W

In [36]:
train_df['AWAY_LG_SHOTS_H_L1W']=train_df.apply(func_AWAY_LG_SHOTS_H_L1W,axis=1)
train_df['AWAY_LG_SHOTS_H_L2W']=train_df.apply(func_AWAY_LG_SHOTS_H_L2W,axis=1)
train_df['AWAY_LG_SHOTS_H_L3W']=train_df.apply(func_AWAY_LG_SHOTS_H_L3W,axis=1)
train_df['AWAY_LG_SHOTS_H_L3W_COMB']=train_df.AWAY_LG_SHOTS_H_L1W+train_df.AWAY_LG_SHOTS_H_L2W+train_df.AWAY_LG_SHOTS_H_L3W

In [37]:
train_df['AWAY_LG_SHOTS_A_L1W']=train_df.apply(func_AWAY_LG_SHOTS_A_L1W,axis=1)
train_df['AWAY_LG_SHOTS_A_L2W']=train_df.apply(func_AWAY_LG_SHOTS_A_L2W,axis=1)
train_df['AWAY_LG_SHOTS_A_L3W']=train_df.apply(func_AWAY_LG_SHOTS_A_L3W,axis=1)
train_df['AWAY_LG_SHOTS_A_L3W_COMB']=train_df.AWAY_LG_SHOTS_A_L1W+train_df.AWAY_LG_SHOTS_A_L2W+train_df.AWAY_LG_SHOTS_A_L3W

In [38]:
train_df['AWAY_LG_SHOTS_OT_L1W']=train_df.apply(func_AWAY_LG_SHOTS_OT_L1W,axis=1)
train_df['AWAY_LG_SHOTS_OT_L2W']=train_df.apply(func_AWAY_LG_SHOTS_OT_L2W,axis=1)
train_df['AWAY_LG_SHOTS_OT_L3W']=train_df.apply(func_AWAY_LG_SHOTS_OT_L3W,axis=1)
train_df['AWAY_LG_SHOTS_OT_L3W_COMB']=train_df.AWAY_LG_SHOTS_OT_L1W+train_df.AWAY_LG_SHOTS_OT_L2W+train_df.AWAY_LG_SHOTS_OT_L3W

In [39]:
train_df['AWAY_LG_SHOTS_OT_H_L1W']=train_df.apply(func_AWAY_LG_SHOTS_OT_H_L1W,axis=1)
train_df['AWAY_LG_SHOTS_OT_H_L2W']=train_df.apply(func_AWAY_LG_SHOTS_OT_H_L2W,axis=1)
train_df['AWAY_LG_SHOTS_OT_H_L3W']=train_df.apply(func_AWAY_LG_SHOTS_OT_H_L3W,axis=1)
train_df['AWAY_LG_SHOTS_OT_H_L3W_COMB']=train_df.AWAY_LG_SHOTS_OT_H_L1W+train_df.AWAY_LG_SHOTS_OT_H_L2W+train_df.AWAY_LG_SHOTS_OT_H_L3W

In [40]:
train_df['AWAY_LG_SHOTS_OT_A_L1W']=train_df.apply(func_AWAY_LG_SHOTS_OT_A_L1W,axis=1)
train_df['AWAY_LG_SHOTS_OT_A_L2W']=train_df.apply(func_AWAY_LG_SHOTS_OT_A_L2W,axis=1)
train_df['AWAY_LG_SHOTS_OT_A_L3W']=train_df.apply(func_AWAY_LG_SHOTS_OT_A_L3W,axis=1)
train_df['AWAY_LG_SHOTS_OT_A_L3W_COMB']=train_df.AWAY_LG_SHOTS_OT_A_L1W+train_df.AWAY_LG_SHOTS_OT_A_L2W+train_df.AWAY_LG_SHOTS_OT_A_L3W

In [41]:
train_df['HOME_MKT_VAL']=train_df.apply(func_HOME_MKT_VAL,axis=1)
train_df['AWAY_MKT_VAL']=train_df.apply(func_AWAY_MKT_VAL,axis=1)
train_df['ABS_DIFF_MKT_VAL']=abs(train_df.HOME_MKT_VAL-train_df.AWAY_MKT_VAL)

In [42]:
train_df['HOME_L0S_FIFA_SCR_ATK']=train_df.apply(func_HOME_L0S_FIFA_SCR_ATK, axis=1)
train_df['HOME_L0S_FIFA_SCR_ORT']=train_df.apply(func_HOME_L0S_FIFA_SCR_ORT, axis=1)
train_df['HOME_L0S_FIFA_SCR_DEF']=train_df.apply(func_HOME_L0S_FIFA_SCR_DEF, axis=1)
train_df['HOME_L0S_FIFA_SCR_OVR']=train_df.apply(func_HOME_L0S_FIFA_SCR_OVR, axis=1)

In [43]:
train_df['AWAY_L0S_FIFA_SCR_ATK']=train_df.apply(func_AWAY_L0S_FIFA_SCR_ATK, axis=1)
train_df['AWAY_L0S_FIFA_SCR_ORT']=train_df.apply(func_AWAY_L0S_FIFA_SCR_ORT, axis=1)
train_df['AWAY_L0S_FIFA_SCR_DEF']=train_df.apply(func_AWAY_L0S_FIFA_SCR_DEF, axis=1)
train_df['AWAY_L0S_FIFA_SCR_OVR']=train_df.apply(func_AWAY_L0S_FIFA_SCR_OVR, axis=1)

In [44]:
train_df['ABS_DIFF_L0S_FIFA_SCR_ATK']=abs(train_df.HOME_L0S_FIFA_SCR_ATK-train_df.AWAY_L0S_FIFA_SCR_ATK)
train_df['ABS_DIFF_L0S_FIFA_SCR_ORT']=abs(train_df.HOME_L0S_FIFA_SCR_ORT-train_df.AWAY_L0S_FIFA_SCR_ORT)
train_df['ABS_DIFF_L0S_FIFA_SCR_DEF']=abs(train_df.HOME_L0S_FIFA_SCR_DEF-train_df.AWAY_L0S_FIFA_SCR_DEF)
train_df['ABS_DIFF_L0S_FIFA_SCR_OVR']=abs(train_df.HOME_L0S_FIFA_SCR_OVR-train_df.AWAY_L0S_FIFA_SCR_OVR)

In [45]:
train_df['H2H_TOT_CORNERS_HA_L1G']=train_df.apply(func_H2H_TOT_CORNERS_HA_L1G, axis=1)
train_df['H2H_TOT_CORNERS_HA_L2G']=train_df.apply(func_H2H_TOT_CORNERS_HA_L2G, axis=1)
train_df['H2H_TOT_CORNERS_HA_L3G']=train_df.apply(func_H2H_TOT_CORNERS_HA_L3G, axis=1)
train_df['H2H_TOT_CORNERS_HA_L3G_AVG']=np.round(np.nanmean([train_df.H2H_TOT_CORNERS_HA_L1G,train_df.H2H_TOT_CORNERS_HA_L2G,train_df.H2H_TOT_CORNERS_HA_L3G],axis=0),2)

  after removing the cwd from sys.path.


In [46]:
train_df['H2H_TOT_SHOTS_HA_L1G']=train_df.apply(func_H2H_TOT_SHOTS_HA_L1G, axis=1)
train_df['H2H_TOT_SHOTS_HA_L2G']=train_df.apply(func_H2H_TOT_SHOTS_HA_L2G, axis=1)
train_df['H2H_TOT_SHOTS_HA_L3G']=train_df.apply(func_H2H_TOT_SHOTS_HA_L3G, axis=1)
train_df['H2H_TOT_SHOTS_HA_L3G_AVG']=np.round(np.nanmean([train_df.H2H_TOT_SHOTS_HA_L1G,train_df.H2H_TOT_SHOTS_HA_L2G,train_df.H2H_TOT_SHOTS_HA_L3G],axis=0),2)

  after removing the cwd from sys.path.


In [78]:
train_df['Season']=train_df.apply(findSeason,axis=1)

In [80]:
train_df.to_csv('./raw_training_data/raw_training_data.csv',index=False)