In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so diagnostics emitted by the later
# cells through the `warnings` module are hidden as well; consider narrowing it.
warnings.filterwarnings('ignore')

In [71]:
# Load the three player-statistics tables; the first CSV column becomes the index.
(
    train_home_player_statistics_df,
    train_away_player_statistics_df,
    test_away_player_statistics_df,
) = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './train_home_player_statistics_df.csv',
        './train_away_player_statistics_df.csv',
        './test_away_player_statistics_df.csv',
    )
)

# Training targets, read the same way.
train_scores = pd.read_csv('./Y_train.csv', index_col=0)

Let's explore the available features:

In [6]:
# Split the column names of the away test frame into two groups, purely by
# substring: names containing "season" and names containing "5".
features = test_away_player_statistics_df.keys()
seasons = [name for name in features if "season" in name]
matches = [name for name in features if "5" in name]

# Print every "5" column first, then every "season" column.
for name in matches + seasons:
    print(name)

print(len(seasons), " , ", len(matches))

PLAYER_ACCURATE_CROSSES_5_last_match_sum
PLAYER_ACCURATE_PASSES_5_last_match_sum
PLAYER_AERIALS_WON_5_last_match_sum
PLAYER_ASSISTS_5_last_match_sum
PLAYER_BIG_CHANCES_CREATED_5_last_match_sum
PLAYER_BIG_CHANCES_MISSED_5_last_match_sum
PLAYER_BLOCKED_SHOTS_5_last_match_sum
PLAYER_CAPTAIN_5_last_match_sum
PLAYER_CLEARANCES_5_last_match_sum
PLAYER_CLEARANCE_OFFLINE_5_last_match_sum
PLAYER_DISPOSSESSED_5_last_match_sum
PLAYER_DRIBBLED_ATTEMPTS_5_last_match_sum
PLAYER_DRIBBLED_PAST_5_last_match_sum
PLAYER_DUELS_LOST_5_last_match_sum
PLAYER_DUELS_WON_5_last_match_sum
PLAYER_ERROR_LEAD_TO_GOAL_5_last_match_sum
PLAYER_FOULS_5_last_match_sum
PLAYER_FOULS_DRAWN_5_last_match_sum
PLAYER_GOALKEEPER_GOALS_CONCEDED_5_last_match_sum
PLAYER_GOALS_5_last_match_sum
PLAYER_GOALS_CONCEDED_5_last_match_sum
PLAYER_HIT_WOODWORK_5_last_match_sum
PLAYER_INTERCEPTIONS_5_last_match_sum
PLAYER_KEY_PASSES_5_last_match_sum
PLAYER_MINUTES_PLAYED_5_last_match_sum
PLAYER_OFFSIDES_5_last_match_sum
PLAYER_OWN_GOALS_5_la

In [7]:
# Plain-text dump of both training player tables, away side first.
for player_table in (train_away_player_statistics_df, train_home_player_statistics_df):
    print(player_table)

        LEAGUE            TEAM_NAME    POSITION            PLAYER_NAME  \
ID                                                                       
0      Ligue 1  Olympique Marseille    defender           Lucas Perrin   
0      Ligue 1  Olympique Marseille  midfielder        Kevin Strootman   
0      Ligue 1  Olympique Marseille  goalkeeper            Yohann Pelé   
0      Ligue 1  Olympique Marseille    defender   Abdallah Ali Mohamed   
0      Ligue 1  Olympique Marseille         NaN       Nemanja Radonjic   
...        ...                  ...         ...                    ...   
12302  La Liga         FC Barcelona  goalkeeper  Marc-André ter Stegen   
12302  La Liga         FC Barcelona  midfielder           Alex Collado   
12302  La Liga         FC Barcelona    attacker           Lionel Messi   
12302  La Liga         FC Barcelona    attacker      Antoine Griezmann   
12302  La Liga         FC Barcelona  midfielder              Ansu Fati   

       PLAYER_ACCURATE_CROSSES_season

The problem: a lot of data is missing.

In [72]:
# Count the matches (IDs) whose away team has at least one player listed in
# each of the four positions.
# Only the (POSITION, ID) pairs matter for this count, so keep those two
# columns instead of summing every statistic column per group.
df = (
    train_away_player_statistics_df
    .reset_index()[["POSITION", "ID"]]
    .dropna(subset=["POSITION"])  # a player without a position cannot fill any position
    .drop_duplicates()
)
gb = df.set_index("ID").groupby("POSITION")
positions = ["attacker", "goalkeeper", "midfielder", "defender"]
# IDs present for both positions of each pair, then IDs present in both pairs.
m1 = np.intersect1d(gb.get_group(positions[0]).index, gb.get_group(positions[1]).index)
m2 = np.intersect1d(gb.get_group(positions[2]).index, gb.get_group(positions[3]).index)
print(np.intersect1d(m1, m2).shape[0])

5785


Fewer than 6k of the 12k matches have a complete away team (at least one player in each position). To tackle this problem, let's predict a player's missing position from the rest of their statistics:

In [33]:
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model

# Number of labelled players per position (first four columns of the count table).
print(train_home_player_statistics_df.groupby("POSITION").count().reset_index().iloc[:,:4])
print(train_away_player_statistics_df.groupby("POSITION").count().reset_index().iloc[:,:4])

# Keep only the players whose position is known. `.copy()` yields independent
# frames, so encoding POSITION below never writes into a slice of the raw data.
train_home_player = train_home_player_statistics_df.loc[train_home_player_statistics_df["POSITION"].notna()].copy()
train_away_player = train_away_player_statistics_df.loc[train_away_player_statistics_df["POSITION"].notna()].copy()

# Fit the encoder once on the labels of both sides so that home and away are
# guaranteed to share a single label -> integer mapping.
le = LabelEncoder()
le.fit(pd.concat((train_home_player["POSITION"], train_away_player["POSITION"])))
train_home_player["POSITION"] = le.transform(train_home_player["POSITION"])
train_away_player["POSITION"] = le.transform(train_away_player["POSITION"])

# Integer code -> position name, read from the fitted encoder instead of being
# typed by hand, so the two can never drift apart.
encoding = le.classes_.tolist()

train_player_data = pd.concat((train_home_player, train_away_player))
train_y = train_player_data["POSITION"]
# Drop the four leading columns (LEAGUE, TEAM_NAME, POSITION, PLAYER_NAME);
# everything after them is a player statistic.
train_player_data = train_player_data.iloc[:,4:]

X_train, X_test, y_train, y_test = model_selection.train_test_split(train_player_data, train_y, train_size=0.8, random_state=42)

lin_model = linear_model.LogisticRegression(C=1.0)

# Missing statistics are fed to the model as 0.
lin_model.fit(X_train.fillna(0.0), y_train)

y_pred = lin_model.predict(X_test.fillna(0.0))
predictions = y_pred
# accuracy_score takes (y_true, y_pred).
print("test : ", np.round(accuracy_score(y_test, predictions),4))

     POSITION  LEAGUE  TEAM_NAME  PLAYER_NAME
0    attacker   23691      23691        23691
1    defender   67746      67746        67746
2  goalkeeper   24298      24298        24298
3  midfielder   78524      78524        78524
     POSITION  LEAGUE  TEAM_NAME  PLAYER_NAME
0    attacker   23591      23591        23591
1    defender   67662      67662        67662
2  goalkeeper   24358      24358        24358
3  midfielder   78266      78266        78266
test :  0.7927


We get $\sim 80\%$ accuracy! That is not bad considering the simplicity of the model. Now let's see what we gained with this model.

In [74]:
# Fill in the missing POSITION values of both training frames, in place, with
# the classifier's predictions.
for player_df in (train_home_player_statistics_df, train_away_player_statistics_df):
    # Mask computed from the POSITION column alone, once per frame.
    missing_position = player_df["POSITION"].isna()
    if not missing_position.any():
        # Nothing to impute (e.g. the cell is being re-run).
        continue
    # Predict only for the rows that need it; statistics start at column 4,
    # and missing statistics are fed to the model as 0, as during training.
    missing_features = player_df.loc[missing_position].iloc[:, 4:].fillna(0.0)
    # `encoding` maps the model's integer classes back to position names.
    player_df.loc[missing_position, "POSITION"] = np.array(encoding)[lin_model.predict(missing_features)]

In [75]:
# Sanity check: prints whether any POSITION value is still missing, home then away.
for player_table in (train_home_player_statistics_df, train_away_player_statistics_df):
    print(player_table["POSITION"].isna().any())

False
False


In [76]:
# Count the matches (IDs) whose away team has at least one player listed in
# each of the four positions.
# Only the (POSITION, ID) pairs matter for this count, so keep those two
# columns instead of summing every statistic column per group.
df = (
    train_away_player_statistics_df
    .reset_index()[["POSITION", "ID"]]
    .dropna(subset=["POSITION"])  # a player without a position cannot fill any position
    .drop_duplicates()
)
gb = df.set_index("ID").groupby("POSITION")
positions = ["attacker", "goalkeeper", "midfielder", "defender"]
# IDs present for both positions of each pair, then IDs present in both pairs.
m1 = np.intersect1d(gb.get_group(positions[0]).index, gb.get_group(positions[1]).index)
m2 = np.intersect1d(gb.get_group(positions[2]).index, gb.get_group(positions[3]).index)
print(np.intersect1d(m1, m2).shape[0])

10834


## What about the public test data?

In [77]:
# Count the test matches (IDs) whose away team has at least one player listed
# in each of the four positions.
# Only the (POSITION, ID) pairs matter for this count, so keep those two
# columns instead of summing every statistic column per group.
df = (
    test_away_player_statistics_df
    .reset_index()[["POSITION", "ID"]]
    .dropna(subset=["POSITION"])  # a player without a position cannot fill any position
    .drop_duplicates()
)
gb = df.set_index("ID").groupby("POSITION")
positions = ["attacker", "goalkeeper", "midfielder", "defender"]
# IDs present for both positions of each pair, then IDs present in both pairs.
m1 = np.intersect1d(gb.get_group(positions[0]).index, gb.get_group(positions[1]).index)
m2 = np.intersect1d(gb.get_group(positions[2]).index, gb.get_group(positions[3]).index)
print(np.intersect1d(m1, m2).shape[0])

18939


In [78]:
# Fill in the missing POSITION values of the away test frame, in place, with
# the classifier's predictions, then report whether anything is still missing.
missing_position = test_away_player_statistics_df["POSITION"].isna()
if missing_position.any():
    # Predict only for the rows that need it. Statistics start at column 1 in
    # this frame (versus column 4 in the training frames).
    # NOTE(review): presumably the test file carries no LEAGUE / TEAM_NAME /
    # PLAYER_NAME columns - confirm the remaining columns line up with the
    # ones the model was trained on.
    missing_features = test_away_player_statistics_df.loc[missing_position].iloc[:, 1:].fillna(0.0)
    # `encoding` maps the model's integer classes back to position names.
    test_away_player_statistics_df.loc[missing_position, "POSITION"] = np.array(encoding)[lin_model.predict(missing_features)]
print(test_away_player_statistics_df["POSITION"].isna().any())

False


In [79]:
# Count the test matches (IDs) whose away team has at least one player listed
# in each of the four positions.
# Only the (POSITION, ID) pairs matter for this count, so keep those two
# columns instead of summing every statistic column per group.
df = (
    test_away_player_statistics_df
    .reset_index()[["POSITION", "ID"]]
    .dropna(subset=["POSITION"])  # a player without a position cannot fill any position
    .drop_duplicates()
)
gb = df.set_index("ID").groupby("POSITION")
positions = ["attacker", "goalkeeper", "midfielder", "defender"]
# IDs present for both positions of each pair, then IDs present in both pairs.
m1 = np.intersect1d(gb.get_group(positions[0]).index, gb.get_group(positions[1]).index)
m2 = np.intersect1d(gb.get_group(positions[2]).index, gb.get_group(positions[3]).index)
print(np.intersect1d(m1, m2).shape[0])

23943


We recover about 5k of the roughly 24k matches!

In [86]:
import pickle

# Persist the fitted position classifier, then read it back.
# The context managers close each file handle as soon as it is no longer
# needed, so the dump is fully written before the file is reopened.
with open("pos_model", "wb") as model_file:
    pickle.dump(lin_model, model_file)

# Only unpickle files you produced yourself: pickle.load can run arbitrary code.
with open("pos_model", "rb") as model_file:
    new_model = pickle.load(model_file)

## Feature extraction

In [67]:
# Collect the columns whose per-(POSITION, ID) sums are all exactly 0 for at
# least one position of the away training frame.
# NOTE(review): a column is listed as soon as ONE position has only zeros, even
# if it is non-zero for another position - confirm that is the intended
# meaning of "useless".
useless_fts = []
df = train_away_player_statistics_df.reset_index().groupby(["POSITION", "ID"], as_index=False).sum()
gb = df.set_index("ID").groupby("POSITION")
positions = ["attacker", "goalkeeper", "midfielder", "defender"]
for pos in positions:
    posdf = gb.get_group(pos)
    # One boolean per column: True when every row of this position equals 0.0.
    all_zero = posdf.eq(0.0).all()
    useless_fts.extend(all_zero.index[all_zero])

# The same column can be collected for several positions; print each name once.
print(np.unique(useless_fts))

['PLAYER_CAPTAIN_5_last_match_average' 'PLAYER_CAPTAIN_5_last_match_std'
 'PLAYER_CAPTAIN_season_average' 'PLAYER_CAPTAIN_season_std'
 'PLAYER_CLEARANCE_OFFLINE_5_last_match_average'
 'PLAYER_CLEARANCE_OFFLINE_5_last_match_std'
 'PLAYER_CLEARANCE_OFFLINE_5_last_match_sum'
 'PLAYER_LONG_BALLS_5_last_match_average'
 'PLAYER_LONG_BALLS_5_last_match_std' 'PLAYER_LONG_BALLS_5_last_match_sum'
 'PLAYER_LONG_BALLS_WON_5_last_match_average'
 'PLAYER_LONG_BALLS_WON_5_last_match_std'
 'PLAYER_LONG_BALLS_WON_5_last_match_sum'
 'PLAYER_LONG_BALLS_WON_season_average' 'PLAYER_LONG_BALLS_WON_season_std'
 'PLAYER_LONG_BALLS_WON_season_sum' 'PLAYER_LONG_BALLS_season_average'
 'PLAYER_LONG_BALLS_season_std' 'PLAYER_LONG_BALLS_season_sum'
 'PLAYER_PENALTIES_SAVED_5_last_match_average'
 'PLAYER_PENALTIES_SAVED_5_last_match_std'
 'PLAYER_PENALTIES_SAVED_5_last_match_sum'
 'PLAYER_PENALTIES_SAVED_season_average'
 'PLAYER_PENALTIES_SAVED_season_std' 'PLAYER_PENALTIES_SAVED_season_sum'
 'PLAYER_PENALTIES_WON_5