# Importing the necessary libraries and the dataset

In [1]:
import pandas as pd
import re

In [2]:
# Read the raw club_games CSV into `df` and display its (rows, columns) shape.
club_games_csv = '/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/CSV_progetto/club_games.csv'
df = pd.read_csv(club_games_csv)
df.shape

(130432, 11)

# Checking the types of the columns

In [3]:
# Show the dtype pandas assigned to every column of the loaded frame.
df.dtypes

game_id                    int64
club_id                    int64
own_goals                  int64
own_position             float64
own_manager_name          object
opponent_id                int64
opponent_goals             int64
opponent_position        float64
opponent_manager_name     object
hosting                   object
is_win                     int64
dtype: object

# Creating a function to filter only the European characters

In [4]:
# Allowed alphabet: ASCII letters and digits, whitespace, common punctuation,
# the € and £ signs, and the code points U+00C0..U+017F (accented Latin letters).
# Compiled once so the pattern is not looked up again for every cell of a column.
_EUROPEAN_CHARS_RE = re.compile(r'^[a-zA-Z0-9\s\.,;:!?\-\'\"()€£\u00C0-\u017F]+$')


def contains_only_european_chars(s):
    """Return True when `s` is a non-empty string made only of allowed characters.

    Parameters
    ----------
    s : object
        The value to check, normally a string cell of a DataFrame column.

    Returns
    -------
    bool
        True if every character of `s` belongs to the allowed alphabet.
        False for an empty string, for a string containing any other
        character (e.g. Cyrillic), and for any non-string value such as
        None or NaN (previously these raised TypeError inside `re.match`).
    """
    # Missing values reach this function as float NaN / None when the column
    # has not been cast to str; treat them as "not matching" instead of crashing.
    if not isinstance(s, str):
        return False
    return _EUROPEAN_CHARS_RE.match(s) is not None


# Changing the type of the columns, then applying the function to filter only the European characters

In [5]:
# Cast both manager-name columns to str so the regex check never receives a
# non-string cell. NOTE: astype('str') turns missing names into the literal
# string 'nan', which passes the character filter and is no longer counted
# by isna().
df['own_manager_name'] = df['own_manager_name'].astype('str')
df['opponent_manager_name'] = df['opponent_manager_name'].astype('str')
columns_to_filter = ['own_manager_name','opponent_manager_name']

# Keep a row only if EVERY listed column passes the character check.
# The previous loop reassigned df_filtered from the unfiltered `df` on each
# iteration, so only the last column's filter was actually applied.
keep_row = pd.Series(True, index=df.index)
for column in columns_to_filter:
    keep_row &= df[column].apply(contains_only_european_chars)
df_filtered = df[keep_row]

In [6]:
# Row/column count of the frame after the character filter.
df_filtered.shape

(130430, 11)

# Checking the columns with missing values

In [7]:
# Count the missing (NaN) values in each column.
df_filtered.isna().sum()

game_id                      0
club_id                      0
own_goals                    0
own_position             38908
own_manager_name             0
opponent_id                  0
opponent_goals               0
opponent_position        38908
opponent_manager_name        0
hosting                      0
is_win                       0
dtype: int64

# Dropping the columns with missing values

In [8]:
# Remove the two league-position columns (the only ones reported with missing
# values). drop() without inplace returns a new frame, so no explicit copy is
# needed before rebinding the name.
df_filtered = df_filtered.drop(columns=['own_position', 'opponent_position'])


In [9]:
# Display the frame to confirm the position columns are gone.
df_filtered

Unnamed: 0,game_id,club_id,own_goals,own_manager_name,opponent_id,opponent_goals,opponent_manager_name,hosting,is_win
0,2221751,431,1,Lutz Göttling,60,2,Christian Streich,Home,0
1,2221755,83,3,Ralph Hasenhüttl,4795,0,Tomas Oral,Home,1
2,2222597,3725,2,Stanislav Cherchesov,232,1,Unai Emery,Home,1
3,2222627,2696,0,Andrey Kobelev,4128,2,Rustem Khuzin,Home,0
4,2222658,2410,0,Leonid Slutski,121,2,Dan Petrescu,Home,0
...,...,...,...,...,...,...,...,...,...
130427,4126931,20923,2,Sergey Perednya,12065,1,Aleksandr Gorbachev,Away,1
130428,4148237,920,0,Ruaidhri Higgins,10481,1,Milic Curcic,Away,0
130429,4171268,681,2,Imanol Alguacil,409,0,Gerhard Struber,Away,1
130430,4148252,144,3,Sergej Jakirovic,10532,1,Dan Brimsvík,Away,1


In [10]:
# Re-check the column dtypes of the remaining columns.
df_filtered.dtypes

game_id                   int64
club_id                   int64
own_goals                 int64
own_manager_name         object
opponent_id               int64
opponent_goals            int64
opponent_manager_name    object
hosting                  object
is_win                    int64
dtype: object

# Changing the type of the columns

In [11]:
# Cast every text column to str, one column at a time, writing the result
# back through .loc so the assignment targets df_filtered itself.
for text_column in ('own_manager_name', 'opponent_manager_name', 'hosting'):
    df_filtered.loc[:, text_column] = df_filtered[text_column].astype('str')

# Keeping only the games whose club and opponent both appear in the clubs dataset

In [12]:
# Load the already-cleaned clubs table; its club_id column is the reference
# list of clubs that are allowed to appear in club_games.
df_filtered_club = pd.read_csv('/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/Manipulated_data/clubs.csv')
known_club_ids = df_filtered_club['club_id']

# A game row is kept only when both sides of the match are known clubs.
own_club_known = df_filtered['club_id'].isin(known_club_ids)
opponent_known = df_filtered['opponent_id'].isin(known_club_ids)
df_filtered_filtered = df_filtered[own_club_known & opponent_known]
df_filtered_filtered

Unnamed: 0,game_id,club_id,own_goals,own_manager_name,opponent_id,opponent_goals,opponent_manager_name,hosting,is_win
2,2222597,3725,2,Stanislav Cherchesov,232,1,Unai Emery,Home,1
3,2222627,2696,0,Andrey Kobelev,4128,2,Rustem Khuzin,Home,0
4,2222658,2410,0,Leonid Slutski,121,2,Dan Petrescu,Home,0
6,2222683,2696,0,Andrey Kobelev,12438,1,Gadzhi Gadzhiev,Home,0
7,2222685,2698,0,Kurban Berdyev,232,1,Valeriy Karpin,Home,0
...,...,...,...,...,...,...,...,...,...
130423,4095969,39,1,Bo Svensson,89,4,Urs Fischer,Away,0
130424,4098048,383,3,Peter Bosz,317,0,Joseph Oosting,Away,1
130425,4120903,370,0,Barry Robson,2553,2,Derek McInnes,Away,0
130426,4126817,126,3,İlhan Palut,449,2,Nenad Bjelica,Away,1


# Saving the dataset

In [13]:
# Write the club-matched games to the Manipulated_data folder; index=False
# keeps the DataFrame's row index out of the CSV.
df_filtered_filtered.to_csv('/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/Manipulated_data/club_games.csv',index=False)