# Importing the necessary libraries and the dataset

In [1]:
import pandas as pd
import re

In [2]:
df =pd.read_csv('/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/CSV_progetto/games.csv')
df.shape

(65216, 23)

# Checking the types of the columns

In [3]:
df.dtypes

game_id                     int64
competition_id             object
season                      int64
round                      object
date                       object
home_club_id                int64
away_club_id                int64
home_club_goals             int64
away_club_goals             int64
home_club_position        float64
away_club_position        float64
home_club_manager_name     object
away_club_manager_name     object
stadium                    object
attendance                float64
referee                    object
url                        object
home_club_formation        object
away_club_formation        object
home_club_name             object
away_club_name             object
aggregate                  object
competition_type           object
dtype: object

In [4]:
df['date'] = pd.to_datetime(df['date'])

# Dropping the columns that are not useful for the database

In [5]:
df.drop(columns=['home_club_position','away_club_position','attendance','home_club_formation','away_club_formation','url'], inplace=True)

# Creating a function to filter only the europian characters

In [6]:
def contains_only_european_chars(s):
    return re.match(r'^[a-zA-Z0-9\s\.,;:!?\-\'\"()€£\u00C0-\u017F]+$', s) is not None

# Dropping the rows with NaN values

In [7]:
df = df.dropna()

# Applying the function to the columns that are String type

In [8]:
columns_to_filter = ['home_club_manager_name', 'away_club_manager_name', 'stadium','referee','home_club_name','away_club_name']
df[columns_to_filter] =df[columns_to_filter].astype('str')
for column in columns_to_filter:
    df_filtered = df[df[column].apply(contains_only_european_chars)]    

In [9]:
df_filtered.shape

(51055, 17)

In [10]:
df_filtered

Unnamed: 0,game_id,competition_id,season,round,date,home_club_id,away_club_id,home_club_goals,away_club_goals,home_club_manager_name,away_club_manager_name,stadium,referee,home_club_name,away_club_name,aggregate,competition_type
0,2222597,RU1,2012,6. Matchday,2012-08-25,3725,232,2,1,Stanislav Cherchesov,Unai Emery,Akhmat-Arena,Vladislav Bezborodov,RFK Akhmat Grozny,FK Spartak Moskva,2:1,domestic_league
1,2222627,RU1,2012,5. Matchday,2012-08-20,2696,4128,0,2,Andrey Kobelev,Rustem Khuzin,Metallurg,Sergey Ivanov,PFK Krylya Sovetov Samara,Amkar Perm,0:2,domestic_league
2,2222658,RU1,2012,10. Matchday,2012-09-30,2410,121,0,2,Leonid Slutski,Dan Petrescu,Arena Khimki,Sergey Karasev,PFK CSKA Moskva,FK Dinamo Moskva,0:2,domestic_league
3,2222664,RU1,2012,8. Matchday,2012-09-15,932,2698,1,0,Slaven Bilic,Kurban Berdyev,RZD Arena,Sergey Karasev,"Футбольный клуб ""Локомотив"" Москва",FC Rubin Kazan,1:0,domestic_league
4,2222683,RU1,2012,12. Matchday,2012-10-22,2696,12438,0,1,Andrey Kobelev,Gadzhi Gadzhiev,Metallurg,Timur Arslanbekov,PFK Krylya Sovetov Samara,Volga Nizhniy Novgorod (- 2016),0:1,domestic_league
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65165,3852607,NL1,2022,12. Matchday,2022-10-30,383,467,3,0,Ruud van Nistelrooy,Rogier Meijer,Philips Stadion,Martin van den Kerkhof,Eindhovense Voetbalvereniging Philips Sport Ve...,Nijmegen Eendracht Combinatie,3:0,domestic_league
65166,2589097,NL1,2015,23. Matchday,2016-02-14,467,383,0,3,Ernest Faber,Phillip Cocu,Goffertstadion,Kevin Blom,Nijmegen Eendracht Combinatie,Eindhovense Voetbalvereniging Philips Sport Ve...,0:3,domestic_league
65167,2700827,NL1,2016,23. Matchday,2017-02-18,383,467,3,1,Phillip Cocu,Peter Hyballa,Philips Stadion,Jochem Kamphuis,Eindhovense Voetbalvereniging Philips Sport Ve...,Nijmegen Eendracht Combinatie,3:1,domestic_league
65168,3852741,NL1,2022,27. Matchday,2023-04-01,467,383,2,4,Rogier Meijer,Ruud van Nistelrooy,Goffertstadion,Edwin van de Graaf,Nijmegen Eendracht Combinatie,Eindhovense Voetbalvereniging Philips Sport Ve...,2:4,domestic_league


# Checking the dataset to match the clubs and the players dataset

In [11]:
filtered_clubs = pd.read_csv('/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/Manipulated_data/clubs.csv')
df_filtered_filtered = df_filtered[df_filtered['home_club_id'].isin(filtered_clubs['club_id'])]
df_filtered_filtered = df_filtered_filtered[df_filtered_filtered['away_club_id'].isin(filtered_clubs['club_id'])]
df_filtered_filtered

Unnamed: 0,game_id,competition_id,season,round,date,home_club_id,away_club_id,home_club_goals,away_club_goals,home_club_manager_name,away_club_manager_name,stadium,referee,home_club_name,away_club_name,aggregate,competition_type
0,2222597,RU1,2012,6. Matchday,2012-08-25,3725,232,2,1,Stanislav Cherchesov,Unai Emery,Akhmat-Arena,Vladislav Bezborodov,RFK Akhmat Grozny,FK Spartak Moskva,2:1,domestic_league
1,2222627,RU1,2012,5. Matchday,2012-08-20,2696,4128,0,2,Andrey Kobelev,Rustem Khuzin,Metallurg,Sergey Ivanov,PFK Krylya Sovetov Samara,Amkar Perm,0:2,domestic_league
2,2222658,RU1,2012,10. Matchday,2012-09-30,2410,121,0,2,Leonid Slutski,Dan Petrescu,Arena Khimki,Sergey Karasev,PFK CSKA Moskva,FK Dinamo Moskva,0:2,domestic_league
4,2222683,RU1,2012,12. Matchday,2012-10-22,2696,12438,0,1,Andrey Kobelev,Gadzhi Gadzhiev,Metallurg,Timur Arslanbekov,PFK Krylya Sovetov Samara,Volga Nizhniy Novgorod (- 2016),0:1,domestic_league
5,2222685,RU1,2012,19. Matchday,2012-12-10,2698,232,0,1,Kurban Berdyev,Valeriy Karpin,Central Stadium Kazan,Vladislav Bezborodov,FC Rubin Kazan,FK Spartak Moskva,0:1,domestic_league
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65165,3852607,NL1,2022,12. Matchday,2022-10-30,383,467,3,0,Ruud van Nistelrooy,Rogier Meijer,Philips Stadion,Martin van den Kerkhof,Eindhovense Voetbalvereniging Philips Sport Ve...,Nijmegen Eendracht Combinatie,3:0,domestic_league
65166,2589097,NL1,2015,23. Matchday,2016-02-14,467,383,0,3,Ernest Faber,Phillip Cocu,Goffertstadion,Kevin Blom,Nijmegen Eendracht Combinatie,Eindhovense Voetbalvereniging Philips Sport Ve...,0:3,domestic_league
65167,2700827,NL1,2016,23. Matchday,2017-02-18,383,467,3,1,Phillip Cocu,Peter Hyballa,Philips Stadion,Jochem Kamphuis,Eindhovense Voetbalvereniging Philips Sport Ve...,Nijmegen Eendracht Combinatie,3:1,domestic_league
65168,3852741,NL1,2022,27. Matchday,2023-04-01,467,383,2,4,Rogier Meijer,Ruud van Nistelrooy,Goffertstadion,Edwin van de Graaf,Nijmegen Eendracht Combinatie,Eindhovense Voetbalvereniging Philips Sport Ve...,2:4,domestic_league


# Saving the dataset

In [12]:
df_filtered_filtered.to_csv('/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/Manipulated_data/games.csv',index=False)