# Importing the libraries and the dataset

In [15]:
import pandas as pd
import re

In [16]:
df =pd.read_csv('/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/CSV_progetto/clubs.csv')
df.shape

(426, 16)

# Checking the types of the columns

In [17]:
df.dtypes

club_id                      int64
club_code                   object
name                        object
domestic_competition_id     object
total_market_value         float64
squad_size                   int64
average_age                float64
foreigners_number            int64
foreigners_percentage      float64
national_team_players        int64
stadium_name                object
stadium_seats                int64
net_transfer_record         object
coach_name                 float64
last_season                  int64
url                         object
dtype: object

# Dropping the columns that are not useful for the database

In [18]:
df.drop(columns=['club_code','squad_size','average_age','foreigners_number','foreigners_percentage','national_team_players','net_transfer_record','url','total_market_value'],inplace=True)

# Creating a function to check if a string contains only european characters and another function to check if a string contains cyrillic characters

In [19]:
def contains_only_european_chars(s):
    return re.match(r'^[a-zA-Z0-9\s\.,;:!?\-\'\"()€£\u00C0-\u017F]+$', s) is not None
def contains_cyrillic_chars(s):
    return bool(re.search(r'[\u0400-\u04FF]', s))



In [20]:
df.dtypes

club_id                      int64
name                        object
domestic_competition_id     object
stadium_name                object
stadium_seats                int64
coach_name                 float64
last_season                  int64
dtype: object

# Converting the types of some selected columns to string

In [21]:
columns_to_filter = ['name','domestic_competition_id','stadium_name']
df[columns_to_filter] = df[columns_to_filter].astype('str')

# Filtering and dropping the columns that contain cyrillic characters

In [22]:
for column in columns_to_filter:
    df = df[~df[column].apply(contains_cyrillic_chars)]

# Filtering the columns that contain only european characters

In [23]:
for column in columns_to_filter:
    df_filtered = df[df[column].apply(contains_only_european_chars)] 

In [24]:
df_filtered.shape

(420, 7)

In [25]:
df_filtered

Unnamed: 0,club_id,name,domestic_competition_id,stadium_name,stadium_seats,coach_name,last_season
0,105,Sportverein Darmstadt 1898 e. V.,L1,Merck-Stadion am Böllenfalltor,17810,,2023
1,11127,FK Ural Yekaterinburg,RU1,Yekaterinburg Arena,23000,,2023
2,114,Beşiktaş Jimnastik Kulübü,TR1,Tüpraş Stadyumu,42590,,2023
3,12,Associazione Sportiva Roma,IT1,Olimpico di Roma,73261,,2023
4,148,Tottenham Hotspur Football Club,GB1,Tottenham Hotspur Stadium,62062,,2023
...,...,...,...,...,...,...,...
421,68608,CF Os Belenenses,PO1,Estádio do Restelo,19980,,2017
422,724,Football Club Volendam,NL1,Kras Stadion,7384,,2023
423,800,Atalanta Bergamasca Calcio S.p.a.,IT1,Gewiss Stadium,21747,,2023
424,979,Moreirense Futebol Clube,PO1,Estádio C. J. de Almeida Freitas,6153,,2023


# Checking the amount of missing values

In [26]:
df_filtered.isna().sum()

club_id                      0
name                         0
domestic_competition_id      0
stadium_name                 0
stadium_seats                0
coach_name                 420
last_season                  0
dtype: int64

# Dropping the column that contain missing values

In [27]:
df_filtered.drop('coach_name',axis=1)

Unnamed: 0,club_id,name,domestic_competition_id,stadium_name,stadium_seats,last_season
0,105,Sportverein Darmstadt 1898 e. V.,L1,Merck-Stadion am Böllenfalltor,17810,2023
1,11127,FK Ural Yekaterinburg,RU1,Yekaterinburg Arena,23000,2023
2,114,Beşiktaş Jimnastik Kulübü,TR1,Tüpraş Stadyumu,42590,2023
3,12,Associazione Sportiva Roma,IT1,Olimpico di Roma,73261,2023
4,148,Tottenham Hotspur Football Club,GB1,Tottenham Hotspur Stadium,62062,2023
...,...,...,...,...,...,...
421,68608,CF Os Belenenses,PO1,Estádio do Restelo,19980,2017
422,724,Football Club Volendam,NL1,Kras Stadion,7384,2023
423,800,Atalanta Bergamasca Calcio S.p.a.,IT1,Gewiss Stadium,21747,2023
424,979,Moreirense Futebol Clube,PO1,Estádio C. J. de Almeida Freitas,6153,2023


In [28]:
df_filtered.shape

(420, 7)

# Saving the filtered dataset

In [29]:
df_filtered.to_csv('/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/Manipulated_data/clubs.csv',index=False)