# Importing the necessary libraries and the dataset

In [1]:
import pandas as pd
import re

In [2]:
# Read the raw players table from disk and report its (rows, columns) size.
players_csv_path = '/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/CSV_progetto/players.csv'
df = pd.read_csv(players_csv_path)
df.shape

(30302, 23)

In [3]:
# Show the dtype pandas inferred for each column of the freshly loaded table.
df.dtypes

player_id                                 int64
first_name                               object
last_name                                object
name                                     object
last_season                               int64
current_club_id                           int64
player_code                              object
country_of_birth                         object
city_of_birth                            object
country_of_citizenship                   object
date_of_birth                            object
sub_position                             object
position                                 object
foot                                     object
height_in_cm                            float64
market_value_in_eur                     float64
highest_market_value_in_eur             float64
contract_expiration_date                 object
agent_name                               object
image_url                                object
url                                     

# Creating a function to check if a string contains only European characters

In [4]:
def contains_only_european_chars(s):
    """Tell whether ``s`` is made up solely of whitelisted "European" characters.

    The whitelist is: ASCII letters and digits, whitespace, the punctuation
    ``. , ; : ! ? - ' " ( )``, the currency signs ``€`` and ``£``, and the
    accented Latin letters in the range U+00C0..U+017F.

    Parameters
    ----------
    s : str
        Text to check. Passing a non-string raises ``TypeError`` from ``re``.

    Returns
    -------
    bool
        ``True`` if ``s`` is non-empty and every character is whitelisted,
        ``False`` otherwise (including for the empty string, because the
        pattern requires at least one character).
    """
    # The pattern is anchored at both ends, so searching is the same as matching.
    allowed_chars_pattern = r'^[a-zA-Z0-9\s\.,;:!?\-\'\"()€£\u00C0-\u017F]+$'
    return bool(re.search(allowed_chars_pattern, s))

# Dropping columns that are not needed

In [5]:
# Columns that are not needed in the cleaned players table.
COLUMNS_TO_DROP = [
    'first_name',
    'last_name',
    'player_code',
    'url',
    'current_club_domestic_competition_id',
    'contract_expiration_date',
]

# Drop them all in one step instead of six separate in-place cells.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [11]:
# Check the shape again after the six columns above have been removed.
df.shape

(30302, 17)

In [12]:
# List the remaining columns together with their dtypes.
df.dtypes

player_id                        int64
name                            object
last_season                      int64
current_club_id                  int64
country_of_birth                object
city_of_birth                   object
country_of_citizenship          object
date_of_birth                   object
sub_position                    object
position                        object
foot                            object
height_in_cm                   float64
market_value_in_eur            float64
highest_market_value_in_eur    float64
agent_name                      object
image_url                       object
current_club_name               object
dtype: object

# Converting the selected columns to string and `date_of_birth` to datetime, then applying the function to check if they contain only European characters

In [13]:
# Text columns that are run through contains_only_european_chars later on.
columns_to_filter = [
    'name',
    'country_of_citizenship',
    'country_of_birth',
    'city_of_birth',
    'sub_position',
    'position',
    'foot',
    'agent_name',
    'current_club_name',
]

# Cast the text columns to str so the regex check can be applied to every cell.
# NOTE(review): missing values presumably turn into the literal string 'nan'
# here and would then be written out as text - confirm this is intended.
df[columns_to_filter] = df[columns_to_filter].astype('str')

# Parse the birth-date strings into datetime values.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])

In [14]:
# Keep only the rows in which EVERY text column passes the European-character
# check.
#
# The previous version rebuilt df_filtered from the unfiltered df on each
# line, so each assignment discarded the one before it and only the last
# filter (current_club_name) actually took effect. Here the per-column
# results are combined with a logical AND instead.
european_mask = (
    df[columns_to_filter]
    # One boolean column per text column: True where the cell is acceptable.
    .apply(lambda column: column.map(contains_only_european_chars))
    # A row survives only if all of its text columns are acceptable.
    .all(axis=1)
)
df_filtered = df[european_mask]

In [15]:
# Display the filtered frame (the notebook output shows a truncated preview).
df_filtered

Unnamed: 0,player_id,name,last_season,current_club_id,country_of_birth,city_of_birth,country_of_citizenship,date_of_birth,sub_position,position,foot,height_in_cm,market_value_in_eur,highest_market_value_in_eur,agent_name,image_url,current_club_name
0,598,Timo Hildebrand,2014,24,Germany,Worms,Germany,1979-04-05,Goalkeeper,Goalkeeper,,,,10000000.0,,https://img.a.transfermarkt.technology/portrai...,Eintracht Frankfurt
1,670,Martin Petrov,2012,714,Bulgaria,Vratsa,Bulgaria,1979-01-15,Left Winger,Attack,,,,12000000.0,IFM,https://img.a.transfermarkt.technology/portrai...,RCD Espanyol Barcelona
2,1323,Martin Amedick,2012,24,Germany,Paderborn,Germany,1982-09-06,Centre-Back,Defender,,,,2750000.0,,https://img.a.transfermarkt.technology/portrai...,Eintracht Frankfurt
3,3195,Jermaine Pennant,2013,512,England,Nottingham,England,1983-01-15,Right Winger,Attack,right,173.0,,10500000.0,Andrew Sky,https://img.a.transfermarkt.technology/portrai...,Stoke City
4,3259,Damien Duff,2013,931,Ireland,Ballyboden,Ireland,1979-03-02,Right Midfield,Midfield,left,177.0,,17000000.0,,https://img.a.transfermarkt.technology/portrai...,Fulham FC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30297,371851,Jaka Bijol,2023,410,Slovenia,Vuzenica,Slovenia,1999-02-05,Centre-Back,Defender,right,190.0,10000000.0,10000000.0,MSC GROUP,https://img.a.transfermarkt.technology/portrai...,Udinese Calcio
30298,537171,Semuel Pizzignacco,2018,410,Italy,Monfalcone,Italy,2001-09-01,Goalkeeper,Goalkeeper,right,188.0,325000.0,350000.0,,https://img.a.transfermarkt.technology/portrai...,Udinese Calcio
30299,586756,Festy Ebosele,2023,410,Ireland,"Enniscorthy, Wexford",Ireland,2002-08-02,Right Midfield,Midfield,right,180.0,4500000.0,4500000.0,The Kulture Group,https://img.a.transfermarkt.technology/portrai...,Udinese Calcio
30300,704692,Nicolò Cocetta,2022,410,Italy,San Daniele del Friuli,Italy,2003-12-19,Centre-Back,Defender,,,75000.0,75000.0,,https://img.a.transfermarkt.technology/portrai...,Udinese Calcio


# Keeping only the players whose current club appears in the cleaned clubs dataset

In [16]:
# Load the already-cleaned clubs table.
clubs_csv_path = '/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/Manipulated_data/clubs.csv'
df_filtered_club = pd.read_csv(clubs_csv_path)

# Retain only the players whose current_club_id is present among the club ids.
club_is_known = df_filtered['current_club_id'].isin(df_filtered_club['club_id'])
df_filtered = df_filtered[club_is_known]
df_filtered

Unnamed: 0,player_id,name,last_season,current_club_id,country_of_birth,city_of_birth,country_of_citizenship,date_of_birth,sub_position,position,foot,height_in_cm,market_value_in_eur,highest_market_value_in_eur,agent_name,image_url,current_club_name
0,598,Timo Hildebrand,2014,24,Germany,Worms,Germany,1979-04-05,Goalkeeper,Goalkeeper,,,,10000000.0,,https://img.a.transfermarkt.technology/portrai...,Eintracht Frankfurt
1,670,Martin Petrov,2012,714,Bulgaria,Vratsa,Bulgaria,1979-01-15,Left Winger,Attack,,,,12000000.0,IFM,https://img.a.transfermarkt.technology/portrai...,RCD Espanyol Barcelona
2,1323,Martin Amedick,2012,24,Germany,Paderborn,Germany,1982-09-06,Centre-Back,Defender,,,,2750000.0,,https://img.a.transfermarkt.technology/portrai...,Eintracht Frankfurt
3,3195,Jermaine Pennant,2013,512,England,Nottingham,England,1983-01-15,Right Winger,Attack,right,173.0,,10500000.0,Andrew Sky,https://img.a.transfermarkt.technology/portrai...,Stoke City
4,3259,Damien Duff,2013,931,Ireland,Ballyboden,Ireland,1979-03-02,Right Midfield,Midfield,left,177.0,,17000000.0,,https://img.a.transfermarkt.technology/portrai...,Fulham FC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30297,371851,Jaka Bijol,2023,410,Slovenia,Vuzenica,Slovenia,1999-02-05,Centre-Back,Defender,right,190.0,10000000.0,10000000.0,MSC GROUP,https://img.a.transfermarkt.technology/portrai...,Udinese Calcio
30298,537171,Semuel Pizzignacco,2018,410,Italy,Monfalcone,Italy,2001-09-01,Goalkeeper,Goalkeeper,right,188.0,325000.0,350000.0,,https://img.a.transfermarkt.technology/portrai...,Udinese Calcio
30299,586756,Festy Ebosele,2023,410,Ireland,"Enniscorthy, Wexford",Ireland,2002-08-02,Right Midfield,Midfield,right,180.0,4500000.0,4500000.0,The Kulture Group,https://img.a.transfermarkt.technology/portrai...,Udinese Calcio
30300,704692,Nicolò Cocetta,2022,410,Italy,San Daniele del Friuli,Italy,2003-12-19,Centre-Back,Defender,,,75000.0,75000.0,,https://img.a.transfermarkt.technology/portrai...,Udinese Calcio


# Saving the filtered dataset

In [17]:
# Write the cleaned players table to disk, leaving the row index out of the file.
players_output_path = '/home/lorenzo/Documents/3-TERZO-ANNO/IUMTWEB/Manipulated_data/players.csv'
df_filtered.to_csv(players_output_path, index=False)