In [1]:
import pandas as pd 
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn import impute
from sklearn.preprocessing import LabelEncoder
import os
from copy import copy

In [2]:
MatchEventInfo = pd.read_parquet('../Aggregation/MatchEventInfo.parquet')
MatchVotesInfo = pd.read_parquet('../Aggregation/MatchVotesInfo.parquet')
MatchTournamentInfo = pd.read_parquet('../Aggregation/MatchTournamentInfo.parquet')
MatchSeasonInfo = pd.read_parquet('../Aggregation/MatchSeasonInfo.parquet')
MatchVenueInfo = pd.read_parquet('../Aggregation/MatchVenueInfo.parquet')
MatchAwayTeamInfo = pd.read_parquet('../Aggregation/MatchAwayTeamInfo.parquet')
MatchAwayScoreInfo = pd.read_parquet('../Aggregation/MatchAwayScoreInfo.parquet')
MatchRoundInfo = pd.read_parquet('../Aggregation/MatchRoundInfo.parquet')
MatchHomeScoreInfo = pd.read_parquet('../Aggregation/MatchHomeScoreInfo.parquet')
MatchHomeTeamInfo = pd.read_parquet('../Aggregation/MatchHomeTeamInfo.parquet')
PowerInfo = pd.read_parquet('../Aggregation/PowerInfo.parquet')
GameInfo = pd.read_parquet('../Aggregation/GameInfo.parquet')
OddsInfo = pd.read_parquet('../Aggregation/OddsInfo.parquet')
TimeInfo = pd.read_parquet('../Aggregation/TimeInfo.parquet')
PeriodInfo = pd.read_parquet('../Aggregation/PeriodInfo.parquet')

In [3]:
data_frames = {'MatchEventInfo':MatchEventInfo,'MatchVotesInfo':MatchVotesInfo,
               'MatchTournamentInfo':MatchTournamentInfo, 'MatchRoundInfo':MatchRoundInfo,
               'MatchSeasonInfo':MatchSeasonInfo,'MatchVenueInfo':MatchVenueInfo,
               'MatchAwayTeamInfo':MatchAwayTeamInfo, 'MatchAwayScoreInfo':MatchAwayScoreInfo,
              'MatchHomeScoreInfo':MatchHomeScoreInfo, 'MatchHomeTeamInfo':MatchHomeTeamInfo,
               'PowerInfo':PowerInfo,'GameInfo':GameInfo,'TimeInfo':TimeInfo,
               'PeriodInfo':PeriodInfo,'OddsInfo':OddsInfo}

In [4]:
for key,value in data_frames.items():
    print(f'Shape Of {key} is: {value.shape}')



Shape Of MatchEventInfo is: (9319, 10)
Shape Of MatchVotesInfo is: (9319, 3)
Shape Of MatchTournamentInfo is: (9319, 16)
Shape Of MatchRoundInfo is: (5790, 5)
Shape Of MatchSeasonInfo is: (9319, 4)
Shape Of MatchVenueInfo is: (9286, 5)
Shape Of MatchAwayTeamInfo is: (6143, 18)
Shape Of MatchAwayScoreInfo is: (9319, 14)
Shape Of MatchHomeScoreInfo is: (9319, 14)
Shape Of MatchHomeTeamInfo is: (6670, 18)
Shape Of PowerInfo is: (135470, 5)
Shape Of GameInfo is: (749517, 13)
Shape Of TimeInfo is: (9319, 7)
Shape Of PeriodInfo is: (401281, 13)
Shape Of OddsInfo is: (15956, 11)


In [5]:
for key,value in data_frames.items():
    print(f'{key} Null Percentage')
    null_p = (value.isnull().sum() / value.shape[0]) * 100
    print(null_p)
    print('===' * 10)

MatchEventInfo Null Percentage
match_id                 0.000000
first_to_serve          37.729370
home_team_seed          88.185428
away_team_seed          88.078120
custom_id                0.000000
winner_code             19.615839
default_period_count     0.000000
start_datetime           0.000000
match_slug               0.000000
final_result_only        0.000000
dtype: float64
MatchVotesInfo Null Percentage
match_id     0.0
home_vote    0.0
away_vote    0.0
dtype: float64
MatchTournamentInfo Null Percentage
match_id                             0.000000
tournament_id                        0.000000
tournament_name                      0.000000
tournament_slug                      0.000000
tournament_unique_id               100.000000
tournament_category_name             0.000000
tournament_category_slug             0.000000
user_count                           0.000000
ground_type                          0.729692
tennis_points                       88.099582
has_event_player_stat

In [6]:
def column_remover(df:pd.DataFrame) -> pd.DataFrame:
    """
    This function takes a dataframe and removes columns that are fully null.

    Parameters:
        df (DataFrame): The dataframe to be cleaned.
        
    Returns:
        DataFrame: A new dataframe without columns that have more than 90 percent null values.
    """
    # Identify columns which has more than 80 percent null values
    columns_to_remove = df.columns[df.isnull().mean(axis=0) == 1] 
    
    # Drop columns which are in columns_to_remove 
    new_df = df.drop(columns_to_remove, axis =1)
    
    return new_df        

In [7]:
def row_remover(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function takes a dataframe and removes rows that have 50 percent or more null values.

    Parameters:
        df (DataFrame): The dataframe to be cleaned.
        
    Returns:
        DataFrame: A new dataframe without rows that have 50 percent or more null values.
    """
    # Calculate the percentage of null values in each row
    null_percentages = df.isnull().mean(axis=1)

    # Identify rows that have 50 percent or more null values
    rows_to_drop = df.index[null_percentages >= 0.5]

    # Calculate the percentage of rows to drop relative to the entire dataframe
    percent_of_dropping_candidate = len(rows_to_drop) / df.shape[0]

    # Print the number of rows to drop for debugging
    print(f'Rows to drop: {len(rows_to_drop)}')
    print(percent_of_dropping_candidate)
    # Remove rows if the total number of rows to drop is less than 5 percent of data
    if percent_of_dropping_candidate < 0.05:
        print('True')
        new_df = df.drop(index=rows_to_drop)
        return new_df
    else:
        print('False')
        return df



In [8]:
def knn_filler(df:pd.DataFrame, k = 5) -> pd.DataFrame:
    """
    This function takes a dataframe and fills the null values using the KNN algorithm.

    Parameters:
        df (DataFrame): The dataframe to be cleaned.
        
    Returns:
        DataFrame: A new dataframe with its null values filled using the KNN algorithm.
    """
# Prepare data to use Knn
    new_df = df.copy()
    for column in new_df.columns:
        if new_df[column].dtype == object or new_df[column].isnull().any():  # Check for object type or presence of NaNs
            le = LabelEncoder()
            new_df[column] = new_df[column].astype(str)  # Convert all values to string 
            new_df[column] = le.fit_transform(df[column])
        
    # Create Knn imputer
    knn_imputer = impute.KNNImputer(n_neighbors= k)
    new_df = pd.DataFrame(knn_imputer.fit_transform(new_df, ), columns=new_df.columns, index=new_df.index)
    return new_df.round(3)

### Cleaning MatchEventInfo

In [9]:
MatchEventInfo

Unnamed: 0,match_id,first_to_serve,home_team_seed,away_team_seed,custom_id,winner_code,default_period_count,start_datetime,match_slug,final_result_only
0,12260075,1.0,,30,LOfsRUhc,2.0,3,1714499700,lehecka-nadal,True
1,12260076,1.0,1,16,FyLsvGHb,1.0,3,1714480500,sinner-khachanov,True
2,12260077,1.0,23,2,QCtsytke,2.0,3,1714484700,alcaraz-struff,True
3,12260078,1.0,7,2,fKRsytzc,1.0,3,1714564800,alcaraz-rublev,False
4,12260080,2.0,21,4,nTxsbvNb,1.0,3,1714488300,cerundolo-zverev,True
...,...,...,...,...,...,...,...,...,...,...
9314,12384975,2.0,,,iwNsArQ,1.0,3,1717238700,serban-bandecchi,True
9315,12385017,2.0,,,onYsGDbc,1.0,3,1717240500,ortenzi-pedretti,True
9316,12385869,,,,nDQsLOub,2.0,3,1717214700,masabayashi-fukuda,True
9317,12385873,,,,IbIsRmHd,2.0,3,1717224900,sugaya-uchida,True


In [10]:

# First To serve column has about 38 percent null value, 
# since there is no logic and it is a code so Mode is being used.
first_to_serve_mode = MatchEventInfo['first_to_serve'].mode()[0]
MatchEventInfo.fillna({'first_to_serve':first_to_serve_mode}, inplace=True)

In [11]:
# knn will be used to fill home_team_seed, away_team_seed and winner code
imputed_df = knn_filler(MatchEventInfo, k=10)
MatchEventInfo['home_team_seed'] = imputed_df['home_team_seed']
MatchEventInfo['away_team_seed'] = imputed_df['away_team_seed']
MatchEventInfo['winner_code'] = imputed_df['winner_code']

In [12]:
MatchEventInfo.isnull().sum()
# Now the Table is Clean

match_id                0
first_to_serve          0
home_team_seed          0
away_team_seed          0
custom_id               0
winner_code             0
default_period_count    0
start_datetime          0
match_slug              0
final_result_only       0
dtype: int64

### Cleaning MatchTournamentInfo

In [13]:
MatchTournamentInfo = column_remover(MatchTournamentInfo)
MatchTournamentInfo

Unnamed: 0,match_id,tournament_id,tournament_name,tournament_slug,tournament_category_name,tournament_category_slug,user_count,ground_type,tennis_points,has_event_player_statistics,crowd_sourcing_enabled,has_performance_graph_feature,display_inverse_home_away_teams,priority,competition_type
0,12260075,129164,"Madrid, Spain",madrid-spain,ATP,atp,12763,Red clay,1000.0,False,True,False,True,0,2
1,12260076,129164,"Madrid, Spain",madrid-spain,ATP,atp,12750,Red clay,1000.0,False,False,True,True,0,2
2,12260077,129164,"Madrid, Spain",madrid-spain,ATP,atp,12760,Red clay,1000.0,False,False,False,False,0,2
3,12260078,129164,"Madrid, Spain",madrid-spain,ATP,atp,12760,Red clay,1000.0,True,True,True,True,0,2
4,12260080,129164,"Madrid, Spain",madrid-spain,ATP,atp,12760,Red clay,1000.0,True,False,False,True,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9314,12384975,131083,"Troisdorf, Singles Main, W-ITF-GER-03A",troisdorf-singles-main-w-itf-ger-03a,ITF Women,itf-women,24,Red clay,,False,True,False,False,0,2
9315,12385017,131095,"Rio Claro, Singles Main, W-ITF-BRA-06A",rio-claro-singles-main-w-itf-bra-06a,ITF Women,itf-women,28,Red clay,,False,True,False,True,0,2
9316,12385869,131026,"Karuizawa, Singles Main, M-ITF-JPN-05A",karuizawa-singles-main-m-itf-jpn-05a,ITF Men,itf-men,17,Red clay,,True,False,True,True,0,2
9317,12385873,131026,"Karuizawa, Singles Main, M-ITF-JPN-05A",karuizawa-singles-main-m-itf-jpn-05a,ITF Men,itf-men,17,Red clay,,False,True,False,False,0,2


In [14]:
ground_tyep_mode = MatchTournamentInfo['ground_type'].mode()[0]
MatchTournamentInfo.fillna({'ground_type':ground_tyep_mode}, inplace=True)



In [15]:
imputed_df = knn_filler(MatchTournamentInfo,k=15)
MatchTournamentInfo['tennis_points'] = imputed_df['tennis_points']

In [16]:
MatchTournamentInfo.isnull().sum()
# Now the Table is Clean

match_id                           0
tournament_id                      0
tournament_name                    0
tournament_slug                    0
tournament_category_name           0
tournament_category_slug           0
user_count                         0
ground_type                        0
tennis_points                      0
has_event_player_statistics        0
crowd_sourcing_enabled             0
has_performance_graph_feature      0
display_inverse_home_away_teams    0
priority                           0
competition_type                   0
dtype: int64

### Cleaning MatchRoundInfo

In [17]:
MatchRoundInfo

Unnamed: 0,match_id,round_id,name,slug,cup_round_type
0,12260075,5,Round of 16,round-of-16,8.0
1,12260076,5,Round of 16,round-of-16,8.0
2,12260077,5,Round of 16,round-of-16,8.0
3,12260078,27,Quarterfinals,quarterfinals,4.0
4,12260080,5,Round of 16,round-of-16,8.0
...,...,...,...,...,...
5785,12384975,28,Semifinals,semifinals,2.0
5786,12385017,28,Semifinals,semifinals,2.0
5787,12385869,28,Semifinals,semifinals,2.0
5788,12385873,28,Semifinals,semifinals,2.0


In [18]:
MatchRoundInfo['cup_round_type'].unique()

array([ 8.,  4.,  2.,  1., 16., nan])

In [19]:
MatchRoundInfo['name'].unique()

array(['Round of 16', 'Quarterfinals', 'Semifinals', 'Final',
       'Round of 32', 'Round of 64', 'Qualification round 1',
       'Qualification round 2', 'Round of 128'], dtype=object)

In [20]:
unique_values = MatchRoundInfo[['name', 'cup_round_type']].drop_duplicates()
unique_values

Unnamed: 0,name,cup_round_type
0,Round of 16,8.0
3,Quarterfinals,4.0
9,Semifinals,2.0
11,Final,1.0
115,Round of 32,16.0
786,Round of 64,
1071,Qualification round 1,
1302,Qualification round 2,
1315,Round of 128,


In [21]:
cup_round= {'Final':1,'Round of 16	':8,'Quarterfinals':4,'Semifinals':2,
 'Round of 32':16,'Round of 64':32,'Round of 128':64,'Qualification round 2':128,
 'Qualification round 1':256 }

In [22]:
for name, value in cup_round.items():
    MatchRoundInfo.loc[MatchRoundInfo['cup_round_type'].isnull() &
                       MatchRoundInfo['name'].eq(name), 'cup_round_type'] = value

In [23]:
MatchRoundInfo.isnull().sum()
# Now the data is clean

match_id          0
round_id          0
name              0
slug              0
cup_round_type    0
dtype: int64

### Cleaning MatchAwayTeamInfo

In [24]:
MatchAwayTeamInfo

Unnamed: 0,match_id,name,slug,gender,user_count,residence,birthplace,height,weight,plays,turned_pro,current_prize,total_prize,player_id,current_rank,name_code,country,full_name
0,12260075,Lehečka J.,lehecka-jiri,M,6858,"Knezmost, Czech Republic","Mlada Boleslav, Czechia",1.85,80.0,right-handed,,763346.0,2988783.0,254742,24.0,LEH,Czech Republic,"Lehečka, Jiří"
1,12260076,Khachanov K.,khachanov-karen,M,14411,"Dubai, UAE","Moscow, Russia",1.98,87.0,right-handed,2013,877460.0,13580818.0,90080,18.0,KHA,Russia,"Khachanov, Karen"
2,12260077,Alcaraz C.,alcaraz-carlos,M,141553,"El Palmar, Murcia, Spain","El Palmar, Murcia, Spain",1.83,68.0,right-handed,2018,1590518.0,24112308.0,275923,3.0,ALC,Spain,"Alcaraz, Carlos"
3,12260078,Alcaraz C.,alcaraz-carlos,M,141553,"El Palmar, Murcia, Spain","El Palmar, Murcia, Spain",1.83,68.0,right-handed,2018,1590518.0,24112308.0,275923,3.0,ALC,Spain,"Alcaraz, Carlos"
4,12260080,Zverev A.,zverev-alexander,M,60430,"Monte Carlo, Monaco","Hamburg, Germany",1.98,90.0,right-handed,2013,2676831.0,34897595.0,57163,4.0,ZVE,Germany,"Zverev, Alexander"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6138,12384975,Șerban R.,serban-raluca-georgiana,F,832,,,,,right-handed,,43717.0,340468.0,103375,200.0,SER,Cyprus,"Serban, Raluca Georgiana"
6139,12385017,Pedretti T.,pedretti-thaisa-grana,F,453,,,,,,,2598.0,70165.0,120664,647.0,PED,Brazil,"Pedretti, Thaisa Grana"
6140,12385869,Fukuda S.,fukuda-sora,M,139,"Bradenton, FL",Japan,1.80,72.0,left-handed,,1278.0,52832.0,103913,611.0,FUK,Japan,"Fukuda, Sora"
6141,12385873,Uchida K.,uchida-kaichi,M,760,,,1.80,,right-handed,2013,10488.0,444064.0,82579,483.0,UCH,Japan,"Uchida, Kaichi"


In [25]:
MatchAwayTeamInfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6143 entries, 0 to 6142
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   match_id       6143 non-null   int64  
 1   name           6143 non-null   object 
 2   slug           6143 non-null   object 
 3   gender         6133 non-null   object 
 4   user_count     6143 non-null   int64  
 5   residence      1659 non-null   object 
 6   birthplace     3379 non-null   object 
 7   height         3274 non-null   float64
 8   weight         1615 non-null   float64
 9   plays          2984 non-null   object 
 10  turned_pro     1180 non-null   object 
 11  current_prize  6073 non-null   float64
 12  total_prize    6117 non-null   float64
 13  player_id      6143 non-null   int64  
 14  current_rank   6038 non-null   float64
 15  name_code      6143 non-null   object 
 16  country        6142 non-null   object 
 17  full_name      6143 non-null   object 
dtypes: float

In [26]:
genders = {'mrva-maxim':'M','werren-paul':'M','massard-loann':'M','tucakovic-suana':'F'}
for slug, sex in genders.items():
    MatchAwayTeamInfo.loc[(MatchAwayTeamInfo['gender'].isnull()) & 
                          (MatchAwayTeamInfo['slug'].eq(slug)),'gender'] = sex


In [27]:
MatchAwayTeamInfo.loc[MatchAwayTeamInfo['country'].isnull()]
MatchAwayTeamInfo.loc[MatchAwayTeamInfo['country'].isnull(),'country'] = "Australia"

Firts we try to check if there is a record in our dataframe which has the birth place of the player that is a missing value in another record.
If the approach above fails, There is no way to fill residence and birthplace by other information in the dataframe, so we fill them with 'Unknown'.

In [28]:
null_birthplace_df = MatchAwayTeamInfo[MatchAwayTeamInfo['birthplace'].isnull()]
grouped_null_birthplace = null_birthplace_df.groupby('slug')
results = []
for slug, group in grouped_null_birthplace:
    original_group = MatchAwayTeamInfo[MatchAwayTeamInfo['slug'] == slug]
    has_non_null = original_group['birthplace'].notnull().any()
    results.append({
        'slug': slug,
        'null_birthplace_rows': len(group),
        'has_non_null_birthplace': has_non_null
    })


result_df = pd.DataFrame(results)
result_df.loc[result_df['has_non_null_birthplace'].eq(True)]



Unnamed: 0,slug,null_birthplace_rows,has_non_null_birthplace
484,lobanov-aleksandr,1,True


In [29]:
MatchAwayTeamInfo.loc[MatchAwayTeamInfo['slug'].eq('lobanov-aleksandr'),'birthplace'] = 'Sochi'

In [30]:
MatchAwayTeamInfo.fillna({'residence':'Unknown', 'birthplace':'Unknown'}, inplace=True)
MatchAwayTeamInfo

Unnamed: 0,match_id,name,slug,gender,user_count,residence,birthplace,height,weight,plays,turned_pro,current_prize,total_prize,player_id,current_rank,name_code,country,full_name
0,12260075,Lehečka J.,lehecka-jiri,M,6858,"Knezmost, Czech Republic","Mlada Boleslav, Czechia",1.85,80.0,right-handed,,763346.0,2988783.0,254742,24.0,LEH,Czech Republic,"Lehečka, Jiří"
1,12260076,Khachanov K.,khachanov-karen,M,14411,"Dubai, UAE","Moscow, Russia",1.98,87.0,right-handed,2013,877460.0,13580818.0,90080,18.0,KHA,Russia,"Khachanov, Karen"
2,12260077,Alcaraz C.,alcaraz-carlos,M,141553,"El Palmar, Murcia, Spain","El Palmar, Murcia, Spain",1.83,68.0,right-handed,2018,1590518.0,24112308.0,275923,3.0,ALC,Spain,"Alcaraz, Carlos"
3,12260078,Alcaraz C.,alcaraz-carlos,M,141553,"El Palmar, Murcia, Spain","El Palmar, Murcia, Spain",1.83,68.0,right-handed,2018,1590518.0,24112308.0,275923,3.0,ALC,Spain,"Alcaraz, Carlos"
4,12260080,Zverev A.,zverev-alexander,M,60430,"Monte Carlo, Monaco","Hamburg, Germany",1.98,90.0,right-handed,2013,2676831.0,34897595.0,57163,4.0,ZVE,Germany,"Zverev, Alexander"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6138,12384975,Șerban R.,serban-raluca-georgiana,F,832,Unknown,Unknown,,,right-handed,,43717.0,340468.0,103375,200.0,SER,Cyprus,"Serban, Raluca Georgiana"
6139,12385017,Pedretti T.,pedretti-thaisa-grana,F,453,Unknown,Unknown,,,,,2598.0,70165.0,120664,647.0,PED,Brazil,"Pedretti, Thaisa Grana"
6140,12385869,Fukuda S.,fukuda-sora,M,139,"Bradenton, FL",Japan,1.80,72.0,left-handed,,1278.0,52832.0,103913,611.0,FUK,Japan,"Fukuda, Sora"
6141,12385873,Uchida K.,uchida-kaichi,M,760,Unknown,Unknown,1.80,,right-handed,2013,10488.0,444064.0,82579,483.0,UCH,Japan,"Uchida, Kaichi"


In [31]:
# We fill null values of highet and weight columns using trimmed_mean of those column
MatchAwayTeamInfo['height'].astype('float64')
MatchAwayTeamInfo['weight'].astype('float64')
trimmed_mean_height = stats.trim_mean(MatchAwayTeamInfo['height'].dropna(), proportiontocut=0.05)
trimmed_mean_weight = stats.trim_mean(MatchAwayTeamInfo['weight'].dropna(), proportiontocut=0.05)
MatchAwayTeamInfo.fillna({'height':trimmed_mean_height, 'weight':trimmed_mean_weight}, inplace=True)


In [32]:
# for choosing between right or left hand,
# first we calculate the percentage of right handed and left handed players
available_data = MatchAwayTeamInfo.loc[MatchAwayTeamInfo['plays'].notnull()].shape[0]
right_handed_per = len(MatchAwayTeamInfo.loc[MatchAwayTeamInfo['plays'].eq('right-handed')]) / available_data
left_handed_per = len(MatchAwayTeamInfo.loc[MatchAwayTeamInfo['plays'].eq('left-handed')]) / available_data
print(f'Percentage of right handed players: {right_handed_per}')
print(f'Percentage of left handed players: {left_handed_per}')
# to keep the distribution of the original data we fill around 88 percent of null data by right-handed,
# and 12 percent by left-handed
plays_null_indicies = MatchAwayTeamInfo[MatchAwayTeamInfo['plays'].isnull()].index
right_handed_filling = int(len(plays_null_indicies) * right_handed_per)
# random indicies from null values will be selected to fill by right-handed
select_right_indicies = np.random.choice(plays_null_indicies,right_handed_filling,replace=False)
MatchAwayTeamInfo.loc[select_right_indicies,'plays'] = 'right-handed'

Percentage of right handed players: 0.8760053619302949
Percentage of left handed players: 0.12332439678284182


In [33]:
# The rest will be filled by 'left-handed'
MatchAwayTeamInfo.fillna({'plays':'left-handed'}, inplace=True)

In [34]:
# There is no relevant data which can help us to find the year when a player became pro,
# so the null values will be filled by 'Unknown'
MatchAwayTeamInfo.fillna({'turned_pro': 'Unknown'}, inplace=True)

In [35]:
# Median will be used to fill null values in current_prize and total_prize
MatchAwayTeamInfo['current_prize'] =  MatchAwayTeamInfo['current_prize'].astype('float64')
MatchAwayTeamInfo['total_prize'] = MatchAwayTeamInfo['total_prize'].astype('float64')

In [36]:
curren_prize_median = MatchAwayTeamInfo['current_prize'].median()
total_prize_median = MatchAwayTeamInfo['total_prize'].median()
MatchAwayTeamInfo.fillna({'current_prize':curren_prize_median,'total_prize':total_prize_median}, inplace=True)

In [37]:
# there is no information here that can help us to find Ranking, 
# so we replace null rankaning with -1 which indicate 'Not Ranked'
MatchAwayTeamInfo.fillna({'current_rank':-1}, inplace=True)

In [38]:
MatchAwayTeamInfo.isnull().sum()
# Now the data is clean

match_id         0
name             0
slug             0
gender           0
user_count       0
residence        0
birthplace       0
height           0
weight           0
plays            0
turned_pro       0
current_prize    0
total_prize      0
player_id        0
current_rank     0
name_code        0
country          0
full_name        0
dtype: int64

### Cleaning MatchHomeTeamInfo

In [39]:
MatchHomeTeamInfo.isnull().sum()

match_id            0
name                0
slug                0
gender             14
user_count          0
residence        4938
birthplace       3027
height           3133
weight           5011
plays            3545
turned_pro       5454
current_prize      51
total_prize        20
player_id           0
current_rank       75
name_code           0
country             0
full_name           0
dtype: int64

In [40]:
MatchHomeTeamInfo.loc[MatchHomeTeamInfo['gender'].isnull(), 'slug'].unique()
genders = {'mrva-maxim':'M','werren-paul':'M','massard-loann':'M','tucakovic-suana':'F',
           'cavallo-fernando':'M','karahan-atakan':'M'}


In [41]:
for slug, sex in genders.items():
    MatchHomeTeamInfo.loc[(MatchHomeTeamInfo['gender'].isnull()) & 
                          (MatchHomeTeamInfo['slug'].eq(slug)),'gender'] = sex

Firts we try to check if there is a record in our dataframe which has the birth place of the player that is a missing value in another record.
If the approach above fails, There is no way to fill residence and birthplace by other information in the dataframe, so we fill them with 'Unknown'.

In [42]:
null_birthplace_df = MatchHomeTeamInfo[MatchHomeTeamInfo['birthplace'].isnull()]
grouped_null_birthplace = null_birthplace_df.groupby('slug')
results = []
for slug, group in grouped_null_birthplace:
    original_group = MatchHomeTeamInfo[MatchHomeTeamInfo['slug'] == slug]
    has_non_null = original_group['birthplace'].notnull().any()
    results.append({
        'slug': slug,
        'null_birthplace_rows': len(group),
        'has_non_null_birthplace': has_non_null
        
    })


result_df = pd.DataFrame(results)
result_df.loc[result_df['has_non_null_birthplace'].eq(True)]

Unnamed: 0,slug,null_birthplace_rows,has_non_null_birthplace
39,bagnolini-daniel,1,True
119,cavallo-fernando,1,True
575,naef-celine,2,True


In [43]:
for index, row in result_df.loc[result_df['has_non_null_birthplace']].iterrows():
    slug = row['slug']
    b_place = MatchHomeTeamInfo.loc[MatchHomeTeamInfo['slug'].eq(slug) & 
                                    MatchHomeTeamInfo['birthplace'].notnull(), 'birthplace'].unique()
    MatchHomeTeamInfo.loc[MatchHomeTeamInfo['slug'].eq(slug) & 
                          MatchHomeTeamInfo['birthplace'].isnull(), 'birthplace'] = b_place[0] if len(b_place) > 0 else None

In [44]:
MatchHomeTeamInfo.fillna({'residence':'Unknown', 'birthplace':'unknown'},inplace=True)

In [45]:
# We fill null values of highet and weight columns using trimmed_mean of those column
MatchHomeTeamInfo['height'].astype('float64')
MatchHomeTeamInfo['weight'].astype('float64')
trimmed_mean_height = stats.trim_mean(MatchHomeTeamInfo['height'].dropna(), proportiontocut=0.05)
trimmed_mean_weight = stats.trim_mean(MatchHomeTeamInfo['weight'].dropna(), proportiontocut=0.05)
MatchHomeTeamInfo.fillna({'height':trimmed_mean_height, 'weight':trimmed_mean_weight}, inplace=True)


In [46]:
# for choosing between right or left hand,
# first we calculate the percentage of right handed and left handed players
available_data = MatchHomeTeamInfo.loc[MatchHomeTeamInfo['plays'].notnull()].shape[0]
right_handed_per = len(MatchHomeTeamInfo.loc[MatchHomeTeamInfo['plays'].eq('right-handed')]) / available_data
left_handed_per = len(MatchHomeTeamInfo.loc[MatchHomeTeamInfo['plays'].eq('left-handed')]) / available_data
print(f'Percentage of right handed players: {right_handed_per}')
print(f'Percentage of left handed players: {left_handed_per}')
# to keep the distribution of the original data we fill around 88 percent of null data by right-handed,
# and 12 percent by left-handed
plays_null_indicies = MatchHomeTeamInfo[MatchHomeTeamInfo['plays'].isnull()].index
right_handed_filling = int(len(plays_null_indicies) * right_handed_per)
# random indicies from null values will be selected to fill by right-handed
select_right_indicies = np.random.choice(plays_null_indicies,right_handed_filling,replace=False)
MatchHomeTeamInfo.loc[select_right_indicies,'plays'] = 'right-handed'

Percentage of right handed players: 0.86688
Percentage of left handed players: 0.13152


In [47]:
# The rest will be filled by 'left-handed'
MatchHomeTeamInfo.fillna({'plays':'left-handed'}, inplace=True)

In [48]:
# There is no relevant data which can help us to find the year when a player became pro,
# so the null values will be filled by 'Unknown'
MatchHomeTeamInfo.fillna({'turned_pro': 'Unknown'}, inplace=True)

In [49]:
# Median will be used to fill null values in current_prize and total_prize
MatchHomeTeamInfo['current_prize'].astype('float64')
MatchHomeTeamInfo['total_prize'].astype('float64')
curren_prize_median = MatchHomeTeamInfo['current_prize'].median()
total_prize_median = MatchHomeTeamInfo['total_prize'].median()
MatchHomeTeamInfo.fillna({'current_prize':curren_prize_median,'total_prize':total_prize_median}, inplace=True)

In [50]:
# there is no information here that can help us to find Ranking, 
# # so we replace null rankaning with -1 which indicate 'Not Ranked'
MatchHomeTeamInfo.fillna({'current_rank':-1}, inplace=True)

In [51]:
MatchHomeTeamInfo.isnull().sum()
# Now The data is Clean

match_id         0
name             0
slug             0
gender           0
user_count       0
residence        0
birthplace       0
height           0
weight           0
plays            0
turned_pro       0
current_prize    0
total_prize      0
player_id        0
current_rank     0
name_code        0
country          0
full_name        0
dtype: int64

### Cleaning MatchAwayScoreInfo

In [52]:
MatchAwayScoreInfo.isnull().sum()

match_id                 0
current_score         1674
display_score         1674
period_1              1666
period_2              1809
period_3              7287
period_4              9311
period_5              9316
period_1_tie_break    8697
period_2_tie_break    8809
period_3_tie_break    9155
period_4_tie_break    9318
period_5_tie_break    9318
normal_time           9319
dtype: int64

In [53]:
# There is columns called 'normal_time' which is 100 percent null, we drop this column.
MatchAwayScoreInfo.drop('normal_time', axis=1,inplace=True)

In [54]:
# We fill the null values from periods 3 to the end by -1, which by contract means 'Not Happened'
MatchAwayScoreInfo.fillna({'period_3':-1, 'period_4':-1, 'period_5':-1,
                           'period_1_tie_break':-1,'period_2_tie_break':-1, 'period_3_tie_break':-1,
                          'period_4_tie_break':-1,'period_5_tie_break':-1 },inplace=True)



In [55]:
null_ind = MatchAwayScoreInfo.loc[MatchAwayScoreInfo['period_1'].isnull(),'match_id'].unique()

In [56]:
matched_rows = []

for ind in null_ind:
    matched_data = MatchAwayScoreInfo.loc[MatchAwayScoreInfo['match_id'].eq(ind)]
    if not matched_data['period_1'].isnull().all():  # Check if not all 'period_1' values are null
        matched_rows.append(matched_data)

# Concatenate the list into a new DataFrame
try:
    matched_df = pd.concat(matched_rows, ignore_index=True)
    matched_df
except:
    print('Noting to Concat')


                                    

Noting to Concat


In [57]:
MatchAwayScoreInfo.isnull().sum()

match_id                 0
current_score         1674
display_score         1674
period_1              1666
period_2              1809
period_3                 0
period_4                 0
period_5                 0
period_1_tie_break       0
period_2_tie_break       0
period_3_tie_break       0
period_4_tie_break       0
period_5_tie_break       0
dtype: int64

to calculate best option for these 4 columns: curren_score, display_score, period_1, period_2, 
knn will be used


In [58]:
sub_data_set = MatchAwayScoreInfo[['current_score','display_score','period_1','period_2']]
sub_set_filled = knn_filler(sub_data_set)

In [59]:
# now we replace the columns in the main data frame with data in sub_set_filled
columns = ['current_score','display_score','period_1','period_2']
MatchAwayScoreInfo[columns] = sub_set_filled[columns]

In [60]:
MatchAwayScoreInfo.isnull().sum()

match_id              0
current_score         0
display_score         0
period_1              0
period_2              0
period_3              0
period_4              0
period_5              0
period_1_tie_break    0
period_2_tie_break    0
period_3_tie_break    0
period_4_tie_break    0
period_5_tie_break    0
dtype: int64

### Cleaning MatchHomeScoreInfo

In [61]:
MatchHomeScoreInfo.isnull().sum()

match_id                 0
current_score         1674
display_score         1674
period_1              1666
period_2              1809
period_3              7287
period_4              9311
period_5              9316
period_1_tie_break    8697
period_2_tie_break    8809
period_3_tie_break    9155
period_4_tie_break    9318
period_5_tie_break    9318
normal_time           9319
dtype: int64

In [62]:
# There is columns called 'normal_time' which is 100 percent null, we drop this column.
MatchHomeScoreInfo.drop('normal_time', axis=1,inplace=True)

In [63]:
# We fill the null values from periods 3 to the end by -1, which by contract means 'Not Happened'
MatchHomeScoreInfo.fillna({'period_3':-1, 'period_4':-1, 'period_5':-1,
                           'period_1_tie_break':-1,'period_2_tie_break':-1, 'period_3_tie_break':-1,
                          'period_4_tie_break':-1,'period_5_tie_break':-1 },inplace=True)



to calculate best option for these 4 columns: curren_score, display_score, period_1, period_2, 
knn will be used


In [64]:
sub_data_set = MatchHomeScoreInfo[['current_score','display_score','period_1','period_2']]
sub_set_filled = knn_filler(sub_data_set)

In [65]:
# now we replace the columns in the main data frame with data in sub_set_filled
columns = ['current_score','display_score','period_1','period_2']
MatchHomeScoreInfo[columns] = sub_set_filled[columns]

In [66]:
MatchHomeScoreInfo.isnull().sum()

match_id              0
current_score         0
display_score         0
period_1              0
period_2              0
period_3              0
period_4              0
period_5              0
period_1_tie_break    0
period_2_tie_break    0
period_3_tie_break    0
period_4_tie_break    0
period_5_tie_break    0
dtype: int64

### Cleaning TimeInfo

In [67]:
TimeInfo.isnull().sum()

match_id                             0
period_1                          3659
period_2                          3770
period_3                          7683
period_4                          9311
period_5                          9316
current_period_start_timestamp    3437
dtype: int64

there are around 3500 records that there is no information about their first and second periods. 
It is to big to simply remove the rows so we fill these records using trimmed mean of each columns.

In [68]:
# first we need to calculate how many sets has been played 
match_id_null_time = TimeInfo.loc[TimeInfo['period_1'].isnull() | 
                                  TimeInfo['period_2'].isnull() ,'match_id']
periods = ['period_1', 'period_2', 'period_3', 'period_4', 'period_5']
played_periods = {}
for match_id in match_id_null_time:
    matched_data = MatchAwayScoreInfo.loc[MatchAwayScoreInfo['match_id'].eq(match_id)]
    periods_played = matched_data[periods].ne(-1).sum(axis=1).max()
    played_periods[match_id] = periods_played

In [69]:
mean_time_period1 = TimeInfo['period_1'].mean()
mean_time_period2 = TimeInfo['period_2'].mean()
mean_time_period3 = TimeInfo['period_3'].mean()
mean_time_period4 = TimeInfo['period_4'].mean()
mean_time_period5 = TimeInfo['period_5'].mean()
means = [mean_time_period1,mean_time_period2,mean_time_period3,
         mean_time_period4,mean_time_period5]

In [70]:
means

[2898.1418727915193,
 3269.7008469994594,
 3631.8319070904645,
 3005.125,
 3361.6666666666665]

In [71]:
# Now we update periods by trimmed mean that we calculated based on the number of periods that a match has.
periods = ['period_1', 'period_2', 'period_3', 'period_4', 'period_5']

for match_id in match_id_null_time:
    if match_id in played_periods:
        for i in range(played_periods[match_id]):
            if TimeInfo.loc[TimeInfo['match_id'].eq(match_id), periods[i]].isnull().all():
                TimeInfo.loc[TimeInfo['match_id'] == match_id, periods[i]] = means[i]
            else:
                continue
    else:
        continue


In [72]:
# There are afew games that went on to periods 3 - 5. -1 placed for periods that was not happened.
# now we check the scores if there is -1 for a period we update time by 0.
periods = ['period_3', 'period_4', 'period_5']


for match_id in TimeInfo['match_id']:
    for period in periods:
        score_value = MatchAwayScoreInfo.loc[MatchAwayScoreInfo['match_id'] == match_id, period].values[0]
        if score_value == -1:
            TimeInfo.loc[TimeInfo['match_id'] == match_id, period] = 0

        if pd.isna(TimeInfo.loc[TimeInfo['match_id'] == match_id, period]).values[0]:
            TimeInfo.loc[TimeInfo['match_id'] == match_id, period] = 0


In [73]:
# we use KNN to fill up the current_period_start_timestamp column
time_imputed = knn_filler(TimeInfo,k=10)
TimeInfo['current_period_start_timestamp'] = time_imputed['current_period_start_timestamp']

In [74]:
TimeInfo.isnull().sum()
# Now The data is clean

match_id                          0
period_1                          0
period_2                          0
period_3                          0
period_4                          0
period_5                          0
current_period_start_timestamp    0
dtype: int64

### Cleaning PeriodInfo

In [75]:
PeriodInfo.isnull().sum()

match_id                        0
period                          0
statistic_category_name         0
statistic_name                  0
home_stat                       0
away_stat                       0
compare_code                    0
statistic_type                  0
value_type                      0
home_value                      0
away_value                      0
home_total                 245491
away_total                 245491
dtype: int64

In [76]:
PeriodInfo.loc[PeriodInfo['home_value'].eq(0),'home_total'].unique()

array([ nan,  48.,   1.,  27.,  26.,   6.,  12.,  10.,   0.,   2.,   3.,
        24.,  31.,  57.,  18.,  14.,   9.,  22.,   8.,  16.,   5.,   4.,
        15.,  11.,  25.,  52.,  34.,   7.,  19.,  13.,  41.,  23.,  21.,
        20.,  29.,  51.,  28.,  30.,  17.,  37.,  53.,  44.,  55.,  35.,
        33.,  54.,  32.,  38.,  40.,  36.,  45.,  46.,  74.,  81.,  42.,
        69.,  39.,  60.,  49.,  77.,  91.,  68.,  47.,  89.,  43.,  66.,
        50.,  62.,  61.,  83.,  65.,  56.,  93.,  95.,  76.,  86.,  72.,
        63., 124., 107.,  59.,  78.,  64.,  80.,  79., 108., 102.,  58.,
        73.,  67., 109.,  71.,  85., 113.,  87.,  92.,  90.,  70.,  94.])

In [77]:
PeriodInfo.loc[PeriodInfo['away_value'].eq(0),'away_total'].unique()

array([ nan,   1.,  10.,   7.,   0.,  23.,   3.,  12.,   2.,  40.,   5.,
        14.,  15.,  20.,  46.,  13.,  24.,  17.,   4.,  19.,   9.,  11.,
        27.,  16.,  36.,   6.,  84.,  18.,  41.,  35.,  22.,  32.,  29.,
        34.,   8.,  37.,  39.,  96.,  21.,  38.,  28.,  33.,  25.,  50.,
        44.,  43.,  64.,  31.,  70.,  58.,  26.,  47.,  82.,  67.,  30.,
       100.,  53.,  76.,  68.,  42.,  57.,  49.,  72.,  48.,  55.,  56.,
        62.,  63.,  66.,  59.,  52.,  89.,  54.,  60.,  45., 111.,  85.,
        61.,  71.,  65.,  87.,  69., 105.,  79.,  78.,  91.,  83.,  51.,
        74.,  86., 106.,  73.,  95.,  77.,  97.,  99.,  81.,  75., 124.,
       121.,  94.])

In [78]:
# since there is no clear relatioship between home_value and home_total,
# and also between away_valu and away_total knn will be used to fill these two columns
period_imputed = knn_filler(PeriodInfo,k=20)
PeriodInfo[['home_total','away_total']] = period_imputed[['home_total','away_total']]
# Now The data is clean

### Cleaning OddsInfo

In [79]:
OddsInfo.isnull().sum()

match_id                       0
market_id                      0
market_name                    0
is_live                        0
suspended                      0
initial_fractional_value       0
fractional_value               0
choice_name                    0
choice_source_id               0
winnig                      3074
change                         0
dtype: int64

In [80]:
OddsInfo.head()

Unnamed: 0,match_id,market_id,market_name,is_live,suspended,initial_fractional_value,fractional_value,choice_name,choice_source_id,winnig,change
0,12260075,1,full_time,True,True,1/9,9/11,1,396197223,False,0
1,12260075,1,full_time,True,True,3/3,5/5,2,396197355,True,0
2,12260075,11,first_set_winner,True,False,16/9,5/3,1,396201975,False,0
3,12260075,11,first_set_winner,True,False,0/3,18/4,2,396202026,False,0
4,12260075,12,total_games_won,False,False,9/3,10/3,Over,397558778,False,0


In [81]:
# the relationship is not clear so knn will be used to fill up null values in winning columns.
odd_imputed = knn_filler(OddsInfo,k=20)
OddsInfo['winnig'] = odd_imputed['winnig']

In [82]:
OddsInfo.isnull().sum()
# Now the data is clean

match_id                    0
market_id                   0
market_name                 0
is_live                     0
suspended                   0
initial_fractional_value    0
fractional_value            0
choice_name                 0
choice_source_id            0
winnig                      0
change                      0
dtype: int64

In [85]:
MatchEventInfo.to_parquet('../Cleaned_data/MatchEventInfo.parquet')
MatchVotesInfo.to_parquet('../Cleaned_data/MatchVotesInfo.parquet')
MatchTournamentInfo.to_parquet('../Cleaned_data/MatchTournamentInfo.parquet')
MatchSeasonInfo.to_parquet('../Cleaned_data/MatchSeasonInfo.parquet')
MatchVenueInfo.to_parquet('../Cleaned_data/MatchVenueInfo.parquet')
MatchAwayTeamInfo.to_parquet('../Cleaned_data/MatchAwayTeamInfo.parquet')
MatchRoundInfo.to_parquet('../Cleaned_data/MatchRoundInfo.parquet')
MatchAwayScoreInfo.to_parquet('../Cleaned_data/MatchAwayScoreInfo.parquet')
MatchHomeScoreInfo.to_parquet('../Cleaned_data/MatchHomeScoreInfo.parquet')
MatchHomeTeamInfo.to_parquet('../Cleaned_data/MatchHomeTeamInfo.parquet')
PowerInfo.to_parquet('../Cleaned_data/PowerInfo.parquet')
GameInfo.to_parquet('../Cleaned_data/GameInfo.parquet')
OddsInfo.to_parquet('../Cleaned_data/OddsInfo.parquet')
TimeInfo.to_parquet('../Cleaned_data/TimeInfo.parquet')
PeriodInfo.to_parquet('../Cleaned_data/PeriodInfo.parquet')

In [84]:
PeriodInfo

Unnamed: 0,match_id,period,statistic_category_name,statistic_name,home_stat,away_stat,compare_code,statistic_type,value_type,home_value,away_value,home_total,away_total
0,12260075,ALL,service,aces,2,7,2,positive,event,0,0,164.0,164.0
1,12260075,ALL,service,double_faults,2,1,1,negative,event,0,0,164.0,164.0
2,12260075,ALL,service,first_serve,44/72 (61%),48/65 (74%),2,positive,team,14,60,72.0,65.0
3,12260075,ALL,service,second_serve,26/28 (93%),16/17 (94%),2,positive,team,32,14,28.0,17.0
4,12260075,ALL,service,first_serve_points,30/44 (68%),42/48 (88%),2,positive,team,44,58,44.0,48.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
401276,12386383,3RD,return,first_serve_return_points,6/17 (35%),6/12 (50%),2,positive,team,4,8,17.0,12.0
401277,12386383,3RD,return,second_serve_return_points,1/6 (16%),6/10 (60%),2,positive,team,0,0,6.0,10.0
401278,12386383,3RD,return,return_games_played,4,4,3,positive,event,6,4,164.0,164.0
401279,12386383,3RD,return,break_points_converted,0,2,2,positive,event,0,2,164.0,164.0
