In [2]:
import pandas as pd
import numpy as np
import seaborn as sbn
import matplotlib.pyplot as plt

In [3]:
def _read_cleaned(table_name):
    """Read '../Cleaned_data/<table_name>.parquet' into a DataFrame."""
    return pd.read_parquet('../Cleaned_data/' + table_name + '.parquet')


# Match-level tables.
MatchEventInfo = _read_cleaned('MatchEventInfo')
MatchVotesInfo = _read_cleaned('MatchVotesInfo')
MatchTournamentInfo = _read_cleaned('MatchTournamentInfo')
MatchSeasonInfo = _read_cleaned('MatchSeasonInfo')
MatchVenueInfo = _read_cleaned('MatchVenueInfo')
MatchAwayTeamInfo = _read_cleaned('MatchAwayTeamInfo')
MatchAwayScoreInfo = _read_cleaned('MatchAwayScoreInfo')
MatchRoundInfo = _read_cleaned('MatchRoundInfo')
MatchHomeScoreInfo = _read_cleaned('MatchHomeScoreInfo')
MatchHomeTeamInfo = _read_cleaned('MatchHomeTeamInfo')
# Remaining tables used by the later questions.
PowerInfo = _read_cleaned('PowerInfo')
GameInfo = _read_cleaned('GameInfo')
OddsInfo = _read_cleaned('OddsInfo')
TimeInfo = _read_cleaned('TimeInfo')
PeriodInfo = _read_cleaned('PeriodInfo')

# Q_10

In [None]:
# The same player attributes are taken from the home-side and the away-side tables.
player_columns = ['player_id', 'full_name', 'gender', 'height', 'weight', 'plays', 'current_rank']
home_player = MatchHomeTeamInfo[player_columns]
away_player = MatchAwayTeamInfo[player_columns]

In [None]:
# Stack both sides and keep one copy of every distinct row.
players = pd.concat([home_player, away_player], axis=0).drop_duplicates()
# Rows whose current_rank equals -1 are excluded
# (presumably -1 marks "no rank available" — confirm against the cleaning step).
has_rank = players['current_rank'] != -1
player_with_rank = players.loc[has_rank]

In [None]:
# Split the ranked players by their 'gender' code.
is_male = player_with_rank['gender'] == 'M'
is_female = player_with_rank['gender'] == 'F'
male_player = player_with_rank[is_male]
female_player = player_with_rank[is_female]


In [None]:
def _height_rank_corr(frame):
    """Correlation between the 'height' and 'current_rank' columns of `frame`."""
    heights, ranks = frame['height'], frame['current_rank']
    return heights.corr(ranks)


players_corr = _height_rank_corr(player_with_rank)
male_player_corr = _height_rank_corr(male_player)
female_player_corr = _height_rank_corr(female_player)

print(f'Height_Rank_correlation for All the player: {players_corr}')
print(f'Height_Rank_correlation for Male the player: {male_player_corr}')
print(f'Height_Rank_correlation for Female the player: {female_player_corr}')

An analysis of the correlation between height and ranking reveals that, for males, there is no strong relationship between these two variables. This suggests that factors other than height are more significant in determining rankings among males. In contrast, for females, there is a somewhat stronger correlation between height and ranking, indicating that height may play a more noticeable role in influencing rankings in this group.

# Q_11

In [None]:
# Preview the first rows of TimeInfo to see which columns are available.
TimeInfo.head()

In [None]:
# Average Duration of Periods in each Match
period_columns = ['period_1', 'period_2', 'period_3', 'period_4', 'period_5']
matchid_index = TimeInfo.set_index(['match_id'])[period_columns]
# Zero entries are masked to NaN so they are left out of the row-wise mean.
nonzero_periods = matchid_index.where(matchid_index.ne(0))
avg_duration = nonzero_periods.mean(axis=1).round(3)
# One row per TimeInfo row: the match_id next to its mean period duration.
avg_duration_df = avg_duration.rename('average_duration').reset_index()
avg_duration_df

In [None]:
# Total duration per match, then the mean of those totals.
# NOTE(review): every column left after dropping the timestamp and indexing by
# match_id is summed — assumes only the period duration columns remain; confirm.
time_subset = (
    TimeInfo
    .drop(columns=['current_period_start_timestamp'])
    .set_index('match_id')
)
nonzero_durations = time_subset.where(time_subset.ne(0))
sum_of_periods = (
    nonzero_durations
    .sum(axis=1)
    .rename('periods_sum')
    .reset_index()
)
avg_of_all_matches = sum_of_periods['periods_sum'].mean()
print(f'The Average Duration of Matches in May: {avg_of_all_matches}')

# Q_12

In [None]:
# Per-match player records from BOTH sides of every match.
# The away side must come from MatchAwayTeamInfo; reading MatchHomeTeamInfo twice
# would only duplicate the home players and leave the away players out.
match_player_columns = ['match_id', 'player_id', 'full_name', 'gender']
home_player = MatchHomeTeamInfo[match_player_columns]
away_player = MatchAwayTeamInfo[match_player_columns]
players_in_match = pd.concat([home_player, away_player])

In [None]:
# Distinct match ids that have at least one player row of the given gender.
is_male_row = players_in_match['gender'] == 'M'
is_female_row = players_in_match['gender'] == 'F'
male_matchid = players_in_match.loc[is_male_row, 'match_id'].drop_duplicates()
femal_matchid = players_in_match.loc[is_female_row, 'match_id'].drop_duplicates()

In [None]:
# Restrict GameInfo to the rows belonging to men's / women's matches.
in_male_match = GameInfo['match_id'].isin(male_matchid)
in_female_match = GameInfo['match_id'].isin(femal_matchid)
game_info_male = GameInfo.loc[in_male_match]
game_info_female = GameInfo.loc[in_female_match]

In [None]:
# Largest game_id reached in every (match, set) pair
# (assumes game_id counts games upward within a set — TODO confirm).
male_last_game = (
    game_info_male
    .groupby(['match_id', 'set_id'])['game_id']
    .max()
    .rename('max_game_id')
    .reset_index()
)
# Average of that value over all matches, reported per set_id.
male_game_per_set = (
    male_last_game
    .groupby('set_id')['max_game_id']
    .mean()
    .rename('Mean Game per Set')
    .reset_index()
)
male_game_per_set
                

In [None]:
# Largest game_id reached in every (match, set) pair
# (assumes game_id counts games upward within a set — TODO confirm).
female_last_game = (
    game_info_female
    .groupby(['match_id', 'set_id'])['game_id']
    .max()
    .rename('max_game_id')
    .reset_index()
)
# Average of that value over all matches, reported per set_id.
female_game_per_set = (
    female_last_game
    .groupby('set_id')['max_game_id']
    .mean()
    .rename('Mean Game per Set')
    .reset_index()
)
female_game_per_set

# Q_13

In [None]:
# Collect the handedness ('plays') records from both sides and keep each
# distinct (player_id, full_name, plays) row once.
handedness_columns = ['player_id', 'full_name', 'plays']
away_players = MatchAwayTeamInfo[handedness_columns]
home_players = MatchHomeTeamInfo[handedness_columns]
players = pd.concat([away_players, home_players]).drop_duplicates()

In [None]:
# Display the combined, de-duplicated players frame.
players

In [None]:
# Shares are taken over ALL rows of `players`, including rows whose 'plays'
# value is neither 'left-handed' nor 'right-handed'.
plays = players['plays']
num_all_players = len(players)
num_left_handed = len(players[plays == 'left-handed'])
num_right_handed = len(players[plays == 'right-handed'])
perc_of_left_handed = num_left_handed / num_all_players
perc_of_right_handed = num_right_handed / num_all_players

In [None]:
# Report each share as a percentage with two decimals.
print(f"Right Handed Players: {perc_of_right_handed:.2%}")
print(f"Left Handed Players: {perc_of_left_handed:.2%}")

In [None]:
# Pie chart of the two handedness shares; the left-handed wedge is pulled out.
fig, ax = plt.subplots()
ax.pie(
    [perc_of_left_handed, perc_of_right_handed],
    labels=['Left Handed', 'Right Handed'],
    colors=['lightgreen', 'lightblue'],
    explode=(0.1, 0),
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
ax.set_title('Distribution of Left Handed and Right Handed Players')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()



The analysis shows that most players (around 75%) are right-handed.

# Q_14

In [None]:
# Number of MatchTournamentInfo rows for each ground_type value.
ground_count = MatchTournamentInfo['ground_type'].value_counts()

In [None]:
# Bar chart of the number of rows per ground type.
fig, ax = plt.subplots(figsize=(6, 4))
ground_count.plot(kind='bar', ax=ax)
ax.set_xlabel('Ground Type')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
ax.legend()
plt.show()

The analysis indicates that the majority of matches were played on 'Red Clay' ground.

# Q_15

In [None]:
# One row per distinct (player_id, country) pair across both sides; only the
# country column is used afterwards.
country_columns = ['player_id', 'country']
away_players = MatchAwayTeamInfo[country_columns]
home_players = MatchHomeTeamInfo[country_columns]
players = pd.concat([away_players, home_players]).drop_duplicates()
countries_from_players = players['country']

In [None]:
countries_from_venue = MatchVenueInfo['country']
# Count distinct countries across players and venues. nunique() ignores missing
# values, whereas len(.unique()) would count a NaN/None entry as one more country.
all_unique_countries = pd.concat([countries_from_players, countries_from_venue]).nunique()
print(f'Countries That are repersented in the dataset: {all_unique_countries}')

# Q_16

In [None]:
# Rank and gender records from both sides, stacked (duplicates are still present here).
rank_columns = ['player_id', 'full_name', 'gender', 'current_rank']
away_players = MatchAwayTeamInfo[rank_columns]
home_players = MatchHomeTeamInfo[rank_columns]
players = pd.concat([away_players, home_players])

In [None]:
# Keep each distinct player row once, then select ranks in (0, 10] per gender.
players = players.drop_duplicates()
rank_in_top_ten = players['current_rank'].gt(0) & players['current_rank'].le(10)
top_ten_male = players[rank_in_top_ten & players['gender'].eq('M')]
top_ten_female = players[rank_in_top_ten & players['gender'].eq('F')]

In [None]:
# Show the selected male players ordered by rank (the sorted copy is only displayed, not stored).
top_ten_male.sort_values(by='current_rank')

To find the match winner from the GameInfo dataframe, the following steps must be taken:
<ol>
    <li>Determine points won by each team in each game</li>
    <li>Aggregate points to determine the game winner</li>
    <li>Aggregate games to determine the set winner</li>
    <li> Determine the match winner</li>
</ol>

In [None]:
# Step 1: for every GameInfo row, flag which side has the strictly higher score.
# NOTE(review): assumes home_score / away_score are directly comparable per row — confirm.
GameInfo['home_game_win'] = GameInfo['home_score'] > GameInfo['away_score']
GameInfo['away_game_win'] = GameInfo['away_score'] > GameInfo['home_score']

# Step 2: count those flags per (match, set, game).
game_winners = (
    GameInfo
    .groupby(['match_id', 'set_id', 'game_id'])
    .agg(
        home_game_wins=('home_game_win', 'sum'),
        away_game_wins=('away_game_win', 'sum'),
    )
    .reset_index()
)

# Determine game winners (1 for home, -1 for away).
# Vectorised with np.where instead of a row-wise apply. The rule is unchanged:
# home wins only with strictly more flagged rows, so a tie is credited to away.
game_winners['game_winner'] = np.where(
    game_winners['home_game_wins'] > game_winners['away_game_wins'], 1, -1
)

# Step 3: games won by each side within every set (boolean columns are summed).
set_winners = (
    game_winners
    .assign(
        home_game_won=game_winners['game_winner'].eq(1),
        away_game_won=game_winners['game_winner'].eq(-1),
    )
    .groupby(['match_id', 'set_id'])
    .agg(
        total_home_game_wins=('home_game_won', 'sum'),
        total_away_game_wins=('away_game_won', 'sum'),
    )
    .reset_index()
)

# Determine set winners (1 for home, -1 for away); ties again go to away.
set_winners['set_winner'] = np.where(
    set_winners['total_home_game_wins'] > set_winners['total_away_game_wins'], 1, -1
)

# Step 4: sets won by each side within every match.
match_winners = (
    set_winners
    .assign(
        home_set_won=set_winners['set_winner'].eq(1),
        away_set_won=set_winners['set_winner'].eq(-1),
    )
    .groupby('match_id')
    .agg(
        total_home_set_wins=('home_set_won', 'sum'),
        total_away_set_wins=('away_set_won', 'sum'),
    )
    .reset_index()
)

# Determine match winners (Home or Away); an equal set count is labelled 'Away'.
match_winners['match_winner'] = np.where(
    match_winners['total_home_set_wins'] > match_winners['total_away_set_wins'],
    'Home',
    'Away',
)


In [None]:
# Women's and men's matches are separate categories, so build a
# match_id -> gender lookup to attach to the match_winners dataframe.
matchid_gender = pd.merge(
    MatchHomeTeamInfo[['match_id', 'gender']],
    MatchAwayTeamInfo[['match_id', 'gender']],
    on='match_id',
    how='outer',
    suffixes=['_home', '_away']
)
# Prefer the away-side gender and fall back to the home-side value when it is
# missing (vectorised replacement for the row-wise apply).
matchid_gender['gender'] = matchid_gender['gender_away'].fillna(matchid_gender['gender_home'])
# Keep exactly one row per match_id: if either input table repeats a match_id,
# the outer merge yields several rows for it, and a later left merge on
# match_id would then duplicate that match.
matchid_gender = (
    matchid_gender
    .drop(columns=['gender_home', 'gender_away'])
    .drop_duplicates(subset='match_id')
    .reset_index(drop=True)
)

In [None]:
# Attach the gender to every match and drop the set tallies.
match_winners = (
    match_winners
    .merge(matchid_gender, on='match_id', how='left')
    .drop(columns=['total_home_set_wins', 'total_away_set_wins'])
)
match_winners.head()

In [None]:
def get_winner_player_id(row):
    """Return the player_id of the side named in row['match_winner'].

    'Away' is looked up in MatchAwayTeamInfo and 'Home' in MatchHomeTeamInfo,
    using row['match_id']; the first matching player_id is returned.
    Returns None when match_winner is neither value or no row matches.
    """
    wanted_match = row['match_id']
    side = row['match_winner']

    if side == 'Away':
        team_info = MatchAwayTeamInfo
    elif side == 'Home':
        team_info = MatchHomeTeamInfo
    else:
        return None

    candidate_ids = team_info.loc[team_info['match_id'] == wanted_match, 'player_id']
    if len(candidate_ids) == 0:
        return None
    return candidate_ids.values[0]

def get_losser_player_id(row):
    """Return the player_id of the side opposite to row['match_winner'].

    When the winner is 'Home' the loser is looked up in MatchAwayTeamInfo, and
    when it is 'Away' in MatchHomeTeamInfo, using row['match_id']; the first
    matching player_id is returned. Returns None when match_winner is neither
    value or no row matches.
    """
    wanted_match = row['match_id']
    winning_side = row['match_winner']

    if winning_side == 'Home':
        losing_team_info = MatchAwayTeamInfo
    elif winning_side == 'Away':
        losing_team_info = MatchHomeTeamInfo
    else:
        return None  # unknown match_winner value

    loser_ids = losing_team_info.loc[losing_team_info['match_id'] == wanted_match, 'player_id']
    if len(loser_ids) == 0:
        return None
    return loser_ids.values[0]
    
# Resolve the winning and losing player for every match row.
match_winners['winner_player_id'] = match_winners.apply(get_winner_player_id, axis=1)
match_winners['loser_player_id'] = match_winners.apply(get_losser_player_id, axis=1)

In [None]:
# Display match_winners to inspect the winner / loser columns.
match_winners

In [None]:
def player_with_most_wins_against_top_ten(match_winner, top_ten_players, gender):
    """Find the non-top-ten player with the most wins over top-ten players.

    Parameters
    ----------
    match_winner : DataFrame
        One row per match with 'gender', 'winner_player_id' and
        'loser_player_id' columns.
    top_ten_players : DataFrame
        Must contain a 'player_id' column listing the top-ten players.
    gender : value compared against the 'gender' column to select matches.

    Returns
    -------
    tuple or None
        (player_id, number_of_wins) for the player with the most qualifying
        wins, or None when no match qualifies.
    """
    top_ten_ids = top_ten_players['player_id']
    winner_in_top_ten = match_winner['winner_player_id'].isin(top_ten_ids)
    loser_in_top_ten = match_winner['loser_player_id'].isin(top_ten_ids)

    # A qualifying win is "winner outside the top ten, loser inside it".
    # winner_player_id / loser_player_id already identify the players whichever
    # side won, so the same test applies to Home and Away wins. The previous
    # Away branch had the two conditions swapped and counted top-ten players
    # beating lower-ranked opponents.
    wins_against_top_ten = match_winner[
        (match_winner['gender'] == gender) & ~winner_in_top_ten & loser_in_top_ten
    ]

    # Count wins by non-top ten players against top ten players
    wins_count = wins_against_top_ten['winner_player_id'].value_counts()

    if wins_count.empty:
        return None
    return wins_count.idxmax(), wins_count.max()


# Run the search once per category; each result is a (player_id, wins) tuple or None.
most_winner_against_top_ten_male = player_with_most_wins_against_top_ten(
    match_winners, top_ten_male, gender='M'
)
most_winner_against_top_ten_female = player_with_most_wins_against_top_ten(
    match_winners, top_ten_female, gender='F'
)


In [None]:
def _full_names_for(player_id):
    """Return the distinct full_name values recorded for `player_id`.

    Both the away and the home table are searched: a player who only ever
    appears on the home side has no row in MatchAwayTeamInfo, and an empty
    name array would make the summary frame below come out empty.
    """
    away_names = MatchAwayTeamInfo.loc[MatchAwayTeamInfo['player_id'].eq(player_id), 'full_name']
    home_names = MatchHomeTeamInfo.loc[MatchHomeTeamInfo['player_id'].eq(player_id), 'full_name']
    return pd.concat([away_names, home_names]).unique()


# NOTE: each result is indexed as a (player_id, wins) tuple; if the search
# returned None for a category, the indexing below raises a TypeError.
most_winner_male = {'Player_Id': most_winner_against_top_ten_male[0],
                    'Full_Name': _full_names_for(most_winner_against_top_ten_male[0]),
                    'Number_Of_Wins': most_winner_against_top_ten_male[1],
                    'Category': "Male"}
most_winner_female = {'Player_Id': most_winner_against_top_ten_female[0],
                      'Full_Name': _full_names_for(most_winner_against_top_ten_female[0]),
                      'Number_Of_Wins': most_winner_against_top_ten_female[1],
                      'Category': 'Female'}
male_winner = pd.DataFrame(most_winner_male)
female_winner = pd.DataFrame(most_winner_female)
# ignore_index=True renumbers the rows, so the former `keys` argument had no
# effect on the result and is omitted; the 'Category' column carries the label.
most_winner_combined = pd.concat([male_winner, female_winner], ignore_index=True)
most_winner_combined

# Q_17

In [37]:
# List the distinct statistic_name values that belong to the 'service' category.
PeriodInfo.loc[PeriodInfo['statistic_category_name'].eq('service'), 'statistic_name'].unique()

array(['aces', 'double_faults', 'first_serve', 'second_serve',
       'first_serve_points', 'second_serve_points',
       'service_games_played', 'break_points_saved'], dtype=object)

In [7]:
# Preview the first rows of PeriodInfo.
PeriodInfo.head()

Unnamed: 0,match_id,period,statistic_category_name,statistic_name,home_stat,away_stat,compare_code,statistic_type,value_type,home_value,away_value,home_total,away_total
0,12260075,ALL,service,aces,2,7,2,positive,event,0,0,164.0,164.0
1,12260075,ALL,service,double_faults,2,1,1,negative,event,0,0,164.0,164.0
2,12260075,ALL,service,first_serve,44/72 (61%),48/65 (74%),2,positive,team,14,60,72.0,65.0
3,12260075,ALL,service,second_serve,26/28 (93%),16/17 (94%),2,positive,team,32,14,28.0,17.0
4,12260075,ALL,service,first_serve_points,30/44 (68%),42/48 (88%),2,positive,team,44,58,44.0,48.0


Break Points Converted: This is the direct measure of breaks of serve. Each converted break point results in a break of serve.
If the statistic indicates, for example, 5 break points converted, it means the player has broken their opponent's serve 5 times.<br>
'From ChatGPT'

In [50]:
# Keep the rows whose period is NOT 'ALL' (presumably the per-period rows, with
# 'ALL' being the whole-match summary — confirm).
filter_period_all = PeriodInfo[PeriodInfo['period'] != 'ALL']
is_break_point_row = filter_period_all['statistic_name'] == 'break_points_converted'
# NOTE(review): .count() counts how many 'break_points_converted' ROWS each
# match has; it does not add up the statistic's home/away values. Confirm this
# is the intended measure of breaks of serve.
count_serve_break = (
    filter_period_all.loc[is_break_point_row]
    .groupby(['match_id'])['statistic_name']
    .count()
    .rename('break_of_serve_count')
    .reset_index()
)

In [52]:
# Mean of the per-match break_of_serve_count values.
mean_serve_break = count_serve_break['break_of_serve_count'].mean()
print(f"Mean of break points converted per match: {mean_serve_break}")

Mean of break points converted per match: 2.2767962308598353


In [1]:
# Number of missing values in each GameInfo column.
# `Gam` was an undefined name; GameInfo is the frame loaded at the top of the
# notebook, so the loading cell must have been run before this one.
print(GameInfo.isnull().sum())

NameError: name 'GameInfo' is not defined