# Version 5

### Changelog Version 1

* Added function to combine all ranking data files to one dataframe
* Added plot of big four points over time

### Changelog Version 2

* Added function to clean match_charting dataframes (should work for all match_charting files in the JeffSackman github)
    * Creates date columnns
    * Extracts player names from the match
    * Cross references ranking_dataframe to match players with their ranks at the time
    
### Changelog Version 3

* Split up the function created in changelog 2
* Fixed some bugs with the function

### Changelog Version 4

* Started working with the rally file
* Created plot for Avg # of points played per match according to year
* Added function to categorize/label player ranks
* Created Pie charts according to player rank category
    * Player/Opponent Winners according to rank category
    * Player/Opponent Unforced Errors according to rank category
    * Player/Opponent Points Won according to rank category
    
### Changelog Version 5

* Changed pie charts to barcharts
    * Also included statistics of big four into the charts
* Will work to clean up code
    * i.e. functions instead of just hard coding everything

## Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import datetime

# Data Visualization
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected = True)

# Models
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

## ranking_df

In [2]:
def ranking_df_creator(ranking_file_path):
    ranking_files_list = os.listdir(ranking_file_path)
    ranking_df = pd.DataFrame()
    col_names = ['date', 
               'year',
               'month',
               'day',
               'rank_text',
               'rank_number',
               'movement',
               'direction',
               'age',
               'points',
               'tournaments_played',
               'player_profile',
               'player_name',
               'player_id']
    
    for i in range(len(ranking_files_list)):
        tmp_df = pd.read_csv(str(ranking_file_path + ranking_files_list[i]), header = None, names = col_names)
        ranking_df = pd.concat([ranking_df, tmp_df])
    
    ranking_df['date'] = ranking_df['date'].replace('\.','/', regex=True)
    ranking_df['date'] = pd.to_datetime(ranking_df['date'])
    ranking_df = ranking_df.sort_values(by = 'date')
    ranking_df = ranking_df.reset_index(drop = True)
    
    return ranking_df

ranking_file_path = 'C:/Users/kchu8/Desktop/STA160/Ranking_Files/'
ranking_df = ranking_df_creator(ranking_file_path)

In [3]:
ranking_df.head(3)

Unnamed: 0,date,year,month,day,rank_text,rank_number,movement,direction,age,points,tournaments_played,player_profile,player_name,player_id
0,1973-08-23,1973,8,23,2,2,,,24.0,0,0,/en/players/manuel-orantes/o017/overview,manuel-orantes,o017
1,1973-08-23,1973,8,23,67,67,,,33.0,0,0,/en/players/bob%20-carmichael/c080/overview,bob%20-carmichael,c080
2,1973-08-23,1973,8,23,66,66,,,31.0,0,0,/en/players/juan-gisbert%20sr/g076/overview,juan-gisbert%20sr,g076


## Plot of Big Four

In [4]:
federer_df = ranking_df[ranking_df['player_name'].str.contains('roger-federer')]
djokovic_df = ranking_df[ranking_df['player_name'].str.contains('novak-djokovic')]
nadal_df = ranking_df[ranking_df['player_name'].str.contains('rafael-nadal')]
murray_df = ranking_df[ranking_df['player_name'].str.contains('andy-murray')]

In [5]:
layout = dict(
    title = 'Big Four Points Throughout Career',
    xaxis = dict(
        rangeselector = dict(
            buttons = list([
                dict(count = 60,
                    label = '5 year',
                    step = 'month',
                    stepmode = 'backward'),
                dict(count = 36,
                    label = '3 year',
                    step = 'month',
                    stepmode = 'backward'),
                dict(count = 12,
                    label = '1 year',
                    step = 'month',
                    stepmode = 'backward'),
                dict(step = 'all')
            ])
        ),
        
        rangeslider = dict(
            visible = True
        ),
        
        type = 'date'
    ),
)

trace1 = go.Scatter(
    x = federer_df['date'],
    y = federer_df['points'],
    name = 'Roger Federer',
)

trace2 = go.Scatter(
    x = djokovic_df['date'],
    y = djokovic_df['points'],
    name = 'Novak Djokovic'
)

trace3 = go.Scatter(
    x = nadal_df['date'],
    y = nadal_df['points'],
    name = 'Rafael Nadal'
)

trace4 = go.Scatter(
    x = murray_df['date'],
    y = murray_df['points'],
    name = 'Andy Murray'
)

fig = go.Figure(data = [trace1, trace2, trace3, trace4], layout = layout)
iplot(fig)

In [6]:
stats_overview_df = pd.read_csv('C:/Users/kchu8/Desktop/STA160/Charting_Files/charting-m-stats-Overview.csv')
stats_overview_df.head(3)

Unnamed: 0,match_id,player,set,serve_pts,aces,dfs,first_in,first_won,second_in,second_won,bk_pts,bp_saved,return_pts,return_pts_won,winners,winners_fh,winners_bh,unforced,unforced_fh,unforced_bh
0,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,Total,69,2,1,32,25,37,21,5,4,63,36,28,10,16,16,6,9
1,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,2,Total,63,2,2,41,20,22,7,12,4,69,23,14,8,4,26,15,9
2,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,1,26,2,0,15,10,11,6,2,1,18,13,10,3,5,6,2,4


## match_id_split(), rank_merge() for stats_overview

* Working for last 15 but will take around an hour to run for the entire file

In [7]:
# %%time

def match_id_split(df):
    player1 = []
    player2 = []
    date = []
    year = []
    month = []
    
    for i in range(len(df)):
        df['match_id'] = df['match_id'].str.lower()
        df['match_id'] = df['match_id'].replace('_', '-', regex = True)
        match_id = df['match_id'].str.split('-')[i]
        
        player1.append(match_id[-4:][0] + '-' + match_id[-3:][0])
        player2.append(match_id[-2:][0] + '-' + match_id[-1:][0])
        
        date.append(match_id[0])
        year.append(match_id[0][:4])
        month.append(match_id[0][4:6])
        
    df['player1'] = player1
    df['player2'] = player2
    df['date'] = date
    df['year'] = year
    df['month'] = month
    
    date = []
    for i in range(len(df)):
        date.append('-'.join([df['date'][i][:4], df['date'][i][4:6], df['date'][i][6:]]))
    
    df['date'] = date
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['year'].astype(int)
    df['month'] = df['month'].astype(int)
    
    df = df.sort_values(by = 'date')
    df = df.reset_index(drop = True)
    
    return df

In [8]:
def rank_merge(df, ranking_df):
    player1 = df['player1'].tolist()
    player2 = df['player2'].tolist()
    
    year = df['year'].tolist()
    month = df['month'].tolist()
    
    player1_rank = []
    player2_rank = []
    
    for i in range(len(player1)):
        player1_df = ranking_df[ranking_df['player_name'] == player1[i]]
        player1_df = player1_df[(player1_df['year'] == int(year[i])) 
                                & (player1_df['month'] == int(month[i]))].reset_index(drop = True)
        try:
            player1_rank.append(player1_df['rank_number'][0])
        except KeyError as e:
            player1_rank.append(None)
        except IndexError as e:
            player1_rank.append(None)
        
        player2_df = ranking_df[ranking_df['player_name'] == player2[i]]
        player2_df = player2_df[(player2_df['year'] == int(year[i])) 
                                & (player2_df['month'] == int(month[i]))].reset_index(drop = True)
        try:
            player2_rank.append(player2_df['rank_number'][0])
        except KeyError as e:
            player2_rank.append(None)
        except IndexError as e:
            player2_rank.append(None)
    
    df['player1_rank'] = player1_rank
    df['player2_rank'] = player2_rank
    
    return df

In [42]:
test_df = stats_overview_df.tail(15).reset_index(drop = True)
cleaned_test_df = match_id_split(test_df)
cleaned_test_df = rank_merge(cleaned_test_df, ranking_df)

cleaned_test_df.head(3)

Unnamed: 0,match_id,player,set,serve_pts,aces,dfs,first_in,first_won,second_in,second_won,...,unforced,unforced_fh,unforced_bh,player1,player2,date,year,month,player1_rank,player2_rank
0,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,Total,172,37,13,107,86,65,29,...,45,9,23,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
1,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,2,Total,154,7,7,97,75,57,36,...,23,4,12,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
2,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,1,40,5,2,24,21,16,7,...,7,1,4,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11


# Serve and Volley Statistics

In [155]:
snv_df = pd.read_csv('C:/Users/kchu8/Desktop/STA160/Charting_Files/charting-m-stats-SnV.csv')
snv_df.head()

Unnamed: 0,match_id,player,row,snv_pts,pts_won,aces,unret,return_forced,net_winner,induced_forced,net_unforced,passed_at_net,passing_shot_induced_forced,total_shots
0,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,SnV,30,23,0,0,8,8,5,2,2,0,85
1,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,SnV1st,30,23,0,0,8,8,5,2,2,0,85
2,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,nonSnV,38,23,2,0,1,3,3,3,2,1,243
3,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,nonSnV1st,2,2,2,0,0,0,0,0,0,0,2
4,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,nonSnV2nd,36,21,0,0,1,3,3,3,2,1,241


In [156]:
snv_df = snv_df[(snv_df['row'] == 'SnV') | (snv_df['row'] == 'nonSnV')].reset_index(drop = True)
snv_df = match_id_split(snv_df)
snv_df = snv_df[['match_id', 'row', 'total_shots', 'player', 'year']]
snv_df.groupby(['match_id', 'row'], as_index = False)[['total_shots']].sum()

non_snv_df = snv_df[snv_df['row'] == 'nonSnV']
snv_df = snv_df[snv_df['row'] == 'SnV']
snv_df = snv_df.groupby(['year'], as_index = False)[['total_shots']].mean()
non_snv_df = non_snv_df.groupby(['year'], as_index = False)[['total_shots']].mean()

non_snv_df = non_snv_df[non_snv_df['year'] > 1979]
snv_df = snv_df[snv_df['year'] > 1979]

In [173]:
layout = go.Layout(
    title = 'Avg # of Shots per Match According to Year',
    yaxis = dict(title = 'Points Played per Match'),
    xaxis = dict(title = 'Year'),
    barmode = 'stack'
)

trace1 = go.Bar(
    x = snv_df['year'],
    y = snv_df['total_shots'],
    name = 'SnV Shots'
)

trace2 = go.Bar(
    x = non_snv_df['year'],
    y = non_snv_df['total_shots'],
    name = 'NonSnV Shots'
)

fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

## rally_overview

In [33]:
stats_rally_df = pd.read_csv('C:/Users/kchu8/Desktop/STA160/Charting_Files/charting-m-stats-Rally.csv',
                             encoding = 'latin-1')

stats_rally_df_totals = stats_rally_df[stats_rally_df['row'] == 'Total'].reset_index(drop = True)
stats_rally_df_totals.head(3)

Unnamed: 0,match_id,row,pts,pl1_won,pl1_winners,pl1_forced,pl1_unforced,pl2_won,pl2_winners,pl2_forced,pl2_unforced
0,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,Total,132,82,28,28,15,50,14,20,24
1,19780125-M-Pepsi_Grand_Slam-SF-Brian_Gottfried...,Total,104,44,11,12,30,60,15,14,19
2,19800705-M-Wimbledon-F-John_Mcenroe-Bjorn_Borg,Total,376,184,69,28,99,192,63,27,79


In [11]:
def rank_merge(df, ranking_df):
    player1 = df['player1'].tolist()
    player2 = df['player2'].tolist()
    
    year = df['year'].tolist()
    month = df['month'].tolist()
    
    player1_rank = []
    player2_rank = []
    
    for i in range(len(player1)):
        player1_df = ranking_df[ranking_df['player_name'] == player1[i]]
        player1_df = player1_df[(player1_df['year'] == int(year[i])) 
                                & (player1_df['month'] == int(month[i]))].reset_index(drop = True)
        try:
            player1_rank.append(player1_df['rank_number'][0])
        except KeyError as e:
            player1_rank.append(None)
        except IndexError as e:
            player1_rank.append(None)
        
        player2_df = ranking_df[ranking_df['player_name'] == player2[i]]
        player2_df = player2_df[(player2_df['year'] == int(year[i])) 
                                & (player2_df['month'] == int(month[i]))].reset_index(drop = True)
        try:
            player2_rank.append(player2_df['rank_number'][0])
        except KeyError as e:
            player2_rank.append(None)
        except IndexError as e:
            player2_rank.append(None)
    
    df['player1_rank'] = player1_rank
    df['player2_rank'] = player2_rank
    
    return df

In [12]:
def rank_label(df, column):
    df.loc[df[column] > 100, column + '_label'] = 'sucks'
    df.loc[(df[column] <= 100) & (df[column] > 50), column + '_label'] = 'top_100'
    df.loc[(df[column] <= 50) & (df[column] > 10), column + '_label'] = 'top_50'
    df.loc[df[column] <= 10, column + '_label'] = 'top_10'
    
    return df

In [13]:
%%time

stats_rally_df_totals = match_id_split(stats_rally_df_totals)
stats_rally_df_totals = rank_merge(stats_rally_df_totals, ranking_df)
stats_rally_df_totals = rank_label(stats_rally_df_totals, 'player1_rank')
stats_rally_df_totals = rank_label(stats_rally_df_totals, 'player2_rank')

Wall time: 10min 3s


In [14]:
stats_rally_df_totals.head()

Unnamed: 0,match_id,row,pts,pl1_won,pl1_winners,pl1_forced,pl1_unforced,pl2_won,pl2_winners,pl2_forced,pl2_unforced,player1,player2,date,year,month,player1_rank,player2_rank,player1_rank_label,player2_rank_label
0,19740714-m-bastad-f-bjorn-borg-adriano-panatta,Total,232,132,47,43,21,100,33,41,39,bjorn-borg,adriano-panatta,1974-07-14,1974,7,9.0,38.0,top_10,top_50
1,19750101-m-australian-open-f-jimmy-connors-joh...,Total,273,134,48,61,28,139,52,58,18,jimmy-connors,john-newcombe,1975-01-01,1975,1,1.0,2.0,top_10,top_10
2,19751219-m-davis-cup-world-group-f-rr-bjorn-bo...,Total,132,82,28,28,15,50,14,20,24,bjorn-borg,jiri-hrebec,1975-12-19,1975,12,3.0,53.0,top_10,top_100
3,19780125-m-pepsi-grand-slam-sf-brian-gottfried...,Total,104,44,11,12,30,60,15,14,19,brian-gottfried,bjorn-borg,1978-01-25,1978,1,5.0,3.0,top_10,top_10
4,19780611-m-roland-garros-f-bjorn-borg-guillerm...,Total,126,79,18,11,18,47,20,8,45,bjorn-borg,guillermo-vilas,1978-06-11,1978,6,,,,


In [15]:
stats_rally_yearly_totals_df = stats_rally_df_totals.groupby(['year']).mean().reset_index()
stats_rally_yearly_totals_df = stats_rally_yearly_totals_df[stats_rally_yearly_totals_df['year'] > 1979]

layout = go.Layout(
    title = 'Avg # of points per match according to year',
    yaxis = dict(title = 'Points Played per Match'),
    xaxis = dict(title = 'Year')
)

trace1 = go.Bar(
    x = stats_rally_yearly_totals_df['year'],
    y = stats_rally_yearly_totals_df['pts']
)

fig = go.Figure(data = [trace1], layout = layout)
iplot(fig)

In [16]:
stats_rally_ranks_df = stats_rally_df_totals.groupby(['player1_rank_label', 'player2_rank_label']).mean().reset_index()
stats_rally_ranks_df

Unnamed: 0,player1_rank_label,player2_rank_label,pts,pl1_won,pl1_winners,pl1_forced,pl1_unforced,pl2_won,pl2_winners,pl2_forced,pl2_unforced,year,month,player1_rank,player2_rank
0,sucks,sucks,115.569444,58.958333,16.333333,20.347222,19.222222,56.611111,16.152778,18.347222,19.833333,2015.027778,5.902778,330.194444,270.388889
1,sucks,top_10,159.583333,68.583333,21.166667,25.25,21.583333,91.0,35.583333,29.083333,19.75,2010.333333,4.916667,227.416667,3.083333
2,sucks,top_100,141.263158,66.684211,20.684211,20.052632,28.315789,74.578947,21.631579,21.526316,23.263158,2016.578947,3.157895,162.052632,74.0
3,sucks,top_50,179.826087,84.956522,26.73913,29.391304,25.695652,94.869565,33.391304,32.0,24.73913,2015.478261,5.391304,164.391304,27.043478
4,top_10,sucks,165.578947,89.789474,31.526316,31.684211,23.263158,75.789474,20.0,29.315789,23.210526,2010.526316,4.736842,2.631579,204.842105
5,top_10,top_10,185.901887,93.028302,31.356604,33.196226,27.473585,92.873585,29.750943,32.643396,25.498113,2004.567925,6.816981,3.390566,3.537736
6,top_10,top_100,162.967742,89.274194,28.241935,31.016129,23.66129,73.693548,23.032258,25.064516,27.209677,2011.145161,4.548387,3.177419,67.290323
7,top_10,top_50,173.869565,92.841897,30.158103,32.335968,23.833992,81.027668,25.84585,28.98419,27.189723,2008.909091,6.035573,3.466403,24.193676
8,top_100,sucks,133.882353,70.941176,22.882353,23.294118,24.117647,62.941176,16.294118,19.882353,22.0,2016.823529,3.823529,78.235294,196.882353
9,top_100,top_10,150.405405,65.891892,20.783784,24.702703,24.135135,84.513514,27.945946,28.783784,18.837838,2012.567568,4.918919,69.567568,3.27027


In [17]:
p1_rank_labels = stats_rally_df_totals.groupby(['player1_rank_label']).mean().reset_index()
p2_rank_labels = stats_rally_df_totals.groupby(['player2_rank_label']).mean().reset_index()

In [18]:
rank_label = ['sucks', 'top_10', 'top_100', 'top_50']

player_winners = (p1_rank_labels['pl1_winners'] + p2_rank_labels['pl2_winners']) / 2
player_unforced = (p1_rank_labels['pl1_unforced'] + p2_rank_labels['pl2_unforced']) / 2
player_forced = (p1_rank_labels['pl1_forced'] + p2_rank_labels['pl2_forced']) / 2
player_pts = (p1_rank_labels['pl1_won'] + p2_rank_labels['pl2_won']) / 2

opponent_winners = (p1_rank_labels['pl2_winners'] + p2_rank_labels['pl1_winners']) / 2
opponent_unforced = (p1_rank_labels['pl2_unforced'] + p2_rank_labels['pl2_unforced']) / 2
opponent_forced = (p1_rank_labels['pl2_forced'] + p2_rank_labels['pl1_forced']) / 2
opponent_pts = (p1_rank_labels['pl2_won'] + p2_rank_labels['pl1_won']) / 2

rank_labeled_df = pd.DataFrame({'rank_label': rank_label, 'player_winners': player_winners, 
                               'opponent_winners': opponent_winners, 'player_unforced': player_unforced,
                               'opponent_unforced': opponent_unforced, 'player_forced': player_forced,
                               'opponent_forced': opponent_forced, 'player_pts': player_pts,
                                'opponent_pts': opponent_pts})
rank_labeled_df

Unnamed: 0,rank_label,player_winners,opponent_winners,player_unforced,opponent_unforced,player_forced,opponent_forced,player_pts,opponent_pts
0,sucks,18.804774,20.968817,21.553093,21.067481,21.412478,22.512006,64.32916,67.957208
1,top_10,30.232636,29.104769,25.284093,25.419533,32.583051,31.262865,92.941275,88.408486
2,top_100,22.172234,24.703617,25.064023,23.491031,24.076993,26.223102,72.053507,78.971339
3,top_50,27.346006,27.716807,26.832274,25.272814,28.503753,29.66214,83.049081,87.342713


In [19]:
djokovic_rally_df = stats_rally_df_totals[(stats_rally_df_totals['player1'].str.contains('novak-djokovic')) |
                             (stats_rally_df_totals['player2'].str.contains('novak-djokovic'))]

federer_rally_df = stats_rally_df_totals[(stats_rally_df_totals['player1'].str.contains('roger-federer')) |
                                         (stats_rally_df_totals['player2'].str.contains('roger-federer'))]

nadal_rally_df = stats_rally_df_totals[(stats_rally_df_totals['player1'].str.contains('rafael-nadal')) |
                                        (stats_rally_df_totals['player2'].str.contains('rafael-nadal'))]

murray_rally_df = stats_rally_df_totals[(stats_rally_df_totals['player1'].str.contains('andy-murray')) |
                                        (stats_rally_df_totals['player2'].str.contains('andy-murray'))]

### Note to self
* pl1_forced means that player 1 forced x amount of errors from the opponent

In [20]:
djokovic_p1 = djokovic_rally_df.groupby(['player1']).mean().reset_index()
djokovic_p1 = djokovic_p1[djokovic_p1['player1'].str.contains('novak-djokovic')].reset_index(drop = True)

djokovic_p2 = djokovic_rally_df.groupby(['player2']).mean().reset_index()
djokovic_p2 = djokovic_p2[djokovic_p2['player2'].str.contains('novak-djokovic')].reset_index(drop = True)



djokovic_pts = (djokovic_p1['pl1_won'] + djokovic_p2['pl2_won']) / 2
djokovic_opponent_pts = (djokovic_p1['pl2_won'] + djokovic_p2['pl1_won']) / 2

djokovic_winners = (djokovic_p1['pl1_winners'] + djokovic_p2['pl2_winners']) / 2
djokovic_opponent_winners = (djokovic_p1['pl2_winners'] + djokovic_p2['pl1_winners']) / 2

djokovic_forced = (djokovic_p1['pl1_forced'] + djokovic_p2['pl2_forced']) / 2
djokovic_opponent_forced = (djokovic_p1['pl2_forced'] + djokovic_p2['pl1_forced']) / 2

djokovic_unforced = (djokovic_p1['pl1_unforced'] + djokovic_p2['pl2_unforced']) / 2
djokovic_opponent_unforced = (djokovic_p1['pl2_unforced'] + djokovic_p1['pl1_unforced']) / 2


djokovic_rally_totals = pd.DataFrame({'rank_label': 'novak_djokovic',
                                      'player_pts': djokovic_pts,
                                      'opponent_pts': djokovic_opponent_pts,
                                      'player_winners': djokovic_winners,
                                      'opponent_winners': djokovic_opponent_winners,
                                      'player_unforced': djokovic_unforced,
                                      'opponent_unforced': djokovic_opponent_unforced,
                                      'player_forced' : djokovic_forced,
                                      'opponent_forced': djokovic_opponent_forced
                                     })


###################################


federer_p1 = federer_rally_df.groupby(['player1']).mean().reset_index()
federer_p1 = federer_p1[federer_p1['player1'].str.contains('roger-federer')].reset_index(drop = True)

federer_p2 = federer_rally_df.groupby(['player2']).mean().reset_index()
federer_p2 = federer_p2[federer_p2['player2'].str.contains('roger-federer')].reset_index(drop = True)



federer_pts = (federer_p1['pl1_won'] + federer_p2['pl2_won']) / 2
federer_opponent_pts = (federer_p1['pl2_won'] + federer_p2['pl1_won']) / 2

federer_forced = (federer_p1['pl1_forced'] + federer_p2['pl2_forced']) / 2
federer_opponent_forced = (federer_p1['pl2_forced'] + federer_p2['pl1_forced']) / 2

federer_winners = (federer_p1['pl1_winners'] + federer_p2['pl2_winners']) / 2
federer_opponent_winners = (federer_p1['pl2_winners'] + federer_p2['pl1_winners']) / 2

federer_unforced = (federer_p1['pl1_unforced'] + federer_p2['pl2_unforced']) / 2
federer_opponent_unforced = (federer_p1['pl2_unforced'] + federer_p1['pl1_unforced']) / 2


federer_rally_totals = pd.DataFrame({'rank_label': 'roger_federer',
                                    'player_pts': federer_pts,
                                    'opponent_pts': federer_opponent_pts,
                                    'player_winners': federer_winners,
                                    'opponent_winners': federer_opponent_winners,
                                    'player_unforced': federer_unforced,
                                    'opponent_unforced': federer_opponent_unforced,
                                    'player_forced': federer_forced,
                                    'opponent_forced': federer_opponent_forced})


###################################


nadal_p1 = nadal_rally_df.groupby(['player1']).mean().reset_index()
nadal_p1 = nadal_p1[nadal_p1['player1'].str.contains('rafael-nadal')].reset_index(drop = True)

nadal_p2 = nadal_rally_df.groupby(['player2']).mean().reset_index()
nadal_p2 = nadal_p2[nadal_p2['player2'].str.contains('rafael-nadal')].reset_index(drop = True)



nadal_pts = (nadal_p1['pl1_won'] + nadal_p2['pl2_won']) / 2
nadal_opponent_pts = (nadal_p1['pl2_won'] + nadal_p2['pl1_won']) / 2

nadal_forced = (nadal_p1['pl1_forced'] + nadal_p2['pl2_forced']) / 2
nadal_opponent_forced = (nadal_p1['pl2_forced'] + nadal_p2['pl1_forced']) / 2

nadal_winners = (nadal_p1['pl1_winners'] + nadal_p2['pl2_winners']) / 2
nadal_opponent_winners = (nadal_p1['pl2_winners'] + nadal_p2['pl1_winners']) / 2

nadal_unforced = (nadal_p1['pl1_unforced'] + nadal_p2['pl2_unforced']) / 2
nadal_opponent_unforced = (nadal_p1['pl2_unforced'] + nadal_p1['pl1_unforced']) / 2


nadal_rally_totals = pd.DataFrame({'rank_label': 'rafael_nadal',
                                    'player_pts': nadal_pts,
                                    'opponent_pts': nadal_opponent_pts,
                                    'player_winners': nadal_winners,
                                    'opponent_winners': nadal_opponent_winners,
                                    'player_unforced': nadal_unforced,
                                    'opponent_unforced': nadal_opponent_unforced,
                                    'player_forced': nadal_forced,
                                    'opponent_forced': nadal_opponent_forced})


###################################


murray_p1 = murray_rally_df.groupby(['player1']).mean().reset_index()
murray_p1 = murray_p1[murray_p1['player1'].str.contains('andy-murray')].reset_index(drop = True)

murray_p2 = murray_rally_df.groupby(['player2']).mean().reset_index()
murray_p2 = murray_p2[murray_p2['player2'].str.contains('andy-murray')].reset_index(drop = True)



murray_pts = (murray_p1['pl1_won'] + murray_p2['pl2_won']) / 2
murray_opponent_pts = (murray_p1['pl2_won'] + murray_p2['pl1_won']) / 2

murray_forced = (murray_p1['pl1_forced'] + murray_p2['pl2_forced']) / 2
murray_opponent_forced = (murray_p1['pl2_forced'] + murray_p2['pl1_forced']) / 2

murray_winners = (murray_p1['pl1_winners'] + murray_p2['pl2_winners']) / 2
murray_opponent_winners = (murray_p1['pl2_winners'] + murray_p2['pl1_winners']) / 2

murray_unforced = (murray_p1['pl1_unforced'] + murray_p2['pl2_unforced']) / 2
murray_opponent_unforced = (murray_p1['pl2_unforced'] + murray_p1['pl1_unforced']) / 2


murray_rally_totals = pd.DataFrame({'rank_label': 'andy-murray',
                                    'player_pts': murray_pts,
                                    'opponent_pts': murray_opponent_pts,
                                    'player_winners': murray_winners,
                                    'opponent_winners': murray_opponent_winners,
                                    'player_unforced': murray_unforced,
                                    'opponent_unforced': murray_opponent_unforced,
                                    'player_forced': murray_forced,
                                    'opponent_forced': murray_opponent_forced})


###################################


big_four_rally_df = pd.concat([federer_rally_totals, djokovic_rally_totals, nadal_rally_totals, murray_rally_totals])
big_four_rally_df.set_index('rank_label', inplace = True)
big_four_rally_df.loc['big_four'] = big_four_rally_df.mean()
big_four_rally_df = big_four_rally_df.reset_index()

big_four_rally_df

Unnamed: 0,rank_label,player_pts,opponent_pts,player_winners,opponent_winners,player_unforced,opponent_unforced,player_forced,opponent_forced
0,roger_federer,93.281678,82.584599,32.931206,22.431103,27.455152,26.788793,32.191126,30.813218
1,novak_djokovic,86.252466,78.379084,23.883142,24.192264,25.177111,28.538217,30.550339,26.911188
2,rafael_nadal,89.187788,80.332028,23.934332,29.9447,21.297005,26.707143,30.158525,27.41106
3,andy-murray,84.530707,81.047554,25.700543,26.55163,24.936141,24.467391,27.945924,27.085598
4,big_four,88.313159,80.585816,26.612306,25.779925,24.716352,26.625386,30.211479,28.055266


In [21]:
layout = go.Layout(
    title = 'Player/Opponent Points Won'
)

trace1 = go.Bar(
    x = ['big_four'],
    y = big_four_rally_df[big_four_rally_df['rank_label'] == 'big_four']['player_pts'],
    name = 'Big Four Points'
)

trace2 = go.Bar(
    x = ['big_four'],
    y = big_four_rally_df[big_four_rally_df['rank_label'] == 'big_four']['opponent_pts'],
    name = 'Big Four Opponent Points'
)

trace3 = go.Bar(
    x = rank_labeled_df['rank_label'],
    y = rank_labeled_df['player_pts'],
    name = 'Player Points'
)

trace4 = go.Bar(
    x = rank_labeled_df['rank_label'],
    y = rank_labeled_df['opponent_pts'],
    name = 'Opponent Points'
)

fig = go.Figure(data = [trace1, trace2, trace3, trace4], layout = layout)
iplot(fig)

In [22]:
layout = go.Layout(
    title = 'Player/Opponent Forced Errors'
)

trace1 = go.Bar(
    x = ['big_four'],
    y = big_four_rally_df[big_four_rally_df['rank_label'] == 'big_four']['player_forced'],
    name = 'Big Four Forced'
)

trace2 = go.Bar(
    x = ['big_four'],
    y = big_four_rally_df[big_four_rally_df['rank_label'] == 'big_four']['opponent_forced'],
    name = 'Big Four Opponent Forced'
)

trace3 = go.Bar(
    x = rank_labeled_df['rank_label'],
    y = rank_labeled_df['player_forced'],
    name = 'Player Forced'
)

trace4 = go.Bar(
    x = rank_labeled_df['rank_label'],
    y = rank_labeled_df['opponent_forced'],
    name = 'Opponent Forced'
)

fig = go.Figure(data = [trace1, trace2, trace3, trace4], layout = layout)
iplot(fig)

In [23]:
layout = go.Layout(
    title = 'Player/Opponent Unforced Errors'
)

trace1 = go.Bar(
    x = ['big_four'],
    y = big_four_rally_df[big_four_rally_df['rank_label'] == 'big_four']['player_unforced'],
    name = 'Big Four Unforced'
)

trace2 = go.Bar(
    x = ['big_four'],
    y = big_four_rally_df[big_four_rally_df['rank_label'] == 'big_four']['opponent_unforced'],
    name = 'Big Four Opponent Unforced'
)

trace3 = go.Bar(
    x = rank_labeled_df['rank_label'],
    y = rank_labeled_df['player_unforced'],
    name = 'Player Unforced'
)

trace4 = go.Bar(
    x = rank_labeled_df['rank_label'],
    y = rank_labeled_df['opponent_unforced'],
    name = 'Opponent Unforced'
)

fig = go.Figure(data = [trace1, trace2, trace3, trace4], layout = layout)
iplot(fig)

In [24]:
layout = go.Layout(
    title = 'Player/Opponent Winners'
)

trace1 = go.Bar(
    x = ['big_four'],
    y = big_four_rally_df[big_four_rally_df['rank_label'] == 'big_four']['player_winners'],
    name = 'Big Four Winners'
)

trace2 = go.Bar(
    x = ['big_four'],
    y = big_four_rally_df[big_four_rally_df['rank_label'] == 'big_four']['opponent_winners'],
    name = 'Big Four Opponent Winners'
)

trace3 = go.Bar(
    x = rank_labeled_df['rank_label'],
    y = rank_labeled_df['player_winners'],
    name = 'Player Winners'
)

trace4 = go.Bar(
    x = rank_labeled_df['rank_label'],
    y = rank_labeled_df['opponent_winners'],
    name = 'Opponent Winners'
)

fig = go.Figure(data = [trace1, trace2, trace3, trace4], layout = layout)
iplot(fig)