# Version 4

### Changelog Version 1

* Added function to combine all ranking data files to one dataframe
* Added plot of big four points over time

### Changelog Version 2

* Added function to clean match_charting dataframes (should work for all match_charting files in the JeffSackman github)
    * Creates date columnns
    * Extracts player names from the match
    * Cross references ranking_dataframe to match players with their ranks at the time
    
### Changelog Version 3

* Split up the function created in changelog 2
* Fixed some bugs with the function

### Changelog Version 4

* Started working with the rally file
* Created plot for Avg # of points played per match according to year
* Added function to categorize/label player ranks
* Created Pie charts according to player rank category
    * Player/Opponent Winners according to rank category
    * Player/Opponent Unforced Errors according to rank category
    * Player/Opponent Points Won according to rank category

## Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import datetime

# Data Visualization
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected = True)

# Models
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

## ranking_df

In [2]:
def ranking_df_creator(ranking_file_path):
    ranking_files_list = os.listdir(ranking_file_path)
    ranking_df = pd.DataFrame()
    col_names = ['date', 
               'year',
               'month',
               'day',
               'rank_text',
               'rank_number',
               'movement',
               'direction',
               'age',
               'points',
               'tournaments_played',
               'player_profile',
               'player_name',
               'player_id']
    
    for i in range(len(ranking_files_list)):
        tmp_df = pd.read_csv(str(ranking_file_path + ranking_files_list[i]), header = None, names = col_names)
        ranking_df = pd.concat([ranking_df, tmp_df])
    
    ranking_df['date'] = ranking_df['date'].replace('\.','/', regex=True)
    ranking_df['date'] = pd.to_datetime(ranking_df['date'])
    ranking_df = ranking_df.sort_values(by = 'date')
    ranking_df = ranking_df.reset_index(drop = True)
    
    return ranking_df

ranking_file_path = 'C:/Users/kchu8/Desktop/STA160/Ranking_Files/'
ranking_df = ranking_df_creator(ranking_file_path)

In [3]:
ranking_df.head(3)

Unnamed: 0,date,year,month,day,rank_text,rank_number,movement,direction,age,points,tournaments_played,player_profile,player_name,player_id
0,1973-08-23,1973,8,23,2,2,,,24.0,0,0,/en/players/manuel-orantes/o017/overview,manuel-orantes,o017
1,1973-08-23,1973,8,23,67,67,,,33.0,0,0,/en/players/bob%20-carmichael/c080/overview,bob%20-carmichael,c080
2,1973-08-23,1973,8,23,66,66,,,31.0,0,0,/en/players/juan-gisbert%20sr/g076/overview,juan-gisbert%20sr,g076


## Plot of Big Four

In [4]:
federer_df = ranking_df[ranking_df['player_name'].str.contains('roger-federer')]
djokovic_df = ranking_df[ranking_df['player_name'].str.contains('novak-djokovic')]
nadal_df = ranking_df[ranking_df['player_name'].str.contains('rafael-nadal')]
murray_df = ranking_df[ranking_df['player_name'].str.contains('andy-murray')]

In [5]:
layout = dict(
    title = 'Big Four Points Throughout Career',
    xaxis = dict(
        rangeselector = dict(
            buttons = list([
                dict(count = 60,
                    label = '5 year',
                    step = 'month',
                    stepmode = 'backward'),
                dict(count = 36,
                    label = '3 year',
                    step = 'month',
                    stepmode = 'backward'),
                dict(count = 12,
                    label = '1 year',
                    step = 'month',
                    stepmode = 'backward'),
                dict(step = 'all')
            ])
        ),
        
        rangeslider = dict(
            visible = True
        ),
        
        type = 'date'
    ),
)

trace1 = go.Scatter(
    x = federer_df['date'],
    y = federer_df['points'],
    name = 'Roger Federer',
)

trace2 = go.Scatter(
    x = djokovic_df['date'],
    y = djokovic_df['points'],
    name = 'Novak Djokovic'
)

trace3 = go.Scatter(
    x = nadal_df['date'],
    y = nadal_df['points'],
    name = 'Rafael Nadal'
)

trace4 = go.Scatter(
    x = murray_df['date'],
    y = murray_df['points'],
    name = 'Andy Murray'
)

fig = go.Figure(data = [trace1, trace2, trace3, trace4], layout = layout)
iplot(fig)

In [6]:
stats_overview_df = pd.read_csv('C:/Users/kchu8/Desktop/STA160/Charting_Files/charting-m-stats-Overview.csv')
stats_overview_df.head(3)

Unnamed: 0,match_id,player,set,serve_pts,aces,dfs,first_in,first_won,second_in,second_won,bk_pts,bp_saved,return_pts,return_pts_won,winners,winners_fh,winners_bh,unforced,unforced_fh,unforced_bh
0,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,Total,69,2,1,32,25,37,21,5,4,63,36,28,10,16,16,6,9
1,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,2,Total,63,2,2,41,20,22,7,12,4,69,23,14,8,4,26,15,9
2,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,1,26,2,0,15,10,11,6,2,1,18,13,10,3,5,6,2,4


## match_id_split(), rank_merge() for stats_overview

* Working for last 15 but will take around an hour to run for the entire file

In [7]:
# %%time

def match_id_split(df):
    player1 = []
    player2 = []
    date = []
    year = []
    month = []
    
    for i in range(len(df)):
        df['match_id'] = df['match_id'].str.lower()
        df['match_id'] = df['match_id'].replace('_', '-', regex = True)
        match_id = df['match_id'].str.split('-')[i]
        
        player1.append(match_id[-4:][0] + '-' + match_id[-3:][0])
        player2.append(match_id[-2:][0] + '-' + match_id[-1:][0])
        
        date.append(match_id[0])
        year.append(match_id[0][:4])
        month.append(match_id[0][4:6])
        
    df['player1'] = player1
    df['player2'] = player2
    df['date'] = date
    df['year'] = year
    df['month'] = month
    
    date = []
    for i in range(len(df)):
        date.append('-'.join([df['date'][i][:4], df['date'][i][4:6], df['date'][i][6:]]))
    
    df['date'] = date
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['year'].astype(int)
    df['month'] = df['month'].astype(int)
    
    df = df.sort_values(by = 'date')
    df = df.reset_index(drop = True)
    
    return df

In [8]:
def rank_merge(df, ranking_df):
    player1 = df['player1'].tolist()
    player2 = df['player2'].tolist()
    
    year = df['year'].tolist()
    month = df['month'].tolist()
    
    player1_rank = []
    player2_rank = []
    
    for i in range(len(player1)):
        player1_df = ranking_df[ranking_df['player_name'] == player1[i]]
        player1_df = player1_df[(player1_df['year'] == int(year[i])) 
                                & (player1_df['month'] == int(month[i]))].reset_index(drop = True)
        try:
            player1_rank.append(player1_df['rank_number'][0])
        except KeyError as e:
            player1_rank.append(None)
        except IndexError as e:
            player1_rank.append(None)
        
        player2_df = ranking_df[ranking_df['player_name'] == player2[i]]
        player2_df = player2_df[(player2_df['year'] == int(year[i])) 
                                & (player2_df['month'] == int(month[i]))].reset_index(drop = True)
        try:
            player2_rank.append(player2_df['rank_number'][0])
        except KeyError as e:
            player2_rank.append(None)
        except IndexError as e:
            player2_rank.append(None)
    
    df['player1_rank'] = player1_rank
    df['player2_rank'] = player2_rank
    
    return df

In [9]:
test_df = stats_overview_df.tail(15).reset_index(drop = True)
cleaned_test_df = match_id_split(test_df)
cleaned_test_df = rank_merge(cleaned_test_df, ranking_df)

cleaned_test_df.head(3)

Unnamed: 0,match_id,player,set,serve_pts,aces,dfs,first_in,first_won,second_in,second_won,...,unforced,unforced_fh,unforced_bh,player1,player2,date,year,month,player1_rank,player2_rank
0,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,Total,172,37,13,107,86,65,29,...,45,9,23,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
1,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,2,Total,154,7,7,97,75,57,36,...,23,4,12,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
2,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,1,40,5,2,24,21,16,7,...,7,1,4,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11


## rally_overview

In [10]:
stats_rally_df = pd.read_csv('C:/Users/kchu8/Desktop/STA160/Charting_Files/charting-m-stats-Rally.csv',
                             encoding = 'latin-1')

stats_rally_df_totals = stats_rally_df[stats_rally_df['row'] == 'Total'].reset_index(drop = True)
stats_rally_df_totals.head(3)

Unnamed: 0,match_id,row,pts,pl1_won,pl1_winners,pl1_forced,pl1_unforced,pl2_won,pl2_winners,pl2_forced,pl2_unforced
0,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,Total,132,82,28,28,15,50,14,20,24
1,19780125-M-Pepsi_Grand_Slam-SF-Brian_Gottfried...,Total,104,44,11,12,30,60,15,14,19
2,19800705-M-Wimbledon-F-John_Mcenroe-Bjorn_Borg,Total,376,184,69,28,99,192,63,27,79


In [11]:
def rank_merge(df, ranking_df):
    player1 = df['player1'].tolist()
    player2 = df['player2'].tolist()
    
    year = df['year'].tolist()
    month = df['month'].tolist()
    
    player1_rank = []
    player2_rank = []
    
    for i in range(len(player1)):
        player1_df = ranking_df[ranking_df['player_name'] == player1[i]]
        player1_df = player1_df[(player1_df['year'] == int(year[i])) 
                                & (player1_df['month'] == int(month[i]))].reset_index(drop = True)
        try:
            player1_rank.append(player1_df['rank_number'][0])
        except KeyError as e:
            player1_rank.append(None)
        except IndexError as e:
            player1_rank.append(None)
        
        player2_df = ranking_df[ranking_df['player_name'] == player2[i]]
        player2_df = player2_df[(player2_df['year'] == int(year[i])) 
                                & (player2_df['month'] == int(month[i]))].reset_index(drop = True)
        try:
            player2_rank.append(player2_df['rank_number'][0])
        except KeyError as e:
            player2_rank.append(None)
        except IndexError as e:
            player2_rank.append(None)
    
    df['player1_rank'] = player1_rank
    df['player2_rank'] = player2_rank
    
    return df

In [12]:
def rank_label(df, column):
    df.loc[df[column] > 100, column + '_label'] = 'sucks'
    df.loc[df[column] <= 100, column + '_label'] = 'top_100'
    df.loc[df[column] <= 50, column + '_label'] = 'top_50'
    df.loc[df[column] <= 10, column + '_label'] = 'top_10'
    
    return df

In [13]:
%%time

stats_rally_df_totals = match_id_split(stats_rally_df_totals)
stats_rally_df_totals = rank_merge(stats_rally_df_totals, ranking_df)
stats_rally_df_totals = rank_label(stats_rally_df_totals, 'player1_rank')
stats_rally_df_totals = rank_label(stats_rally_df_totals, 'player2_rank')

Wall time: 10min 34s


In [14]:
stats_rally_df_totals.head()

Unnamed: 0,match_id,row,pts,pl1_won,pl1_winners,pl1_forced,pl1_unforced,pl2_won,pl2_winners,pl2_forced,pl2_unforced,player1,player2,date,year,month,player1_rank,player2_rank,player1_rank_label,player2_rank_label
0,19740714-m-bastad-f-bjorn-borg-adriano-panatta,Total,232,132,47,43,21,100,33,41,39,bjorn-borg,adriano-panatta,1974-07-14,1974,7,9.0,38.0,top_10,top_50
1,19750101-m-australian-open-f-jimmy-connors-joh...,Total,273,134,48,61,28,139,52,58,18,jimmy-connors,john-newcombe,1975-01-01,1975,1,1.0,2.0,top_10,top_10
2,19751219-m-davis-cup-world-group-f-rr-bjorn-bo...,Total,132,82,28,28,15,50,14,20,24,bjorn-borg,jiri-hrebec,1975-12-19,1975,12,3.0,53.0,top_10,top_100
3,19780125-m-pepsi-grand-slam-sf-brian-gottfried...,Total,104,44,11,12,30,60,15,14,19,brian-gottfried,bjorn-borg,1978-01-25,1978,1,5.0,3.0,top_10,top_10
4,19780611-m-roland-garros-f-bjorn-borg-guillerm...,Total,126,79,18,11,18,47,20,8,45,bjorn-borg,guillermo-vilas,1978-06-11,1978,6,,,,


In [16]:
stats_rally_yearly_totals_df = stats_rally_df_totals.groupby(['year']).mean().reset_index()
# stats_rally_yearly_totals_df = stats_rally_df_yearly_totals_df[stats_rally_df_yearly_totals_df['year'] > 1979]

# layout = go.Layout(
#     title = 'Avg # of points per match according to year',
#     yaxis = dict(title = 'Points Played per Match'),
#     xaxis = dict(title = 'Year')
# )

# trace1 = go.Bar(
#     x = stats_rally_yearly_totals_df['year'],
#     y = stats_rally_yearly_totals_df['pts']
# )

# fig = go.Figure(data = [trace1], layout = layout)
# iplot(fig)

NameError: name 'stats_rally_df_yearly_totals_df' is not defined

In [None]:
stats_rally_ranks_df = stats_rally_df_totals.groupby(['player1_rank_label', 'player2_rank_label']).mean().reset_index()
stats_rally_ranks_df

In [None]:
p1_rank_labels = stats_rally_df_totals.groupby(['player1_rank_label']).mean().reset_index()
p2_rank_labels = stats_rally_df_totals.groupby(['player2_rank_label']).mean().reset_index()

In [None]:
rank_label = ['sucks', 'top_10', 'top_100', 'top_50']

player_winners = (p1_rank_labels['pl1_winners'] + p2_rank_labels['pl2_winners']) / 2
player_unforced = (p1_rank_labels['pl1_unforced'] + p2_rank_labels['pl2_unforced']) / 2
player_forced = (p1_rank_labels['pl1_forced'] + p2_rank_labels['pl2_forced']) / 2
player_pts = (p1_rank_labels['pl1_won'] + p2_rank_labels['pl2_won']) / 2

opponent_winners = (p1_rank_labels['pl2_winners'] + p2_rank_labels['pl1_winners']) / 2
opponent_unforced = (p1_rank_labels['pl2_unforced'] + p2_rank_labels['pl2_unforced']) / 2
opponent_forced = (p1_rank_labels['pl2_forced'] + p2_rank_labels['pl1_forced']) / 2
opponent_pts = (p1_rank_labels['pl2_won'] + p2_rank_labels['pl1_won']) / 2

rank_labeled_df = pd.DataFrame({'rank_label': rank_label, 'player_winners': player_winners, 
                               'opponent_winners': opponent_winners, 'player_unforced': player_unforced,
                               'opponent_unforced': opponent_unforced, 'player_forced': player_forced,
                               'opponent_forced': opponent_forced, 'player_pts': player_pts,
                                'opponent_pts': opponent_pts})
rank_labeled_df

In [None]:
chart_colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']

layout = go.Layout(
    title = 'Player/Opponent Winners According to Rank Label'
)

trace1 = go.Pie(
    title = 'Player Winners',
    titlefont = dict(size = 20),
    labels = rank_labeled_df['rank_label'],
    values = rank_labeled_df['player_winners'],
    hoverinfo = 'label',
    textinfo = 'label+value',
    textfont = dict(size = 15),
    marker = dict(colors = chart_colors,
                  line = dict(color = '#000000', width = 2)),
    domain = dict(x = [0, .5])
)

trace2 = go.Pie(
    title = 'Opponent Winners',
    titlefont = dict(size = 20),
    labels = rank_labeled_df['rank_label'],
    values = rank_labeled_df['opponent_winners'],
    hoverinfo = 'label',
    textinfo = 'label+value',
    textfont = dict(size = 15),
    marker = dict(colors = chart_colors,
                  line = dict(color = '#000000', width = 2)),
    domain = dict(x = [.5, 1])
)

fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

In [None]:
chart_colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']

layout = go.Layout(
    title = 'Player/Opponent Unforced Errors According to Rank Label'
)

trace1 = go.Pie(
    title = 'Player Unforced Errors',
    titlefont = dict(size = 20),
    labels = rank_labeled_df['rank_label'],
    values = rank_labeled_df['player_unforced'],
    hoverinfo = 'label',
    textinfo = 'label+value',
    textfont = dict(size = 15),
    marker = dict(colors = chart_colors,
                  line = dict(color = '#000000', width = 2)),
    domain = dict(x = [0, .5])
)

trace2 = go.Pie(
    title = 'Opponent Unforced Errors',
    titlefont = dict(size = 20),
    labels = rank_labeled_df['rank_label'],
    values = rank_labeled_df['opponent_unforced'],
    hoverinfo = 'label',
    textinfo = 'label+value',
    textfont = dict(size = 15),
    marker = dict(colors = chart_colors,
                  line = dict(color = '#000000', width = 2)),
    domain = dict(x = [.5, 1])
)

fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)

In [None]:
chart_colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']

layout = go.Layout(
    title = 'Player/Opponent Points Won According to Rank Label'
)

trace1 = go.Pie(
    title = 'Player Points Won',
    titlefont = dict(size = 20),
    labels = rank_labeled_df['rank_label'],
    values = rank_labeled_df['player_pts'],
    hoverinfo = 'label',
    textinfo = 'label+value',
    textfont = dict(size = 15),
    marker = dict(colors = chart_colors,
                  line = dict(color = '#000000', width = 2)),
    domain = dict(x = [0, .5])
)

trace2 = go.Pie(
    title = 'Opponent Points Won',
    titlefont = dict(size = 20),
    labels = rank_labeled_df['rank_label'],
    values = rank_labeled_df['opponent_pts'],
    hoverinfo = 'label',
    textinfo = 'label+value',
    textfont = dict(size = 15),
    marker = dict(colors = chart_colors,
                  line = dict(color = '#000000', width = 2)),
    domain = dict(x = [.5, 1])
)

fig = go.Figure(data = [trace1, trace2], layout = layout)
iplot(fig)