# Version 2

### Changelog Version 1

* Added function to combine all ranking data files to one dataframe
* Added plot of big four points over time

### Changelog Version 2

* Added function to clean match_charting dataframes (should work for all match_charting files in Jeff Sackmann's GitHub repository)
    * Creates date columns
    * Extracts player names from the match
    * Cross references ranking_dataframe to match players with their ranks at the time

In [1]:
import pandas as pd
import os
import datetime

# Data Visualization
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected = True)

import warnings
warnings.filterwarnings("ignore")

In [2]:
def ranking_df_creator(ranking_file_path):
    """Combine every ranking CSV found in a directory into one DataFrame.

    Each file is read without a header row and given the fixed column names
    listed below. The ``date`` column (written with dots in the files, e.g.
    ``1973.08.23``) is converted to datetimes, and the combined frame is
    sorted by date with a fresh 0..n-1 index.

    Parameters
    ----------
    ranking_file_path : str
        Directory that contains the ranking CSV files. A trailing path
        separator is accepted but not required.

    Returns
    -------
    pandas.DataFrame
        All rows from all files, sorted by ``date``. If the directory holds
        no files, an empty frame with the expected columns is returned.
    """
    col_names = ['date',
                 'year',
                 'month',
                 'day',
                 'rank_text',
                 'rank_number',
                 'movement',
                 'direction',
                 'age',
                 'points',
                 'tournaments_played',
                 'player_profile',
                 'player_name',
                 'player_id']

    # Sort the listing so the result does not depend on the arbitrary order
    # os.listdir returns, and join paths properly instead of concatenating.
    candidate_paths = [os.path.join(ranking_file_path, file_name)
                       for file_name in sorted(os.listdir(ranking_file_path))]

    # Skip sub-directories; read each file once and concatenate a single time
    # (concatenating inside the loop copies the growing frame every pass).
    frames = [pd.read_csv(path, header = None, names = col_names)
              for path in candidate_paths
              if os.path.isfile(path)]

    if frames:
        ranking_df = pd.concat(frames)
    else:
        ranking_df = pd.DataFrame(columns = col_names)

    # Dates are stored as 'YYYY.MM.DD'; swap the dots for slashes before parsing.
    ranking_df['date'] = ranking_df['date'].replace(r'\.', '/', regex = True)
    ranking_df['date'] = pd.to_datetime(ranking_df['date'])

    # A stable sort keeps rows that share a date in their file order.
    ranking_df = ranking_df.sort_values(by = 'date', kind = 'mergesort')
    ranking_df = ranking_df.reset_index(drop = True)

    return ranking_df

# Directory holding the ranking CSV files. This is an absolute,
# machine-specific path: change it to wherever the files live locally.
ranking_file_path = 'C:/Users/kchu8/Desktop/STA160/DataFiles/'
# Read every file in that directory into one date-sorted DataFrame.
ranking_df = ranking_df_creator(ranking_file_path)

In [3]:
# Preview the first rows of the combined ranking table.
ranking_df.head()

Unnamed: 0,date,year,month,day,rank_text,rank_number,movement,direction,age,points,tournaments_played,player_profile,player_name,player_id
0,1973-08-23,1973,8,23,2,2,,,24.0,0,0,/en/players/manuel-orantes/o017/overview,manuel-orantes,o017
1,1973-08-23,1973,8,23,67,67,,,33.0,0,0,/en/players/bob%20-carmichael/c080/overview,bob%20-carmichael,c080
2,1973-08-23,1973,8,23,66,66,,,31.0,0,0,/en/players/juan-gisbert%20sr/g076/overview,juan-gisbert%20sr,g076
3,1973-08-23,1973,8,23,65,65,,,26.0,0,0,/en/players/gerald-battrick/b124/overview,gerald-battrick,b124
4,1973-08-23,1973,8,23,64,64,,,29.0,0,0,/en/players/charlie-pasarell/p072/overview,charlie-pasarell,p072


In [4]:
def _rankings_for_player(rankings, name_fragment):
    """Return the rows of ``rankings`` whose ``player_name`` contains ``name_fragment``.

    Rows with a missing ``player_name`` are treated as non-matching
    (``na=False``). Without that, a single missing name puts NaN into the
    boolean mask and indexing with it raises.
    """
    is_player = rankings['player_name'].str.contains(name_fragment, na = False)
    return rankings[is_player]


# One ranking history per member of the "big four".
federer_df = _rankings_for_player(ranking_df, 'roger-federer')
djokovic_df = _rankings_for_player(ranking_df, 'novak-djokovic')
nadal_df = _rankings_for_player(ranking_df, 'rafael-nadal')
murray_df = _rankings_for_player(ranking_df, 'andy-murray')

In [5]:
def _months_back_button(months, label):
    """Build one range-selector button that zooms to the last ``months`` months."""
    return {
        'count': months,
        'label': label,
        'step': 'month',
        'stepmode': 'backward',
    }


# Figure layout: a date x-axis with quick zoom buttons and a range slider.
layout = {
    'title': 'Big Four Points Throughout Career',
    'xaxis': {
        'rangeselector': {
            'buttons': [
                _months_back_button(60, '5 year'),
                _months_back_button(36, '3 year'),
                _months_back_button(12, '1 year'),
                {'step': 'all'},
            ],
        },
        'rangeslider': {'visible': True},
        'type': 'date',
    },
}

# One line per player: ranking date on x, ranking points on y.
trace1, trace2, trace3, trace4 = [
    go.Scatter(x = player_df['date'], y = player_df['points'], name = display_name)
    for player_df, display_name in (
        (federer_df, 'Roger Federer'),
        (djokovic_df, 'Novak Djokovic'),
        (nadal_df, 'Rafael Nadal'),
        (murray_df, 'Andy Murray'),
    )
]

fig = go.Figure(data = [trace1, trace2, trace3, trace4], layout = layout)
iplot(fig)

In [6]:
# Load the match-charting overview stats. The path is relative, so the CSV
# must sit in the notebook's working directory.
stats_overview_df = pd.read_csv('charting-m-stats-Overview.csv')
# Preview the first rows.
stats_overview_df.head()

Unnamed: 0,match_id,player,set,serve_pts,aces,dfs,first_in,first_won,second_in,second_won,bk_pts,bp_saved,return_pts,return_pts_won,winners,winners_fh,winners_bh,unforced,unforced_fh,unforced_bh
0,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,Total,69,2,1,32,25,37,21,5,4,63,36,28,10,16,16,6,9
1,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,2,Total,63,2,2,41,20,22,7,12,4,69,23,14,8,4,26,15,9
2,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,1,26,2,0,15,10,11,6,2,1,18,13,10,3,5,6,2,4
3,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,2,1,18,0,0,11,3,7,2,3,0,26,10,4,2,2,10,7,3
4,19751219-M-Davis_Cup_World_Group_F-RR-Bjorn_Bo...,1,2,24,0,0,10,9,14,8,1,1,26,10,9,3,6,6,2,4


# Works for the last 15 rows, but would take around an hour to run on the entire file

In [63]:
# %%time

def _parse_match_id(match_id):
    """Split one match id into ``(date_token, player1, player2)``.

    Ids are expected to look like
    ``20010707-M-Wimbledon-SF-Goran_Ivanisevic-Tim_Henman``: hyphens separate
    the fields and underscores separate the words inside a field. The last two
    fields are taken as the players, so names of any length survive
    (``Juan_Martin_Del_Potro`` -> ``juan-martin-del-potro``).

    Ids that contain no underscore at all are assumed to have been normalized
    already (underscores replaced by hyphens); for those, each player is taken
    to be exactly two tokens.

    NOTE(review): a player name that itself contains a hyphen in the raw id
    would still be split incorrectly -- confirm the source data never does that.
    """
    normalized = str(match_id).lower()
    fields = normalized.split('-')

    if '_' in normalized:
        player1 = fields[-2].replace('_', '-')
        player2 = fields[-1].replace('_', '-')
    else:
        player1 = '-'.join(fields[-4:-2])
        player2 = '-'.join(fields[-2:])

    return fields[0], player1, player2


def _first_rank_in_month(ranking_df, players):
    """Map ``(player_name, year, month)`` to the first listed ``rank_number``.

    "First" means first in ``ranking_df`` row order, so with a date-sorted
    ranking frame this is the earliest ranking published in that month.
    Only the given players are kept, so the lookup stays small.
    """
    relevant = ranking_df[ranking_df['player_name'].isin(list(players))]
    first_rows = relevant.drop_duplicates(subset = ['player_name', 'year', 'month'],
                                          keep = 'first')
    keys = zip(first_rows['player_name'], first_rows['year'], first_rows['month'])
    return dict(zip(keys, first_rows['rank_number']))


def player_split(df, ranking_df):
    """Add player, date and ranking columns to a match-charting frame.

    Works for any number of rows and does not modify the frame passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Match-charting data with a string ``match_id`` column whose first
        field is the match date as ``YYYYMMDD`` and whose last two fields are
        the two players.
    ranking_df : pandas.DataFrame
        Ranking data with ``player_name``, ``year``, ``month`` and
        ``rank_number`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df``, sorted by match date with a fresh index, where
        ``match_id`` is lower-cased with underscores replaced by hyphens, and
        with the added columns ``player1``, ``player2``, ``date``, ``year``,
        ``month``, ``player1_rank`` and ``player2_rank``. A rank is the first
        ranking listed for that player in the match's calendar month, or NaN
        when the ranking data has no entry for that player and month.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Parse the raw ids before normalizing them: the underscores are what
    # distinguish "words inside a name" from "separate fields".
    parsed = [_parse_match_id(match_id) for match_id in df['match_id']]
    date_tokens = [fields[0] for fields in parsed]
    player1 = [fields[1] for fields in parsed]
    player2 = [fields[2] for fields in parsed]
    years = [int(token[:4]) for token in date_tokens]
    months = [int(token[4:6]) for token in date_tokens]

    # Normalize the whole column once (not once per row).
    df['match_id'] = df['match_id'].str.lower()
    df['match_id'] = df['match_id'].replace('_', '-', regex = True)

    df['player1'] = player1
    df['player2'] = player2
    df['date'] = pd.to_datetime(['-'.join([token[:4], token[4:6], token[6:]])
                                 for token in date_tokens])
    df['year'] = years
    df['month'] = months
    df['year'] = df['year'].astype(int)
    df['month'] = df['month'].astype(int)

    # Look the ranks up per row *before* sorting, so each rank travels with
    # its own row and no after-the-fact reordering is needed.
    rank_lookup = _first_rank_in_month(ranking_df, set(player1) | set(player2))
    not_ranked = float('nan')
    df['player1_rank'] = [rank_lookup.get(key, not_ranked)
                          for key in zip(player1, years, months)]
    df['player2_rank'] = [rank_lookup.get(key, not_ranked)
                          for key in zip(player2, years, months)]

    # A stable sort keeps rows of the same match in their original order.
    df = df.sort_values(by = 'date', kind = 'mergesort')
    df = df.reset_index(drop = True)

    return df
    
# Try the cleaner on a small sample: the last 15 rows of the overview stats,
# re-indexed from 0.
test_df = stats_overview_df.tail(15).reset_index(drop = True)
cleaned_test_df = player_split(test_df, ranking_df)
# Preview the first rows of the cleaned sample.
cleaned_test_df.head()

Unnamed: 0,match_id,player,set,serve_pts,aces,dfs,first_in,first_won,second_in,second_won,...,unforced,unforced_fh,unforced_bh,player1,player2,date,year,month,player1_rank,player2_rank
0,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,Total,172,37,13,107,86,65,29,...,45,9,23,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
1,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,2,Total,154,7,7,97,75,57,36,...,23,4,12,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
2,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,1,40,5,2,24,21,16,7,...,7,1,4,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
3,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,2,1,36,2,2,23,16,13,8,...,5,1,2,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
4,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,2,40,11,3,26,23,14,7,...,9,1,5,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11


In [64]:
# Display the whole cleaned sample frame.
cleaned_test_df

Unnamed: 0,match_id,player,set,serve_pts,aces,dfs,first_in,first_won,second_in,second_won,...,unforced,unforced_fh,unforced_bh,player1,player2,date,year,month,player1_rank,player2_rank
0,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,Total,172,37,13,107,86,65,29,...,45,9,23,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
1,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,2,Total,154,7,7,97,75,57,36,...,23,4,12,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
2,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,1,40,5,2,24,21,16,7,...,7,1,4,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
3,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,2,1,36,2,2,23,16,13,8,...,5,1,2,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
4,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,2,40,11,3,26,23,14,7,...,9,1,5,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
5,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,2,2,42,3,0,30,23,12,8,...,6,1,5,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
6,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,3,16,2,1,10,2,6,2,...,9,3,5,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
7,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,2,3,12,0,0,6,6,6,6,...,1,0,1,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
8,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,1,4,45,13,3,31,26,14,6,...,12,3,6,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
9,20010707-m-wimbledon-sf-goran-ivanisevic-tim-h...,2,4,39,1,2,25,20,14,9,...,4,1,1,goran-ivanisevic,tim-henman,2001-07-07,2001,7,125,11
