# Prep 500

## Purpose
In this notebook we will begin our prep for research question 3 - 'Can we predict the next young Grand Slam winners?'. 
Primarily, we will focus on creating 2 dataframes one containing the top winners under 23 from 2014-2017 and their average match statistics and the other dataframe containing all previous Grand Slam winners from 1991-present and their match statisics from when they were under 23.

## Datasets
* The data in this notebook;
    * Men's Singles Matches from 1968 to 2017.
* These datasets have been cleaned and are now in appropriate dataframes. They are used in this notebook to create the question-specific dataframes for Research Question 3; "Can we predict the next young Grand Slam winner?".

In [1]:
#importing relevant libraries
import os
import sys
import hashlib
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
 
%matplotlib inline

In [2]:
atp_main = pd.read_csv("../data/atp_main", low_memory = False)

## Grand Slam winner 1991 - present

Creating dataframe only containg from the years 1991 -present. We chose 1991 as the match statistics only start to be filled in in our dataset from 1991 to present.

In [3]:
atp_1990 = atp_main.loc[atp_main['match_year'] > 1990]

In [4]:
atp_1990.to_csv('../data/atp_1990', index = False, encoding='utf-8')

In [5]:
atp_df_1990 = pd.read_csv("../data/atp_1990", low_memory = False)

In [6]:
#grouping by winner name and winner age and getting mean of everything else in the df
winner_stats = atp_df_1990.groupby(['winner_name', 'winner_age']).mean()
#only using averaged columns 16-25
winner_stats = winner_stats.iloc[:,16:25].reset_index()
winner_stats.head()

Unnamed: 0,winner_name,winner_age,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced
0,Aaron Krickstein,23.0,4.75,2.0,98.25,59.25,42.0,21.0,15.75,4.75,8.0
1,Aaron Krickstein,24.0,5.035714,2.357143,88.678571,51.357143,37.642857,20.785714,13.821429,4.392857,6.607143
2,Aaron Krickstein,25.0,3.461538,1.269231,67.730769,38.0,27.730769,16.923077,10.961538,3.153846,5.076923
3,Aaron Krickstein,26.0,5.033333,2.033333,85.366667,50.3,36.2,18.866667,12.766667,5.3,7.666667
4,Aaron Krickstein,27.0,5.84375,1.875,90.0,54.3125,39.5625,20.03125,13.5,3.5625,5.5625


In [7]:
#winners under 24 years of age
winner_23under = winner_stats.loc[winner_stats['winner_age'] < 24]

In [8]:
#Getting average of each player by grouping winner name. Before we had average of each player each time they played 
#at a certain age
winner_23 = winner_23under.groupby(['winner_name']).mean().reset_index()
winner_23.head()

Unnamed: 0,winner_name,winner_age,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced
0,Aaron Krickstein,23.0,4.75,2.0,98.25,59.25,42.0,21.0,15.75,4.75,8.0
1,Abdelhak Hameurlaine,20.0,,,,,,,,,
2,Abdullah Maqdas,19.333333,,,,,,,,,
3,Adam Chadaj,21.0,1.0,5.0,89.0,53.0,37.0,16.0,14.0,6.0,9.0
4,Adam Kellner,22.0,,,,,,,,,


In [9]:
#Reading in list of all previous players who have won a Grand Slam from 1991 - present
list = ['Boris Becker', 'Jim Courier', 'Michael Stich', 'Stefan Edberg', 'Andre Agassi', 'Sergi Bruguera', 'Pete Sampras', 
       'Thomas Muster', 'Yevgeny Kafelnikov', 'Richard Krajicek', 'Gustavo Kuerten', 'Patrick Rafter', 'Petr Korda',
       'Carlos Moya', 'Marat Safin', 'Goran Ivanisvic', 'Lleyton Hewitt', 'Thomas Johansson', 'Albert Costa', 
        'Juan Carlos Ferrero', 'Roger Federer', 'Gastón Gaudio', 'Rafael Nadal', 'Novak Djokovic', 'Juan Martín del Potro',
       'Andy Murray', 'Stan Wawrinka', 'Marin Čilić']

In [10]:
#Checking if names in the list appear in dataframe
NamePositions = winner_23.isin(list)

In [11]:
#Showing indexes where a name in the list appears in the dataframe
NamePositions[NamePositions.winner_name].head()

Unnamed: 0,winner_name,winner_age,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced
17,True,False,False,False,False,False,False,False,False,False,False
74,True,False,False,False,False,False,False,False,False,False,False
102,True,False,False,False,False,False,False,False,False,False,False
162,True,False,False,False,False,False,False,False,False,False,False
195,True,False,False,False,False,False,False,False,False,False,False


In [12]:
#Creating dataframe from the given indexes above
grandSlam = winner_23.loc[[19,81,112,177,212,495,639,707,804,843,952,1036,1073,1090,1099,1128,1159,
                           1181,1243,1314,1317,1408]].reset_index()

In [13]:
#reseting index column
gs = grandSlam.drop('index', axis = 1)

In [14]:
#Dropping the winner age column
GrandSlamWinners = gs.drop('winner_age', axis=1)
GrandSlamWinners.head()

Unnamed: 0,winner_name,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced
0,Albert Portas,1.180556,1.277778,69.930556,47.680556,30.763889,12.958333,10.027778,4.5,6.25
1,Andreas Vinciguerra,4.610427,2.844558,84.466325,51.211937,38.706068,17.272821,13.397806,4.318319,6.430142
2,Aqeel Khan,,,,,,,,,
3,Bruno Abdel Nour,,,,,,,,,
4,Chris Guccione,15.82381,2.935714,79.882143,46.804762,39.197619,18.913095,13.282143,2.857143,3.684524


In [15]:
GrandSlamWinners.to_csv('../data/GrandSlamWinners', index = False, encoding='utf-8')

## Under 23 players 2014 - 2017

In [16]:
Winner_under23 = atp_1990.loc[atp_1990['winner_age'] < 23]

In [17]:
Winner_under23.to_csv('../data/Winner_under23', index = False, encoding='utf-8')

In [18]:
Winner_und23 = pd.read_csv("../data/Winner_under23", low_memory = False)

In [23]:
#Cutting the dataframe to show winners under 23 from 2014 forward
Winners_2014_23 = Winner_und23.loc[Winner_und23['match_year'] > 2013]
Winners_2014_23.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,match_year
17555,2014-301,Auckland,Hard,28,A,20140106,12,106058,,WC,...,0.0,1.0,51.0,34.0,24.0,9.0,9.0,1.0,3.0,2014.0
17556,2014-301,Auckland,Hard,28,A,20140106,20,106058,,WC,...,3.0,1.0,57.0,32.0,23.0,14.0,10.0,2.0,5.0,2014.0
17557,2014-403,Miami Masters,Hard,96,M,20140319,5,106058,,Q,...,8.0,1.0,45.0,26.0,19.0,12.0,9.0,1.0,3.0,2014.0
17558,2014-403,Miami Masters,Hard,96,M,20140319,8,105992,,WC,...,2.0,3.0,59.0,39.0,19.0,12.0,9.0,7.0,11.0,2014.0
17559,2014-403,Miami Masters,Hard,96,M,20140319,26,106210,,,...,1.0,3.0,72.0,43.0,32.0,14.0,11.0,8.0,10.0,2014.0


In [20]:
#Getting the mean accross the dataframe by grouping by winner name
Under23_14_17 = Winners_2014_23.groupby(['winner_name']).mean()
#only using wining stats columns 17-26
Under23_14_17 = Under23_14_17.iloc[:,17:26].reset_index()

In [21]:
Under23_14_17.to_csv('../data/Under23_14_17', index = False, encoding='utf-8')