# 300_prep

## Purpose
* In this notebook we will;
    * identify and create the final, question-specific dataframes for analysis in RQ2
    * save all dataframes in an appropriate folder

## Datasets
* The data in this notebook;
    * Men's Singles Matches from 1968 to 2017.
    * Women's Singles matches from 2000 to 2016.
    * Men's Singles Matches from 2003 to 2014.
* These datasets have been cleaned and are now in appropriate dataframes. They are used in this notebook to create the question-specific dataframes for Research Question 2; "What factors impact the success of a top player?".

In [1]:
import os
import sys
import hashlib
import numpy as np
import pandas as pd
from datetime import datetime
    
%matplotlib inline

In [2]:
# Reading each of the dataframes
atp_main = pd.read_csv("../data/atp_main", low_memory = False, index_col = 'tourney_date')
atp_main.index = pd.to_datetime(atp_main.index, format="%Y-%m-%d", errors='coerce')
atp_small = pd.read_csv("../data/atp_small", low_memory = False)
wta_dataset = pd.read_csv("../data/wta_dataset", low_memory = False)
men_and_women = pd.read_csv("../data/men_and_women", low_memory = False)

## Research Question 2 Preparation
### What factors impact the sucess of a top player?
* To carry out this analysis we need a dataframe of the top players and their corresponding average statistics.

In [3]:
# Using 2003 to 2014 Men's Singles and counting the number of wins
top_10_men = atp_small['winner_name'].value_counts().head(10)
top_10_men

Roger Federer        847
Rafael Nadal         712
Novak Djokovic       612
David Ferrer         597
Andy Roddick         512
Andy Murray          487
Tomas Berdych        485
Nikolay Davydenko    463
Tommy Robredo        429
Fernando Verdasco    404
Name: winner_name, dtype: int64

In [4]:
top_10_women = wta_dataset['winner_name'].value_counts().head(10)
top_10_women

Maria Sharapova        519
Jelena Jankovic        506
Serena Williams        475
Svetlana Kuznetsova    447
Caroline Wozniacki     415
Flavia Pennetta        412
Vera Zvonareva         407
Marion Bartoli         402
Nadia Petrova          402
Agnieszka Radwanska    401
Name: winner_name, dtype: int64

In [5]:
# rq2 is the dataframe for Research Question 2
joined_top_10 = [top_10_men, top_10_women]
rq2 = pd.concat(joined_top_10)

In [6]:
# Indexing rq2 dataframe
df = pd.DataFrame({'col':rq2}).reset_index()
df

Unnamed: 0,index,col
0,Roger Federer,847
1,Rafael Nadal,712
2,Novak Djokovic,612
3,David Ferrer,597
4,Andy Roddick,512
5,Andy Murray,487
6,Tomas Berdych,485
7,Nikolay Davydenko,463
8,Tommy Robredo,429
9,Fernando Verdasco,404


In [7]:
# Adding column names
df.columns = ['winner_name', 'matches_won']
df.columns

Index(['winner_name', 'matches_won'], dtype='object')

In [8]:
# Calculating the number of losses of a player
atp_small[(atp_small['loser_name'] == 'Roger Federer')]['tourney_id'].count()

137

In [9]:
# Total losses for each player counted
# Each value was inserted into this column after it was calculated
df['matches_lost']=(137, 141, 141, 289, 171, 154, 263, 298, 270, 282, 
                   135, 278, 88, 221, 167, 255, 200, 263, 233, 174)

In [10]:
# Adding values to the newly created columns
df['total_matches'] = df['matches_lost'] + df['matches_won']

In [11]:
df.head(5)

Unnamed: 0,winner_name,matches_won,matches_lost,total_matches
0,Roger Federer,847,137,984
1,Rafael Nadal,712,141,853
2,Novak Djokovic,612,141,753
3,David Ferrer,597,289,886
4,Andy Roddick,512,171,683


In [12]:
df['winning_perc'] = df['matches_won'] / df['total_matches'] 

In [13]:
df

Unnamed: 0,winner_name,matches_won,matches_lost,total_matches,winning_perc
0,Roger Federer,847,137,984,0.860772
1,Rafael Nadal,712,141,853,0.834701
2,Novak Djokovic,612,141,753,0.812749
3,David Ferrer,597,289,886,0.673815
4,Andy Roddick,512,171,683,0.749634
5,Andy Murray,487,154,641,0.75975
6,Tomas Berdych,485,263,748,0.648396
7,Nikolay Davydenko,463,298,761,0.60841
8,Tommy Robredo,429,270,699,0.613734
9,Fernando Verdasco,404,282,686,0.588921


## Calculating the average statistics of a player
* In the "df" dataframe, we have 5 columns filled with values that we have calculated. 
* In addition to this we need to include their average statistics per match played.

In [14]:
# Create a small dataframe where column "winner_name" is assigned a player from the table in the above cell
# Locate his/her winning statistic columns (not losing columns), which are columns 31 through 40
# Calculate average
# Adjust indexing 
# Concat all these small dataframes 
a = atp_small[(atp_small['winner_name'] == 'Roger Federer')].iloc[:,31:40].mean()
a = pd.DataFrame(a).reset_index().set_index('index').T

In [15]:
b = atp_small[(atp_small['winner_name'] == 'Rafael Nadal')].iloc[:,31:40].mean()
b = pd.DataFrame(b).reset_index().set_index('index').T

In [16]:
c = atp_small[(atp_small['winner_name'] == 'Novak Djokovic')].iloc[:,31:40].mean()
c = pd.DataFrame(c).reset_index().set_index('index').T

In [17]:
d = atp_small[(atp_small['winner_name'] == 'David Ferrer')].iloc[:,31:40].mean()
d = pd.DataFrame(d).reset_index().set_index('index').T

In [18]:
e = atp_small[(atp_small['winner_name'] == 'Andy Roddick')].iloc[:,31:40].mean()
e = pd.DataFrame(e).reset_index().set_index('index').T

In [19]:
f = atp_small[(atp_small['winner_name'] == 'Andy Murray')].iloc[:,31:40].mean()
f = pd.DataFrame(f).reset_index().set_index('index').T

In [20]:
g = atp_small[(atp_small['winner_name'] == 'Tomas Berdych')].iloc[:,31:40].mean()
g = pd.DataFrame(g).reset_index().set_index('index').T

In [21]:
h = atp_small[(atp_small['winner_name'] == 'Nikolay Davydenko')].iloc[:,31:40].mean()
h = pd.DataFrame(h).reset_index().set_index('index').T

In [22]:
i = atp_small[(atp_small['winner_name'] == 'Tommy Robredo')].iloc[:,31:40].mean()
i = pd.DataFrame(i).reset_index().set_index('index').T

In [23]:
j = atp_small[(atp_small['winner_name'] == 'Fernando Verdasco')].iloc[:,31:40].mean()
j = pd.DataFrame(j).reset_index().set_index('index').T

In [24]:
k = wta_dataset[(wta_dataset['winner_name'] == 'Maria Sharapova')].iloc[:,31:40].mean()
k = pd.DataFrame(k).reset_index().set_index('index').T

In [25]:
l = wta_dataset[(wta_dataset['winner_name'] == 'Jelena Jankovic')].iloc[:,31:40].mean()
l = pd.DataFrame(l).reset_index().set_index('index').T

In [26]:
m = wta_dataset[(wta_dataset['winner_name'] == 'Serena Williams')].iloc[:,31:40].mean()
m = pd.DataFrame(m).reset_index().set_index('index').T

In [27]:
n = wta_dataset[(wta_dataset['winner_name'] == 'Svetlana Kuznetsova')].iloc[:,31:40].mean()
n = pd.DataFrame(n).reset_index().set_index('index').T

In [28]:
o = wta_dataset[(wta_dataset['winner_name'] == 'Caroline Wozniacki')].iloc[:,31:40].mean()
o = pd.DataFrame(o).reset_index().set_index('index').T

In [29]:
p = wta_dataset[(wta_dataset['winner_name'] == 'Flavia Pennetta')].iloc[:,31:40].mean()
p = pd.DataFrame(p).reset_index().set_index('index').T

In [30]:
q = wta_dataset[(wta_dataset['winner_name'] == 'Vera Zvonareva')].iloc[:,31:40].mean()
q = pd.DataFrame(q).reset_index().set_index('index').T

In [31]:
r = wta_dataset[(wta_dataset['winner_name'] == 'Marion Bartoli')].iloc[:,31:40].mean()
r = pd.DataFrame(r).reset_index().set_index('index').T

In [32]:
s = wta_dataset[(wta_dataset['winner_name'] == 'Nadia Petrova')].iloc[:,31:40].mean()
s = pd.DataFrame(s).reset_index().set_index('index').T

In [33]:
t = wta_dataset[(wta_dataset['winner_name'] == 'Agnieszka Radwanska')].iloc[:,31:40].mean()
t = pd.DataFrame(t).reset_index().set_index('index').T

In [34]:
# Concat all these small dataframes
stats = pd.concat([a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t]).reset_index().drop('index', 1)
stats.head()

index,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced
0,7.775434,1.496278,73.7134,46.308933,36.566998,16.404467,12.477667,2.470223,3.42928
1,3.057018,1.394737,70.70614,48.704678,35.869883,13.017544,11.69883,3.038012,4.30848
2,5.762565,2.124783,75.376083,48.790295,36.60312,15.126516,12.343154,3.15078,4.592721
3,3.095238,2.142857,74.359788,47.696649,34.405644,14.977072,11.869489,3.518519,5.250441
4,12.588358,1.659044,72.862786,48.399168,39.432432,14.56341,12.503119,2.087318,2.794179


## Dataframes
### We have 2 dataframes which we will now join;
### df -> top 10 Men and Women players where we added the "matches_won", "matches_lost", "total_matches" and 	"winning_perc" columns.
### stats -> the average statistics for each player in the df dataframe.

In [35]:
# Join names dataframe with mean statistics
RQ2 = pd.concat([df, stats], axis = 1)
RQ2

Unnamed: 0,winner_name,matches_won,matches_lost,total_matches,winning_perc,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced
0,Roger Federer,847,137,984,0.860772,7.775434,1.496278,73.7134,46.308933,36.566998,16.404467,12.477667,2.470223,3.42928
1,Rafael Nadal,712,141,853,0.834701,3.057018,1.394737,70.70614,48.704678,35.869883,13.017544,11.69883,3.038012,4.30848
2,Novak Djokovic,612,141,753,0.812749,5.762565,2.124783,75.376083,48.790295,36.60312,15.126516,12.343154,3.15078,4.592721
3,David Ferrer,597,289,886,0.673815,3.095238,2.142857,74.359788,47.696649,34.405644,14.977072,11.869489,3.518519,5.250441
4,Andy Roddick,512,171,683,0.749634,12.588358,1.659044,72.862786,48.399168,39.432432,14.56341,12.503119,2.087318,2.794179
5,Andy Murray,487,154,641,0.75975,7.134199,2.313853,75.510823,43.852814,33.679654,17.199134,12.287879,3.229437,4.911255
6,Tomas Berdych,485,263,748,0.648396,8.473451,2.20354,73.090708,42.988938,34.495575,16.893805,12.050885,2.942478,4.017699
7,Nikolay Davydenko,463,298,761,0.60841,3.550336,2.322148,71.626398,48.44519,34.888143,12.747204,11.458613,3.579418,5.263982
8,Tommy Robredo,429,270,699,0.613734,4.353081,2.13981,78.277251,51.094787,37.260664,15.116114,12.492891,3.64218,5.414692
9,Fernando Verdasco,404,282,686,0.588921,5.44557,3.564557,75.903797,52.473418,38.675949,12.929114,12.303797,3.592405,5.174684


In [36]:
RQ2['w_df'].mean()

2.5605392197875054

## Loser table
### Exact same operations for the top 10 "worst" players


In [37]:
mens01 = atp_small['loser_name'].value_counts().head()

In [38]:
womens01 = wta_dataset['loser_name'].value_counts().head(30)

In [39]:
both2 = [mens01, womens01]
rq21 = pd.concat(both2)

In [40]:
loser1 = pd.DataFrame({'col':rq21}).reset_index()
loser1.head()

Unnamed: 0,index,col
0,Feliciano Lopez,311
1,Jarkko Nieminen,301
2,Jurgen Melzer,300
3,Nikolay Davydenko,298
4,David Ferrer,289


In [41]:
loser1.columns = ['loser_name', 'matches_lost']

In [42]:
men_and_women[(men_and_women['winner_name'] == 'Agnieszka Radwanska')]['tourney_id'].count()

401

In [43]:
loser1['matches_won']=(348, 359, 326, 463, 597, 399, 404, 264, 429, 330, 485, 347, 218, 243, 223, 197, 305, 210, 260, 333,
                       236, 200, 380, 349, 263, 180, 341, 252, 320, 217,   
                   391, 380, 506, 295, 402, 257, 412, 365, 402, 288, 174, 135, 447, 294, 249, 275, 247, 314, 230, 407,
                      257, 189, 147, 393, 177, 275, 204, 159, 189, 401)

In [44]:
loser1['total_matches'] = loser1['matches_lost'] + loser1['matches_won']

In [45]:
loser1['losing_perc'] = loser1['matches_lost'] / loser1['total_matches'] 

In [46]:
loser1

Unnamed: 0,loser_name,matches_lost,matches_won,total_matches,losing_perc
0,Feliciano Lopez,311,348,659,0.471927
1,Jarkko Nieminen,301,359,660,0.456061
2,Jurgen Melzer,300,326,626,0.479233
3,Nikolay Davydenko,298,463,761,0.39159
4,David Ferrer,289,597,886,0.326185
5,Mikhail Youzhny,288,399,687,0.419214
6,Andreas Seppi,282,404,686,0.411079
7,Fernando Verdasco,282,264,546,0.516484
8,Tommy Robredo,270,429,699,0.386266
9,Philipp Kohlschreiber,264,330,594,0.444444


In [47]:
loser1 = loser1[(loser1['losing_perc'] > .472)]

In [48]:
loser1

Unnamed: 0,loser_name,matches_lost,matches_won,total_matches,losing_perc
2,Jurgen Melzer,300,326,626,0.479233
7,Fernando Verdasco,282,264,546,0.516484
12,Guillermo Garcia Lopez,250,218,468,0.534188
13,Julien Benneteau,249,243,492,0.506098
14,Paul Henri Mathieu,244,223,467,0.522484
15,Victor Hanescu,241,197,438,0.550228
17,Juan Monaco,237,210,447,0.530201
20,Gilles Simon,229,236,465,0.492473
21,Olivier Rochus,229,200,429,0.5338
25,Filippo Volandri,218,180,398,0.547739


In [49]:
a1 = atp_small[(atp_small['loser_name'] == 'Andreas Seppi')].iloc[:,40:49].mean()
a1 = pd.DataFrame(a1).reset_index().set_index('index').T

In [50]:
b1 = atp_small[(atp_small['loser_name'] == 'Guillermo Garcia Lopez')].iloc[:,40:49].mean()
b1 = pd.DataFrame(b1).reset_index().set_index('index').T

In [51]:
c1 = atp_small[(atp_small['loser_name'] == 'Julien Benneteau')].iloc[:,40:49].mean()
c1 = pd.DataFrame(c1).reset_index().set_index('index').T

In [52]:
d1 = atp_small[(atp_small['loser_name'] == 'Paul Henri Mathieu')].iloc[:,40:49].mean()
d1 = pd.DataFrame(d1).reset_index().set_index('index').T

In [53]:
e1 = atp_small[(atp_small['loser_name'] == 'Victor Hanescu')].iloc[:,40:49].mean()
e1 = pd.DataFrame(e1).reset_index().set_index('index').T

In [54]:
f1 = atp_small[(atp_small['loser_name'] == 'Albert Montanes')].iloc[:,40:49].mean()
f1 = pd.DataFrame(f1).reset_index().set_index('index').T

In [55]:
g1 = atp_small[(atp_small['loser_name'] == 'Olivier Rochus')].iloc[:,40:49].mean()
g1 = pd.DataFrame(g1).reset_index().set_index('index').T

In [56]:
h1 = atp_small[(atp_small['loser_name'] == 'Gilles Simon')].iloc[:,40:49].mean()
h1 = pd.DataFrame(h1).reset_index().set_index('index').T

In [57]:
i1 = atp_small[(atp_small['loser_name'] == 'Janko Tipsarevic')].iloc[:,40:49].mean()
i1 = pd.DataFrame(i1).reset_index().set_index('index').T

In [58]:
j1 = atp_small[(atp_small['loser_name'] == 'Florian Mayer')].iloc[:,40:49].mean()
j1 = pd.DataFrame(j1).reset_index().set_index('index').T

In [59]:
k1 = wta_dataset[(wta_dataset['loser_name'] == 'Anabel Medina Garrigues')].iloc[:,40:49].mean()
k1 = pd.DataFrame(k1).reset_index().set_index('index').T

In [60]:
l1 = wta_dataset[(wta_dataset['loser_name'] == 'Klara Koukalova')].iloc[:,40:49].mean()
l1 = pd.DataFrame(l1).reset_index().set_index('index').T

In [61]:
m1 = wta_dataset[(wta_dataset['loser_name'] == 'Iveta Benesova')].iloc[:,40:49].mean()
m1 = pd.DataFrame(m1).reset_index().set_index('index').T

In [62]:
n1 = wta_dataset[(wta_dataset['loser_name'] == 'Svetlana Kuznetsova')].iloc[:,40:49].mean()
n1 = pd.DataFrame(n1).reset_index().set_index('index').T

In [63]:
o1 = wta_dataset[(wta_dataset['loser_name'] == 'Elena Vesnina')].iloc[:,40:49].mean()
o1 = pd.DataFrame(o1).reset_index().set_index('index').T

In [64]:
p1 = wta_dataset[(wta_dataset['loser_name'] == 'Tsvetana Pironkova')].iloc[:,40:49].mean()
p1 = pd.DataFrame(p1).reset_index().set_index('index').T

In [65]:
q1 = wta_dataset[(wta_dataset['loser_name'] == 'Virginie Razzano')].iloc[:,40:49].mean()
q1 = pd.DataFrame(q1).reset_index().set_index('index').T

In [66]:
r1 = wta_dataset[(wta_dataset['loser_name'] == 'Gisela Dulko')].iloc[:,40:49].mean()
r1 = pd.DataFrame(r1).reset_index().set_index('index').T

In [67]:
s1 = wta_dataset[(wta_dataset['loser_name'] == 'Eleni Daniilidou')].iloc[:,40:49].mean()
s1 = pd.DataFrame(s1).reset_index().set_index('index').T

In [68]:
t1 = wta_dataset[(wta_dataset['loser_name'] == 'Alize Cornet')].iloc[:,40:49].mean()
t1 = pd.DataFrame(t1).reset_index().set_index('index').T

In [84]:
stats1 = pd.concat([a1,b1,c1,d1,e1,f1,g1,h1,i1,j1,k1,l1,m1,n1,o1,p1,q1,r1,s1,t1]).reset_index().drop('index', 1)
stats1.head()

index,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,3.847015,3.052239,78.940299,45.085821,29.843284,15.873134,11.94403,4.757463,8.593284
1,2.955823,4.188755,78.634538,46.698795,30.064257,14.293173,11.839357,4.891566,9.056225
2,4.788618,3.552846,79.853659,51.215447,33.353659,12.51626,12.170732,4.605691,8.638211
3,4.021097,3.362869,78.341772,42.696203,28.78481,17.168776,11.881857,4.472574,8.067511
4,4.276018,1.80543,81.642534,56.402715,37.022624,11.638009,12.425339,4.628959,8.244344


In [85]:
loser1

Unnamed: 0,loser_name,matches_lost,matches_won,total_matches,losing_perc
2,Jurgen Melzer,300,326,626,0.479233
7,Fernando Verdasco,282,264,546,0.516484
12,Guillermo Garcia Lopez,250,218,468,0.534188
13,Julien Benneteau,249,243,492,0.506098
14,Paul Henri Mathieu,244,223,467,0.522484
15,Victor Hanescu,241,197,438,0.550228
17,Juan Monaco,237,210,447,0.530201
20,Gilles Simon,229,236,465,0.492473
21,Olivier Rochus,229,200,429,0.5338
25,Filippo Volandri,218,180,398,0.547739


In [86]:
stats1

index,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,3.847015,3.052239,78.940299,45.085821,29.843284,15.873134,11.94403,4.757463,8.593284
1,2.955823,4.188755,78.634538,46.698795,30.064257,14.293173,11.839357,4.891566,9.056225
2,4.788618,3.552846,79.853659,51.215447,33.353659,12.51626,12.170732,4.605691,8.638211
3,4.021097,3.362869,78.341772,42.696203,28.78481,17.168776,11.881857,4.472574,8.067511
4,4.276018,1.80543,81.642534,56.402715,37.022624,11.638009,12.425339,4.628959,8.244344
5,2.729958,3.679325,74.177215,43.85654,28.122363,13.603376,11.270042,4.57384,8.658228
6,1.411215,2.724299,80.042056,48.962617,30.200935,14.691589,11.85514,5.158879,9.317757
7,3.841629,2.303167,78.701357,43.9819,28.438914,16.39819,11.936652,4.904977,9.067873
8,5.916256,2.812808,79.871921,44.758621,31.064039,16.35468,12.054187,4.596059,8.108374
9,3.291457,2.045226,72.61809,43.929648,28.020101,12.79397,11.115578,4.050251,8.050251


In [89]:
RQ21 = pd.concat([loser1, stats1], axis = 1)
RQ21

Unnamed: 0,loser_name,matches_lost,matches_won,total_matches,losing_perc,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,,,,,,3.847015,3.052239,78.940299,45.085821,29.843284,15.873134,11.94403,4.757463,8.593284
1,,,,,,2.955823,4.188755,78.634538,46.698795,30.064257,14.293173,11.839357,4.891566,9.056225
2,Jurgen Melzer,300.0,326.0,626.0,0.479233,4.788618,3.552846,79.853659,51.215447,33.353659,12.51626,12.170732,4.605691,8.638211
3,,,,,,4.021097,3.362869,78.341772,42.696203,28.78481,17.168776,11.881857,4.472574,8.067511
4,,,,,,4.276018,1.80543,81.642534,56.402715,37.022624,11.638009,12.425339,4.628959,8.244344
5,,,,,,2.729958,3.679325,74.177215,43.85654,28.122363,13.603376,11.270042,4.57384,8.658228
6,,,,,,1.411215,2.724299,80.042056,48.962617,30.200935,14.691589,11.85514,5.158879,9.317757
7,Fernando Verdasco,282.0,264.0,546.0,0.516484,3.841629,2.303167,78.701357,43.9819,28.438914,16.39819,11.936652,4.904977,9.067873
8,,,,,,5.916256,2.812808,79.871921,44.758621,31.064039,16.35468,12.054187,4.596059,8.108374
9,,,,,,3.291457,2.045226,72.61809,43.929648,28.020101,12.79397,11.115578,4.050251,8.050251


In [72]:
RQ21.to_csv('../data/new_loser_table', index=False)

In [74]:
rq21

Feliciano Lopez            311
Jarkko Nieminen            301
Jurgen Melzer              300
Nikolay Davydenko          298
David Ferrer               289
Mikhail Youzhny            288
Andreas Seppi              282
Fernando Verdasco          282
Tommy Robredo              270
Philipp Kohlschreiber      264
Tomas Berdych              263
Radek Stepanek             262
Guillermo Garcia Lopez     250
Julien Benneteau           249
Paul Henri Mathieu         244
Victor Hanescu             241
Albert Montanes            237
Juan Monaco                237
Ivo Karlovic               232
Igor Andreev               229
Gilles Simon               229
Olivier Rochus             229
Richard Gasquet            227
Nicolas Almagro            224
Janko Tipsarevic           218
Filippo Volandri           218
Stanislas Wawrinka         216
James Blake                215
Juan Ignacio Chela         215
Florian Mayer              211
Daniela Hantuchova         298
Francesca Schiavone        290
Jelena J

In [None]:
#mens01 = atp_small['loser_name'].value_counts().head(30)

In [None]:
#womens01 = wta_dataset['loser_name'].value_counts().head(30)

In [None]:
#both2 = [mens01, womens01]
#rq21 = pd.concat(both2)

In [None]:
#loser1 = pd.DataFrame({'col':rq21}).reset_index()
#loser1

In [None]:
#loser1.columns = ['loser_name', 'matches_lost']

In [None]:
#men_and_women[(men_and_women['winner_name'] == 'Feliciano Lopez')]['tourney_id'].count()

In [None]:
#loser1['matches_won']=(348, 359, 326, 463, 597, 399, 404, 264, 429, 330,391, 380, 506, 295, 402, 257, 412, 365, 402, 288)

In [None]:
#loser1['total_matches'] = loser1['matches_lost'] + loser1['matches_won']

In [None]:
#loser1['losing_perc'] = loser1['matches_lost'] / loser1['total_matches'] 

In [None]:
#loser1.head()

In [None]:
#a1 = atp_small[(atp_small['loser_name'] == 'Feliciano Lopez')].iloc[:,40:49].mean()
#a1 = pd.DataFrame(a1).reset_index().set_index('index').T

In [None]:
#b1 = atp_small[(atp_small['loser_name'] == 'Jarkko Nieminen')].iloc[:,40:49].mean()
#b1 = pd.DataFrame(b1).reset_index().set_index('index').T

In [None]:
#c1 = atp_small[(atp_small['loser_name'] == 'Jurgen Melzer')].iloc[:,40:49].mean()
#c1 = pd.DataFrame(c1).reset_index().set_index('index').T

In [None]:
#d1 = atp_small[(atp_small['loser_name'] == 'Nikolay Davydenko')].iloc[:,40:49].mean()
#d1 = pd.DataFrame(d1).reset_index().set_index('index').T

In [None]:
#e1 = atp_small[(atp_small['loser_name'] == 'David Ferrer')].iloc[:,40:49].mean()
#e1 = pd.DataFrame(e1).reset_index().set_index('index').T

In [None]:
#f1 = atp_small[(atp_small['loser_name'] == 'Mikhail Youzhny')].iloc[:,40:49].mean()
#f1 = pd.DataFrame(f1).reset_index().set_index('index').T

In [None]:
#g1 = atp_small[(atp_small['loser_name'] == 'Fernando Verdasco')].iloc[:,40:49].mean()
#g1 = pd.DataFrame(g1).reset_index().set_index('index').T

In [None]:
#h1 = atp_small[(atp_small['loser_name'] == 'Andreas Seppi')].iloc[:,40:49].mean()
#h1 = pd.DataFrame(h1).reset_index().set_index('index').T

In [None]:
#i1 = atp_small[(atp_small['loser_name'] == 'Tommy Robredo')].iloc[:,40:49].mean()
#i1 = pd.DataFrame(i1).reset_index().set_index('index').T

In [None]:
#j1 = atp_small[(atp_small['loser_name'] == 'Philipp Kohlschreiber')].iloc[:,40:49].mean()
#j1 = pd.DataFrame(j1).reset_index().set_index('index').T

In [None]:
#k1 = wta_dataset[(wta_dataset['loser_name'] == 'Daniela Hantuchova')].iloc[:,40:49].mean()
#k1 = pd.DataFrame(k1).reset_index().set_index('index').T

In [None]:
#l1 = wta_dataset[(wta_dataset['loser_name'] == 'Francesca Schiavone')].iloc[:,40:49].mean()
#l1 = pd.DataFrame(l1).reset_index().set_index('index').T

In [None]:
#m1 = wta_dataset[(wta_dataset['loser_name'] == 'Jelena Jankovic')].iloc[:,40:49].mean()
#m1 = pd.DataFrame(m1).reset_index().set_index('index').T

In [None]:
#n1 = wta_dataset[(wta_dataset['loser_name'] == 'Anabel Medina Garrigues')].iloc[:,40:49].mean()
#n1 = pd.DataFrame(n1).reset_index().set_index('index').T

In [None]:
#o1 = wta_dataset[(wta_dataset['loser_name'] == 'Marion Bartoli')].iloc[:,40:49].mean()
#o1 = pd.DataFrame(o1).reset_index().set_index('index').T

In [None]:
#p1 = wta_dataset[(wta_dataset['loser_name'] == 'Klara Koukalova')].iloc[:,40:49].mean()
#p1 = pd.DataFrame(p1).reset_index().set_index('index').T

In [None]:
#q1 = wta_dataset[(wta_dataset['loser_name'] == 'Flavia Pennetta')].iloc[:,40:49].mean()
#q1 = pd.DataFrame(q1).reset_index().set_index('index').T

In [None]:
#r1 = wta_dataset[(wta_dataset['loser_name'] == 'Samantha Stosur')].iloc[:,40:49].mean()
#r1 = pd.DataFrame(r1).reset_index().set_index('index').T

In [None]:
#s1 = wta_dataset[(wta_dataset['loser_name'] == 'Nadia Petrova')].iloc[:,40:49].mean()
#s1 = pd.DataFrame(s1).reset_index().set_index('index').T

In [None]:
#t1 = wta_dataset[(wta_dataset['loser_name'] == 'Maria Kirilenko')].iloc[:,40:49].mean()
#t1 = pd.DataFrame(t1).reset_index().set_index('index').T

In [None]:
#stats1 = pd.concat([a1,b1,c1,d1,e1,f1,g1,h1,i1,j1,k1,l1,m1,n1,o1,p1,q1,r1,s1,t1]).reset_index().drop('index', 1)
#stats1.head()

In [None]:
#RQ2_losers = pd.concat([loser1, stats1], axis = 1)
#RQ2_losers

In [None]:
#RQ2_losers['l_df'].mean()

## Saving these 'master' dataframes

In [None]:
#RQ2.to_csv('../data/RQ2', index=False)

In [None]:
#RQ2_losers.to_csv('../data/RQ2_losers', index=False)