<a href="https://colab.research.google.com/github/nullpitch-dev/DS_L1_Notebooks/blob/master/DS_L1_EX_15_2nd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Locations of the two CSV files used throughout the notebook
url_1 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/16Part1.csv'
url_2 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/16Part2.csv'

# Read both files into DataFrames (network access required)
data_1, data_2 = (pd.read_csv(source) for source in (url_1, url_2))

In [0]:
# [0] Shared preprocessing

# Keep only rows with a known Year of 1980 or later.
# The explicit .copy() makes dataA an independent frame, so the column
# assignment below does not trigger SettingWithCopyWarning and cannot be
# mistaken for a write into data_1.
dataA = data_1.dropna(subset=['Year'])
dataA = dataA[dataA.Year >= 1980].copy()

# Replace missing values with 0 in every column from position 6 onwards
targetCols = dataA.columns[6:]
dataA[targetCols] = dataA[targetCols].fillna(value=0)

# For data B, drop every row that has any missing value
dataB = data_2.dropna()
# convert height and weight
def convert(x):
    """Convert a 'feet-inches' height string (e.g. '6-2') to whole centimetres.

    Uses 30.5 cm per foot and 2.54 cm per inch and rounds with the built-in
    round(). NOTE(review): an exact foot is 30.48 cm; 30.5 is presumably the
    factor prescribed by the exercise -- confirm before changing it.

    Raises ValueError if x is not two integers separated by a single '-'.
    """
    feet, inch = map(int, x.split('-'))
    return round(feet * 30.5 + inch * 2.54)

# Convert height from 'feet-inches' text to centimetres, and divide weight by
# 2.2046 (presumably pounds -> kilograms), rounding to whole numbers
dataB = dataB.assign(height=dataB.height.apply(convert))
dataB = dataB.assign(weight=dataB.weight.apply(lambda x: round(x / 2.2046)))

# Remove players with the same name: a (Year, Player) pair showing more than
# one distinct Age is treated as several people sharing a name
players_dup = dataA.groupby(['Year', 'Player']).agg({'Age': 'nunique'})
players_dup = players_dup[players_dup.Age > 1]
# Collapse to one row per player; only the resulting index (names) is used,
# so the aggregation function itself does not matter
players_dup = players_dup.groupby('Player').agg({'Age': 'count'})
dataA = dataA[~dataA.Player.isin(players_dup.index)]

# Filter by Year condition: first and last Year per player
players_yMin = dataA.groupby('Player').agg({'Year': 'min'})
players_yMax = dataA.groupby('Player').agg({'Year': 'max'})
# Both frames have a 'Year' column, so the merge yields Year_x (min) and
# Year_y (max)
players_year = pd.merge(players_yMin, players_yMax, left_index=True,
                        right_index=True)
# NOTE: period is max - min. The original author flagged that an inclusive
# count would be max - min + 1; left unchanged here so the thresholds below
# keep their current meaning.
players_year = players_year.assign(
    period=players_year.Year_y - players_year.Year_x)
# Players whose period is above 20 or below 3 are excluded
players_year = players_year[(players_year.period > 20) |
                            (players_year.period < 3)]
dataA = dataA[~dataA.Player.isin(players_year.index)]

# Merge height and weight onto the stats rows (inner join on player name).
# The column selection must be a list: indexing with a set raises TypeError
# on pandas >= 2.0 and, on older versions, yields a hash-dependent column
# order that makes position-based column selection downstream unreliable.
base = pd.merge(dataA, dataB[['name', 'height', 'weight']], left_on='Player',
                right_on='name')

In [72]:
# [5]

# One row per (team, season, player) holding that player's last Age and G
data5 = base.groupby(['Tm', 'Year', 'Player'])[['Age', 'G']].last()

# Team/season with the highest mean player age: sort descending, take the top
meanAge = data5.groupby(['Tm', 'Year'])[['Age']].mean()
meanAge = meanAge.sort_values('Age', ascending=False)
team, year = meanAge.index[0]

# Within that team and season, pick the oldest player; ties on Age are
# broken by the larger G
roster = data5.loc[(team, year)]
player = roster.sort_values(['Age', 'G'], ascending=False).index[0]

print(f'Answer [5] : {player}')

Answer [5] : Tim Duncan


In [73]:
# [6] ttest_ind

from scipy.stats import ttest_ind
import math

# Per player: number of distinct positions and the last recorded height
data6 = base.groupby('Player').agg({'Pos': 'nunique', 'height': 'last'})

# Two-sample t-test without the equal-variance assumption (Welch's test):
# heights of single-position players vs. players with several positions
single_pos_height = data6.loc[data6.Pos == 1, 'height']
multi_pos_height = data6.loc[data6.Pos > 1, 'height']
t_val, p_val = ttest_ind(single_pos_height, multi_pos_height,
                         equal_var=False)

# The p-value is truncated (floored), not rounded, to four decimals
p_trunc = math.floor(p_val * 10000) / 10000
print(f'Answer [6] : {p_trunc}')

Answer [6] : 0.1676


In [0]:
# [7 ~ 8 data processing]

# Number of distinct teams per player per season, as ordinary columns
team_cnt = (
    base.groupby(['Year', 'Player'])
        .agg({'Tm': 'nunique'})
        .reset_index()
)

# Attach the team count to every stats row. Both frames carry a 'Tm' column,
# so the merge names them Tm_x (team) and Tm_y (team count).
data7 = (
    base.merge(team_cnt, how='inner', on=['Year', 'Player'])
        .drop(columns=['Index', 'name'])
)

# Split rows by whether the player appeared for more than one team that season
mov_season = data7[data7.Tm_y > 1]
prev_season = data7[data7.Tm_y == 1]

In [75]:
# [7]

import math

# Season totals of made (ThreeP) and attempted (ThreePA) three-pointers for
# each player in the multi-team rows
three = mov_season.groupby(['Year', 'Player'])[['ThreeP', 'ThreePA']].sum()

# Keep only seasons with at least 10 attempts
three = three[three.ThreePA >= 10]

# Success rate = made / attempted (attempts are >= 10 here, so never zero)
three = three.assign(SR=three.ThreeP / three.ThreePA)

# Mean success rate, truncated (floored) to four decimals for the answer
meanSR = three.SR.mean()

print(f'Answer [7] : {math.floor(meanSR * 10000) / 10000}')

Answer [7] : 0.3002


In [76]:
# [8] get_dummies, LinearRegression, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math

# Remove the columns that are not used for modelling
dropped_cols = ['Tm_x', 'G', 'TOVP', 'USGP', 'FG', 'FGA', 'FGP', 'STL', 'Tm_y']
data8 = prev_season.drop(columns=dropped_cols)

# One-hot encode Pos; the first category is dropped and acts as the baseline
data8 = pd.get_dummies(data8, columns=['Pos'], drop_first=True)

# calculate success rate
def SR(x, y):
    """Return the success rate x / y, or 0 when y is 0 (avoids dividing by zero)."""
    return x / y if y != 0 else 0

# Row-wise success rates for two-pointers, three-pointers and free throws;
# SR() returns 0 when there were no attempts
data8 = data8.assign(
    TwoSR=data8.apply(lambda row: SR(row.TwoP, row.TwoPA), axis=1),
    ThreeSR=data8.apply(lambda row: SR(row.ThreeP, row.ThreePA), axis=1),
    FTSR=data8.apply(lambda row: SR(row.FT, row.FTA), axis=1),
)

# Select X variables by column position.
# NOTE(review): these slices depend on the exact column order of data8 --
# verify them against data8.columns whenever an upstream step changes.
cols = data8.columns
X_cols = [*cols[2:4], *cols[10:11], *cols[13:]]

# Split by season: up to 2000 for training, 2001 onwards for testing
is_train = data8.Year <= 2000
is_test = data8.Year >= 2001

train_X = data8.loc[is_train, X_cols]
train_y = data8.loc[is_train, 'PTS']
test_X = data8.loc[is_test, X_cols]
test_y = data8.loc[is_test, 'PTS']

# Fit on the training seasons, then predict PTS for the test seasons
lr = LinearRegression().fit(train_X, train_y)
pred = lr.predict(test_X)

# Mean squared error on the test seasons, floored to an integer for the answer
mse = mean_squared_error(test_y, pred)

print(f'Answer [8] : {math.floor(mse)}')

Answer [8] : 38384
