In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Source locations of the two CSV inputs.
url_1 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/16Part1.csv'
url_2 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/16Part2.csv'

# Load each CSV into its own DataFrame (data_1 from url_1, data_2 from url_2).
data_1, data_2 = (pd.read_csv(source) for source in (url_1, url_2))

### [0]

In [3]:
# Discard rows with a missing Year, then keep only rows whose Year is 1980 or later.
d1 = (
    data_1
    .dropna(subset=['Year'])
    .loc[lambda frame: frame['Year'] >= 1980]
)

In [4]:
# d1 was produced by boolean filtering; assigning columns into such a result
# raises SettingWithCopyWarning on pandas versions without copy-on-write.
# Working on an explicit copy makes the assignment unambiguous.
d1 = d1.copy()

# Replace missing values with 0 in every column from position 6 onward.
stat_cols = d1.columns[6:]
d1[stat_cols] = d1[stat_cols].fillna(value=0)

In [5]:
# Keep only the rows of data_2 that have no missing value in any column.
d2 = data_2.dropna(how='any')

In [6]:
def convert_height(h):
    """Convert a height written as '<feet>-<inches>' into whole centimetres.

    Parameters
    ----------
    h : str
        Height such as '6-2' (feet, a dash, inches).

    Returns
    -------
    int
        feet * 30.5 + inches * 2.54, rounded with the built-in ``round``
        (exact .5 ties go to the nearest even integer).

    Raises
    ------
    ValueError
        If ``h`` does not split on '-' into exactly two integer parts.
    """
    feet_part, inch_part = h.split('-')
    centimetres = int(feet_part) * 30.5 + int(inch_part) * 2.54
    return round(centimetres)

# Add two derived columns:
#   Hcm - height converted by convert_height
#   Wkg - weight divided by 2.2046 and rounded (presumably pounds -> kilograms)
d2 = d2.assign(
    Hcm=d2.height.apply(convert_height),
    Wkg=d2.weight.apply(lambda pounds: round(pounds / 2.2046)),
)

In [7]:
# Count distinct Age values per (Year, Player). Two or more ages under one
# name in the same year are presumably different people sharing that name.
ages_per_name = d1.groupby(['Year', 'Player'])['Age'].nunique()
ambiguous = ages_per_name[ages_per_name >= 2].reset_index()

# Array of the affected player names, each listed once.
same_name = ambiguous['Player'].unique()

In [8]:
# Remove every row whose Player appears in same_name.
is_ambiguous = d1['Player'].isin(same_name)
d1 = d1[~is_ambiguous]

In [9]:
# First and last Year observed for each Player.
by_player = d1.groupby('Player')
min_year = by_player.agg({'Year': 'min'})
max_year = by_player.agg({'Year': 'max'})

# Join on the Player index; the default suffixes give Year_x (min) and Year_y (max).
years = min_year.merge(max_year, how='inner', left_index=True, right_index=True)
years = years.assign(yr_range=years['Year_y'] - years['Year_x'])

# Keep players whose last-minus-first year lies between 3 and 20 inclusive.
span_ok = (years['yr_range'] >= 3) & (years['yr_range'] <= 20)
years = years[span_ok]
d1 = d1[d1['Player'].isin(years.index)]

In [10]:
# Attach Hcm and Wkg to each d1 row by matching d1.Player against d2.name.
# The inner join drops rows without a match on either side.
body_measures = d2[['name', 'Hcm', 'Wkg']]
d = d1.merge(body_measures, how='inner', left_on='Player', right_on='name')

### [5]

In [11]:
# Mean Age for every (Tm, Year) pair, highest mean first.
mean_age = (
    d.groupby(['Tm', 'Year'])
    .agg({'Age': 'mean'})
    .sort_values(by='Age', ascending=False)
)

# The first index entry is the (Tm, Year) pair with the highest mean Age.
h_team, h_year = mean_age.index[0]

In [12]:
# Rows of the selected team and year, ordered by Age and then G, both descending.
in_selected_team = (d['Tm'] == h_team) & (d['Year'] == h_year)
d5 = d[in_selected_team].sort_values(by=['Age', 'G'], ascending=False)

In [13]:
# Print the Player of the first row in d5. The column is selected by label
# instead of by position so the lookup does not depend on column order.
print(f"{d5['Player'].iloc[0]}")

Tim Duncan


### [6]

In [14]:
# Per player: number of distinct Pos values and mean Hcm.
d6 = d.groupby('Player').agg(
    Pos=('Pos', 'nunique'),
    Hcm=('Hcm', 'mean'),
)

In [15]:
from scipy.stats import ttest_ind

# Mean heights of players with two or more distinct Pos values versus
# players with exactly one.
multi_pos_heights = d6.loc[d6['Pos'] >= 2, 'Hcm']
single_pos_heights = d6.loc[d6['Pos'] == 1, 'Hcm']

# equal_var=False: the two groups are not assumed to share a variance.
t_val, p_val = ttest_ind(multi_pos_heights, single_pos_heights, equal_var=False)

In [16]:
# Truncate (not round) the p-value to four decimal places before printing.
p_truncated = math.floor(p_val * 10000) / 10000
print(f"{p_truncated:.4f}")

0.1676


### [7~8 Prep]

In [17]:
# Number of distinct Tm values for each (Player, Year).
tm_cnt = d.groupby(['Player', 'Year']).agg(Tm=('Tm', 'nunique'))

# Player-years with more than one distinct Tm, with Player/Year back as columns.
has_many_teams = tm_cnt['Tm'] > 1
mved = tm_cnt[has_many_teams].reset_index()

In [18]:
# Rows of d whose (Player, Year) appears in mved. Only the key columns of
# mved take part in the join, so d's own Tm column keeps its name.
moved_keys = mved[['Player', 'Year']]
mov = d.merge(moved_keys, how='inner', on=['Player', 'Year'])

# The helper 'name' column brought in from d2 is no longer needed.
mov = mov.drop(columns=['name'])

In [19]:
# Everything in d that is not in mov, matched on the 'Index' column,
# with the helper 'name' column removed.
in_mov = d['Index'].isin(mov['Index'])
prev = d[~in_mov].drop(columns=['name'])

### [7]

In [20]:
# Total ThreeP and ThreePA per (Year, Player) over the rows in mov.
# NOTE(review): if mov also holds a combined season-total row for a player,
# these sums count it twice. The ratio would be unaffected but the
# ThreePA >= 10 cut-off would not - confirm against the data.
d7 = mov.groupby(['Year', 'Player'])[['ThreeP', 'ThreePA']].sum()

# Keep player-years with at least 10 summed attempts, then compute the success ratio.
d7 = d7[d7['ThreePA'] >= 10]
d7 = d7.assign(ThreeS=lambda totals: totals['ThreeP'] / totals['ThreePA'])

# Print the mean ratio truncated (not rounded) to four decimal places.
mean_success = d7['ThreeS'].mean()
print(f"{math.floor(mean_success * 10000) / 10000:.4f}")

0.3002


### [8]

In [21]:
# One-hot encode Pos. drop_first=True omits the first category, so it acts
# as the baseline for the remaining indicator columns.
d8 = pd.get_dummies(data=prev, columns=['Pos'], drop_first=True)

In [22]:
# Success ratios (made / attempted) for TwoP, ThreeP and FT.
# NaN results (e.g. 0 made out of 0 attempted) are replaced with 0.
d8 = d8.assign(
    TwoS=(d8['TwoP'] / d8['TwoPA']).fillna(value=0),
    ThreeS=(d8['ThreeP'] / d8['ThreePA']).fillna(value=0),
    FTS=(d8['FT'] / d8['FTA']).fillna(value=0),
)

In [23]:
# Predictor columns for the regression.
# The position indicators are picked by their 'Pos_' prefix (the name
# get_dummies gives them) rather than by a positional slice, so the
# selection stays correct if columns are added or reordered.
pos_dummies = [col for col in d8.columns if col.startswith('Pos_')]
X_var = pos_dummies + ['Age', 'MP', 'TwoS', 'ThreeS', 'FTS',
                       'AST', 'Hcm', 'Wkg']

In [24]:
from sklearn.linear_model import LinearRegression

# Training set: rows with Year up to and including 2000.
d8_train = d8[d8['Year'] <= 2000]

# Ordinary least-squares fit of PTS on the X_var predictors.
model = LinearRegression()
model.fit(d8_train[X_var], d8_train['PTS'])

In [25]:
# Test set: rows with Year 2001 or later.
d8_test = d8[d8['Year'] >= 2001]

# Predicted PTS for the test rows, stored in a single-column frame 'esti'.
pred = pd.DataFrame({'esti': model.predict(d8_test[X_var])})

In [26]:
from sklearn.metrics import mean_squared_error

# Mean squared error between actual and predicted PTS on the test set.
mse = mean_squared_error(y_true=d8_test['PTS'], y_pred=pred['esti'])

# Print the error with its fractional part dropped.
print(f"{math.floor(mse):.0f}")

38384
