In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Remote CSV that every later section of the notebook reads from.
url_baseball = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/baseball.csv'
data = pd.read_csv(filepath_or_buffer=url_baseball)

### [1]

In [3]:
# Restrict the working frame to rows whose yearID is 2015 or 2016.
d1 = data.loc[data['yearID'].isin([2015, 2016])]

In [4]:
# Count distinct teamID values per player and keep players with more than one.
players = (
    d1.groupby('playerID')
    .agg({'teamID': 'nunique'})
    .query('teamID > 1')
)

# The surviving index holds the playerIDs seen with 2+ teams in d1.
mvd_player = players.index

d1 = d1.loc[d1['playerID'].isin(mvd_player)]

In [5]:
# Method 1: inner-join the playerIDs whose AB exceeds 400 in 2015 with those
# whose AB exceeds 400 in 2016.
ab_over_400 = d1['AB'] > 400
d1_15 = d1.loc[(d1['yearID'] == 2015) & ab_over_400, 'playerID']
d1_16 = d1.loc[(d1['yearID'] == 2016) & ab_over_400, 'playerID']
d1_400 = pd.merge(left=d1_15, right=d1_16, how='inner')

In [6]:
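# Method 2 (alternative, left disabled).
# Note: this variant leaves playerID in the index rather than as a column.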
# d1_400 = d1[d1['AB'] > 400]
# d1_400 = d1_400.groupby('playerID').agg({'yearID': 'count'})
# d1_400 = d1_400[d1_400['yearID'] >= 2]

In [7]:
# Answer [1]: number of rows in the method-1 intersection.
n_qualified = len(d1_400)
print(f"{n_qualified}")

17


### [2]

In [8]:
# Keep only the rows of d1 that belong to the players selected in d1_400.
selected_ids = d1_400['playerID']
d2 = d1.loc[d1['playerID'].isin(selected_ids)]

In [9]:
# Split the selected players' rows by season.
d2_15, d2_16 = (d2.loc[d2['yearID'] == season] for season in (2015, 2016))

In [10]:
# Put each player's 2015 and 2016 rows side by side; shared column names get
# the `_15` / `_16` suffixes.
d2 = d2_15.merge(
    d2_16,
    how='inner',
    on='playerID',
    suffixes=('_15', '_16'),
)

In [11]:
# Derived columns, all computed from the merged per-season columns:
#   HR    - 2015 H divided by 2015 AB
#   CHRBI - 2016 RBI divided by 2015 RBI
#   SS    - 2015 SH plus 2015 SF
d2 = d2.assign(
    HR=d2['H_15'] / d2['AB_15'],
    CHRBI=d2['RBI_16'] / d2['RBI_15'],
    SS=d2['SH_15'] + d2['SF_15'],
)

In [12]:
# Pearson correlation of CHRBI with HR and with SS, read from the off-diagonal
# cell of each 2x2 correlation matrix.
ans1 = d2[['HR', 'CHRBI']].corr(method='pearson').loc['HR', 'CHRBI']
ans2 = d2[['SS', 'CHRBI']].corr(method='pearson').loc['SS', 'CHRBI']

In [13]:
# Answer [2]: both correlations rounded to two decimals.
corr_summary = f"{ans1:.2f}, {ans2:.2f}"
print(corr_summary)

-0.37, 0.42


### [3]

In [14]:
# Per player: earliest yearID, total AB and mean RBI over all rows.
d3 = data.groupby('playerID').agg({'yearID': 'min', 'AB': 'sum', 'RBI': 'mean'})
# Keep players with a total AB of at least 200.
d3 = d3.loc[d3['AB'] >= 200]
# Group A: earliest yearID is 2013 or before; group B: everything else.
d3 = d3.assign(GRP=np.where(d3['yearID'] <= 2013, 'A', 'B'))

In [15]:
# Split the per-player summary into its two GRP cohorts.
d3_A, d3_B = (d3.loc[d3['GRP'] == label] for label in ('A', 'B'))

In [16]:
from scipy.stats import ttest_ind

# Two-sample t-test on mean RBI without assuming equal variances.
welch_result = ttest_ind(d3_A['RBI'], d3_B['RBI'], equal_var=False)
t_val, p_val = welch_result

# Answer [3]: truncate (not round) the p-value to three decimals.
p_truncated = math.floor(p_val * 1000) / 1000
print(f"{p_truncated:.3f}")

0.328


### [4]

In [17]:
# Players that appear with at least two distinct teamID values in the full data.
d4_teamcnt = (
    data.groupby('playerID')
    .agg({'teamID': 'nunique'})
    .query('teamID >= 2')
    .reset_index()
)
# All rows of those players.
d4 = data.loc[data['playerID'].isin(d4_teamcnt['playerID'])]

In [18]:
# Player x team matrix: each cell counts the rows for that (playerID, teamID)
# pair, with 0 where the pair never occurs.
pivot = d4.pivot_table(index='playerID', columns='teamID', aggfunc='size', fill_value=0)
# Reduce the counts to membership flags. The result is kept boolean instead of
# being multiplied back to 0/1: mlxtend's apriori treats True/False the same as
# 1/0 but emits a DeprecationWarning (and runs slower) for non-bool frames.
pivot = pivot >= 1

In [19]:
from mlxtend.frequent_patterns import apriori, association_rules

# Frequent team sets, then the rules among them that reach the confidence bar.
MIN_SUPPORT = 0.0015
MIN_CONFIDENCE = 0.5

freq_items = apriori(pivot, min_support=MIN_SUPPORT, use_colnames=True)
ass_rule = association_rules(
    freq_items,
    metric='confidence',
    min_threshold=MIN_CONFIDENCE,
)

In [20]:
# Answer [4]: number of association rules found.
n_rules = ass_rule.shape[0]
print(f"{n_rules}")

65


### [5]

In [21]:
# Rows with AB of at least 400 from yearID 2014 onward.
enough_ab = data['AB'] >= 400
from_2014 = data['yearID'] >= 2014
d5 = data.loc[enough_ab & from_2014]

In [22]:
# hitRate: H / AB.
# Rank:    dense rank of hitRate within each yearID, highest rate first.
# T30:     'Y' when Rank is 30 or better, otherwise 'N'.
# The callables are evaluated in order, so each one sees the columns above it.
d5 = d5.assign(
    hitRate=lambda frame: frame['H'] / frame['AB'],
    Rank=lambda frame: frame.groupby('yearID')['hitRate'].rank(
        method='dense', ascending=False),
    T30=lambda frame: frame['Rank'].map(lambda rank: 'Y' if rank <= 30 else 'N'),
)

In [23]:
# GC: HBP plus BB.
# LH: (H + X2B + 2 * X3B + 3 * HR) / AB, where HR is the data's own HR column.
d5 = d5.assign(
    GC=d5['HBP'] + d5['BB'],
    LH=(d5['H'] + d5['X2B'] + d5['X3B'] * 2 + d5['HR'] * 3) / d5['AB'],
)

In [24]:
def checkNY(player, year, frame=None):
    """Return the T30 flag of ``player`` for the season after ``year``.

    Parameters
    ----------
    player : value compared against the ``playerID`` column.
    year : season of the current row; the lookup targets ``year + 1``.
    frame : optional DataFrame with ``playerID``, ``yearID`` and ``T30``
        columns. When omitted, the notebook-level ``d5`` is used, so
        existing two-argument calls behave exactly as before.

    Returns
    -------
    The ``T30`` value of the first row matching ``player`` and ``year + 1``,
    or ``None`` when no such row exists.
    """
    source = d5 if frame is None else frame
    next_season = source[(source.playerID == player) & (source.yearID == year + 1)]
    if len(next_season) == 0:
        # Explicit rather than falling off the end: the notebook later drops
        # rows whose target is missing.
        return None
    return next_season.T30.iloc[0]
    
# target: the player's T30 flag in the following season (None when absent).
next_year_flag = d5.apply(lambda row: checkNY(row['playerID'], row['yearID']), axis=1)
d5 = d5.assign(target=next_year_flag)

In [25]:
# Keep only rows that have a target, i.e. a matching next-season row.
d5 = d5.loc[d5['target'].notna()]

In [26]:
from sklearn.linear_model import LogisticRegression

# Predictor columns fed to the classifier.
X_var = ['RBI', 'GC', 'LH', 'SO']

# Fit on the 2014 rows, score the 2015 rows.
train = d5.loc[d5['yearID'] == 2014]
test = d5.loc[d5['yearID'] == 2015]

classifier = LogisticRegression(
    penalty='l2',
    C=100000,
    random_state=1234,
    solver='newton-cg',
)
model = classifier.fit(train[X_var], train['target'])

# One probability column per class for every test row.
pred = model.predict_proba(test[X_var])

In [27]:
# Probability cut-off at or above which a row is predicted as 'Y'.
PROBA_THRESHOLD = 0.18

# predict_proba orders its columns like model.classes_, so label them from the
# fitted model instead of assuming position 0 is 'N' and position 1 is 'Y'.
pred = pd.DataFrame(pred, columns=model.classes_)
pred = pred.assign(esti=np.where(pred['Y'] >= PROBA_THRESHOLD, 'Y', 'N'))

# reset_index() gives the true labels the same 0..n-1 index as `pred`
# (the old index is kept as an `index` column) so the two align row by row.
test_y = test.target.reset_index()

pred = pd.merge(pred, test_y, left_index=True, right_index=True)

In [28]:
# Answer [5]: rows where both the true label and the estimate are 'Y'.
both_yes = pred[(pred['target'] == 'Y') & (pred['esti'] == 'Y')]
print(f"{len(both_yes)}")

21
