<a href="https://colab.research.google.com/github/nullpitch-dev/DS_L1_Notebooks/blob/master/DS_L1_EX_12_2nd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Location of the baseball CSV (raw file hosted on GitHub).
url = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/baseball.csv'
# Load the whole CSV into `data`; every later cell filters or groups this
# frame (by playerID / yearID / teamID and batting columns such as AB, H, RBI).
data = pd.read_csv(url)

In [89]:
# [1]
# Count players who appeared for exactly two distinct teams across 2015-2016
# and who have a row with AB > 400 in each of those two seasons.

# restrict to the 2015 and 2016 seasons
data1 = data[data.yearID.isin([2015, 2016])]

# number of distinct teams per player; moved == 1 when that number is exactly 2
player = data1.groupby('playerID').teamID.nunique().to_frame()
player = player.assign(moved=(player.teamID == 2).astype('int64'))
player = player[player.moved == 1]

# rows of moved players that also clear the AB > 400 bar
is_moved_player = data1.playerID.isin(player.index)
has_enough_ab = data1.AB > 400
data1 = data1[is_moved_player & has_enough_ab]

# a player qualifies only if both seasons survived the filter above,
# i.e. the number of distinct remaining years is 2
data1 = data1.groupby('playerID').yearID.nunique().to_frame()
data1 = data1[data1.yearID == 2]

# number of qualifying players (data1 stays indexed by playerID for later use)
count = data1.count()

print(f'Answer [1] : {count.iloc[0]}')

Answer [1] : 17


In [90]:
# [2] corr
# Pearson correlation of two 2015 statistics against the 2016/2015 RBI ratio
# for the players selected in [1].

# rows of the selected players (data1 is indexed by playerID), limited to
# the columns used below
data2 = data[data.playerID.isin(data1.index)][['playerID', 'yearID', 'H', 'AB',
                                               'RBI', 'SH', 'SF']]

# put each player's 2015 and 2016 rows side by side (suffixes _15 / _16)
# NOTE(review): data2 is not filtered on AB here; if a player has more than
# one row in a season, the inner merge yields one row per 2015 x 2016
# combination -- confirm the data holds one row per player and season.
data2015 = data2[data2.yearID == 2015]
data2016 = data2[data2.yearID == 2016]
data_con = pd.merge(data2015, data2016, how='inner', on='playerID',
                    suffixes=('_15', '_16'))

# derived columns, computed column-wise instead of with a per-row lambda:
#   HR15 : 2015 hits per at-bat
#   SS15 : 2015 SH + SF
#   RC16 : 2016 RBI relative to 2015 RBI
data_con = data_con.assign(
    HR15=data_con.H_15 / data_con.AB_15,
    SS15=data_con.SH_15 + data_con.SF_15,
    RC16=data_con.RBI_16 / data_con.RBI_15,
)

# corr (2x2 matrices; the off-diagonal entry is the coefficient reported)
corr1 = data_con[['HR15', 'RC16']].corr(method='pearson')
corr2 = data_con[['SS15', 'RC16']].corr(method='pearson')

print(f'Answer [2] : {corr1.RC16.iloc[0]:.2f}, {corr2.RC16.iloc[0]:.2f}')

Answer [2] : -0.37, 0.42


In [91]:
# [3] ttest_ind
# Welch t-test on per-player mean RBI between players first seen before 2014
# (group A) and the rest (group B).

from scipy.stats import ttest_ind
import math

# per player: first year present in the data, total AB, mean RBI
per_player_stats = {'yearID': 'min', 'AB': 'sum', 'RBI': 'mean'}
data3 = data.groupby('playerID').agg(per_player_stats)

# keep players with at least 200 accumulated AB
data3 = data3[data3.AB >= 200]

# label each player by first year: 'A' before 2014, otherwise 'B'
first_seen_before_2014 = data3.yearID < 2014
data3 = data3.assign(group=first_seen_before_2014.map({True: 'A', False: 'B'}))

# unequal-variance (Welch) t-test between the two groups' mean RBI
rbi_a = data3.loc[data3.group == 'A', 'RBI']
rbi_b = data3.loc[data3.group == 'B', 'RBI']
t_val, p_val = ttest_ind(rbi_a, rbi_b, equal_var=False)

# p-value truncated (floored) to three decimals
print(f'Answer [3] : {math.floor(p_val * 1000) / 1000}')

Answer [3] : 0.328


In [92]:
# [4] apriori, association_rules
# Treat each player's set of teams as one transaction and count the
# association rules that reach the confidence threshold.

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# distinct teamIDs per player, stored as a list
data4 = data.groupby('playerID').agg(
    {'teamID': lambda team_ids: list(set(team_ids))})

# keep players who appear with two or more teams
data4 = data4.assign(cnt=data4.teamID.map(len))
data4 = data4[data4.cnt >= 2]

# one transaction (list of teamIDs) per remaining player
teams = data4.teamID.tolist()

# one-hot encode the transactions: one boolean column per teamID
te = TransactionEncoder()
te_array = te.fit(teams).transform(teams)
te_df = pd.DataFrame(te_array, columns=te.columns_)

# frequent itemsets above the minimum support
freq_items = apriori(te_df, min_support=0.0015, use_colnames=True)

# rules with confidence of at least 0.5
asso = association_rules(freq_items, metric='confidence', min_threshold=0.5)

# number of rules found
cnt = len(asso)

print(f'Answer [4] : {cnt}')

Answer [4] : 65


In [93]:
# [5] rank, LogisticRegression
# Predict from one season's statistics whether a player is in the top 30 by
# batting average the following season; train on 2014 rows, evaluate on 2015
# rows, and report the number of true positives.

from sklearn.linear_model import LogisticRegression

# predicted probability at or above which a player is classified as positive
THRESHOLD = 0.18

# filter by AB >= 400 and year >= 2014
data5 = data[(data.AB >= 400) & (data.yearID >= 2014)]

# batting average and its rank within each season (best average = rank 1)
# tie handling of rank(method=...) for one 1st, two tied 2nds, one 3rd:
#   dense : 1, 2, 2, 3
#   min   : 1, 2, 2, 4
#   max   : 1, 3, 3, 4
data5 = data5.assign(hitRate=data5.H / data5.AB)
data5 = data5.assign(hitRank=data5.groupby('yearID').hitRate.rank(
    method='dense', ascending=False))
data5 = data5.assign(top30=(data5.hitRank <= 30).astype(int))

# two calculated feature columns
#   GC : HBP + BB
#   JT : (H + X2B + 2 * X3B + 3 * HR) / AB
data5 = data5.assign(
    GC=data5.HBP + data5.BB,
    JT=(data5.H + data5.X2B + data5.X3B * 2 + data5.HR * 3) / data5.AB,
)

# top30 flag keyed by (playerID, yearID); built once so that looking up the
# following season does not rescan the whole frame for every row.  Only the
# first row per key is kept, matching a "first match wins" lookup.
top30_by_season = (data5.drop_duplicates(subset=['playerID', 'yearID'])
                        .set_index(['playerID', 'yearID'])
                        .top30)


def getNextTop30(player, year):
    """Return the top30 flag of `player` in season `year + 1`.

    Parameters
    ----------
    player : playerID value to look up.
    year : season of the current row; the following season is queried.

    Returns
    -------
    The 0/1 flag from data5 for (player, year + 1), or None when data5 has
    no row for that player in the following season.
    """
    return top30_by_season.get((player, year + 1))


# next season's top30 flag for every row at once (the vectorized form of
# getNextTop30); rows without a following season get NaN and are removed by
# the dropna() calls below
next_season_keys = pd.MultiIndex.from_arrays([data5.playerID,
                                              data5.yearID + 1])
data5 = data5.assign(
    target=top30_by_season.reindex(next_season_keys).to_numpy())

# prepare training and test set
# NOTE(review): dropna() without `subset` drops a row when ANY column is NaN,
# not only the target or the feature columns -- confirm that is intended.
X_cols = ['RBI', 'GC', 'JT', 'SO']
train = data5[data5.yearID == 2014].dropna()
train_X = train[X_cols]
train_y = train.target
test = data5[data5.yearID == 2015].dropna()
test_X = test[X_cols]
test_y = test.target

# train
lr = LogisticRegression(C=100000, random_state=1234, penalty='l2',
                        solver='newton-cg')
model = lr.fit(train_X, train_y)

# predict: probability columns follow the fitted class order (0 then 1)
pred = model.predict_proba(test_X)
pred_df = pd.DataFrame(pred, columns=['neg', 'pos'], index=test.index)

# classify with the cut-off and attach the true label; both share test's
# index, so rows are matched by label rather than by position
pred_df = pred_df.assign(esti=(pred_df.pos >= THRESHOLD).astype(int),
                         target=test_y)

# true positive: predicted 1 and actually 1
pred_df = pred_df.assign(
    TP=((pred_df.esti == 1) & (pred_df.target == 1)).astype(int))

print(f'Answer [5] : {pred_df.TP.sum()}')

Answer [5] : 21
