<a href="https://colab.research.google.com/github/nullpitch-dev/DS_L1_Notebooks/blob/master/DS_L1_EX_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Load the baseball dataset from a CSV file hosted on GitHub.
# `data` is the raw frame that every later cell filters from.
url = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/baseball.csv'
data = pd.read_csv(url)

In [0]:
# [1]

# Restrict the raw frame to the two seasons compared in question [1].
data1 = data[data['yearID'].isin([2015, 2016])]

def checkChanged(x, frame=None):
    """Report whether player ``x`` changed teams between 2015 and 2016.

    Parameters
    ----------
    x : value compared against the ``playerID`` column.
    frame : DataFrame, optional
        Rows to search; must have ``playerID``, ``yearID`` and ``teamID``
        columns. Defaults to the module-level ``data1`` so existing
        single-argument calls keep working.

    Returns
    -------
    int
        1 when the player has exactly two rows in ``frame`` -- one for 2015
        and one for 2016 -- and the two ``teamID`` values differ; 0 otherwise.
    """
    source = data1 if frame is None else frame
    player = source[source['playerID'] == x]

    teams_2015 = player.loc[player['yearID'] == 2015, 'teamID']
    teams_2016 = player.loc[player['yearID'] == 2016, 'teamID']

    # A total of two rows is not enough on its own: both rows could belong to
    # the same season, which previously crashed on `.values[0]` of an empty
    # selection. Require exactly one row per season.
    if len(player) != 2 or len(teams_2015) != 1 or len(teams_2016) != 1:
        return 0

    return int(teams_2015.iloc[0] != teams_2016.iloc[0])

# Flag every 2015/2016 row whose player switched teams between the seasons.
changed_flags = data1['playerID'].apply(checkChanged)
data1 = data1.assign(Changed=changed_flags)

# Keep rows of team-changers with AB above 400, then keep only the players
# for whom both season rows survived that filter.
movers = data1[(data1['Changed'] == 1) & (data1['AB'] > 400)]
rows_per_player = movers.groupby('playerID').count()
result1 = rows_per_player[rows_per_player['yearID'] == 2]

print(f'Answer [1] : # of players = {len(result1)}')

Answer [1] : # of players = 17


In [0]:
# [2]

# Rows of the players selected in answer [1].
data2 = data[data['playerID'].isin(result1.index)]

# One frame per season, each ordered by playerID.
player_15, player_16 = (
    data2[data2['yearID'] == season].sort_values('playerID')
    for season in (2015, 2016)
)

# Join the seasons side by side (_x = 2015 columns, _y = 2016 columns) and
# derive the 2015 predictors plus the 2016/2015 RBI ratio.
player2 = player_15.merge(player_16, on='playerID').assign(
    hit_15=lambda m: m['H_x'] / m['AB_x'],
    sac_15=lambda m: m['SH_x'] + m['SF_x'],
    change=lambda m: m['RBI_y'] / m['RBI_x'],
)

hit = player2[['hit_15', 'change']]
sac = player2[['sac_15', 'change']]

# Pearson correlation of each 2015 predictor with the RBI ratio.
corr_hit = hit.corr(method='pearson').loc['hit_15', 'change']
corr_sac = sac.corr(method='pearson').loc['sac_15', 'change']

print(f'Answer [2] : {corr_hit:.2f}, {corr_sac:.2f}')

Answer [2] : -0.37, 0.42


In [0]:
# [3]

from scipy import stats
import math

# Per-player summary: first season seen, total AB, mean RBI.
data3 = data.groupby('playerID').agg(
    yearID=('yearID', 'min'),
    AB=('AB', 'sum'),
    RBI=('RBI', 'mean'),
)

# Only players with at least 200 total AB take part in the comparison.
enough_ab = data3['AB'] >= 200
data3 = data3[enough_ab]

# Split by first season: up to 2013 versus 2014 onwards.
first_by_2013 = data3['yearID'] <= 2013
first_from_2014 = data3['yearID'] >= 2014
a_group = data3[first_by_2013]
b_group = data3[first_from_2014]

# Two-sample t-test on mean RBI without assuming equal variances.
t_value, p_value = stats.ttest_ind(a_group['RBI'], b_group['RBI'], equal_var=False)

# Truncate (not round) the p-value to three decimals for reporting.
print(f'Answer [3] : P-value = {math.floor(p_value * 1000) / 1000}')

Answer [3] : P-value = 0.328


In [0]:
# [4]

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

def func(x):
    """Return the distinct elements of ``x`` as a list.

    Duplicates are removed by passing through a set, so the order of the
    result follows set iteration and is not tied to the input order.
    """
    unique_items = set(x)
    return [*unique_items]

# Distinct teams per player; a player only forms a transaction when the
# list holds two or more teams.
teams_by_player = data.groupby('playerID').agg({'teamID': func})
teams_by_player['counts'] = teams_by_player['teamID'].apply(len)
multi_team = teams_by_player[teams_by_player['counts'] >= 2]
teams = list(multi_team['teamID'])

# Encode the team lists as a boolean player-by-team matrix.
te = TransactionEncoder()
te.fit(teams)
te_array = te.transform(teams)
te_df = pd.DataFrame(te_array, columns=te.columns_)

# Frequent itemsets first, then the rules with confidence of at least 0.5.
freq_items = apriori(te_df, use_colnames=True, min_support=0.0015)
asso_rules = association_rules(freq_items, min_threshold=0.5, metric='confidence')

print(f'Answer [4] : No of association rules = {len(asso_rules)}')

Answer [4] : No of association rules = 65


In [0]:
# [5]

from sklearn.linear_model import LogisticRegression

# Seasons from 2014 onwards with at least 400 AB.
qualified = data[(data['AB'] >= 400) & (data['yearID'] >= 2014)]

# Derived columns, added in the same order as before:
#   hitR  - H / AB
#   rank  - dense rank of hitR within each season, highest hitR first
#   top30 - 'Y' when rank <= 30, otherwise 'N'
#   dr    - HBP + BB
#   long  - (H + X2B + 2*X3B + 3*HR) / AB
data5 = qualified.assign(
    hitR=lambda d: d['H'] / d['AB'],
    rank=lambda d: d.groupby('yearID')['hitR'].rank(method='dense',
                                                    ascending=False),
    top30=lambda d: (d['rank'] <= 30).map({True: 'Y', False: 'N'}),
    dr=lambda d: d['HBP'] + d['BB'],
    long=lambda d: (d['H'] + d['X2B'] + 2 * d['X3B'] + 3 * d['HR']) / d['AB'],
)

def nextTop(player, year):
    """Look up a player's ``top30`` flag for the season after ``year``.

    Reads the module-level ``data5`` frame.

    Returns
    -------
    str
        'NA' when ``year`` is 2016 or ``data5`` holds no row for ``player``
        in ``year + 1``; otherwise 'Y' if the first matching row has
        ``top30 == 'Y'``, else 'N'.
    """
    if year == 2016:
        return 'NA'

    following = data5[(data5['playerID'] == player) & (data5['yearID'] == year + 1)]
    if len(following) == 0:
        return 'NA'

    return 'Y' if following['top30'].values[0] == 'Y' else 'N'

# Label each season row with the player's top-30 status in the following
# season; rows with no following season come back as 'NA' and are dropped.
next_season_flag = data5.apply(
    lambda row: nextTop(row['playerID'], row['yearID']), axis=1)
data5 = data5.assign(target=next_season_flag)
data5 = data5.loc[data5['target'] != 'NA']

# Training set: 2014 season rows and their next-season label.
season_2014 = data5[data5['yearID'] == 2014]
train_x = season_2014[['RBI', 'dr', 'long', 'SO']]
train_y = season_2014[['target']]

# L2 penalty with a very large C (C=100000).
lr = LogisticRegression(C=100000, penalty='l2', solver='newton-cg',
                        random_state=1234)
model = lr.fit(train_x, train_y.squeeze())

# Hold-out set: 2015 season rows and their next-season label.
season_2015 = data5[data5['yearID'] == 2015]
test_x = season_2015[['RBI', 'dr', 'long', 'SO']]
test_y = season_2015[['target']]

pred = model.predict_proba(test_x)

# predict_proba returns one column per class in the order of model.classes_.
# Label the columns from the fitted model instead of hard-coding ['N', 'Y'],
# so the 'Y' column is always the probability of class 'Y'.
pred_df = pd.DataFrame(pred, columns=model.classes_)

# Predict 'Y' whenever the probability of 'Y' reaches the 0.18 cut-off.
pred_df = pred_df.assign(
    Pred=(pred_df['Y'] >= 0.18).map({True: 'Y', False: 'N'}))

# pred_df has a fresh 0..n-1 index; reset test_y's index so the actual
# labels line up with the predictions by position.
test_y = test_y.reset_index()
pred_df = pred_df.assign(Fact=test_y['target'])

# True positives: actual 'Y' that was also predicted 'Y'.
true_pos = pred_df[(pred_df['Fact'] == 'Y') & (pred_df['Pred'] == 'Y')].shape[0]

print(f'Answer [5] : No of True Positive = {true_pos}')

Answer [5] : No of True Positive = 21
