<a href="https://colab.research.google.com/github/nullpitch-dev/DS_L1_Notebooks/blob/master/DS_L1_EX_13_2nd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Location of the exercise-13 CSV; it is read directly over HTTP.
url = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/13.csv'
data = pd.read_csv(filepath_or_buffer=url)

In [0]:
# [0] Preprocessing: dropna, sort_values + drop_duplicates, fillna

# Regional sales columns (also referenced by later cells).
targetCols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

# Discard rows without a Year, then keep one row per (Name, Platform):
# sorting Year ascending and keeping the last duplicate retains the latest year.
base = (
    data
    .dropna(subset=['Year'])
    .sort_values(by=['Name', 'Platform', 'Year'], ascending=[True, True, True])
    .drop_duplicates(subset=['Name', 'Platform'], keep='last')
)

# Missing regional sales are treated as 0.
base = base.assign(**{col: base[col].fillna(0) for col in targetCols})

# Global_sales is the row-wise total of the regional columns.
base = base.assign(Global_sales=base[targetCols].sum(axis=1))

# Keep only rows whose Platform has a summed Global_sales of at least 20000.
platforms = base.groupby('Platform')[['Global_sales']].sum()
platforms = platforms.loc[platforms['Global_sales'] >= 20000]
base = base.loc[base['Platform'].isin(platforms.index)]

In [83]:
# [1] KNN, set_index

from sklearn.neighbors import KNeighborsClassifier

# Reference values of Global_sales that define the check points
global_sales = base['Global_sales']
maxG, minG, medG = global_sales.max(), global_sales.min(), global_sales.median()

def check(x):
    """Classify a Global_sales value as a check point.

    Compares ``x`` by exact equality against the module-level values
    ``maxG``, ``minG`` and ``medG``.

    Returns:
        'Type1' if ``x`` equals the maximum or the minimum,
        'Type2' if ``x`` equals the median,
        'Not Check Point' otherwise.
    """
    # Plain booleans: use logical `or`, not bitwise `|`.
    if x == maxG or x == minG:
        return 'Type1'
    if x == medG:
        # Label casing made consistent with 'Type1'.
        return 'Type2'
    return 'Not Check Point'

# Tag every row with its check-point type (see check()).
base = base.assign(check=base.Global_sales.apply(check))

# perform KNN: train on the check-point rows only, predict for all rows
check_points = base[base.check != 'Not Check Point']
train_X = check_points[targetCols]
train_y = check_points.check.to_numpy()
knn = KNeighborsClassifier(n_neighbors=1).fit(train_X, train_y)

# find closest point's Check Point Type
test_X = base[targetCols]
# Build the prediction table on base's own index so it lines up row-for-row
# without resetting the index. `base` is left unmodified, which keeps this
# cell safe to re-run.
types = pd.DataFrame({'outlier': knn.predict(test_X)}, index=base.index)
base1 = base.join(types)

# remove outliers (rows whose nearest check point is the max/min type)
base1 = base1[base1.outlier != 'Type1']

# calculate mean of Global_sales
meanG = base1.Global_sales.mean()

print(f'Answer [1] : {meanG:.3f}')

Answer [1] : 74.201


In [84]:
# [2] Genres with the largest / smallest average platform count per game

# Per game (Name): number of distinct platforms and its Genre ('last'
# aggregation); then the mean platform count within each Genre.
data2 = (
    base1
    .groupby('Name')
    .agg(Platform=('Platform', 'nunique'), Genre=('Genre', 'last'))
    .groupby('Genre')[['Platform']]
    .mean()
)

# First Genre (in index order) that attains the max / min average
avg_platforms = data2['Platform']
maxGenre = data2.index[avg_platforms.eq(avg_platforms.max())][0]
minGenre = data2.index[avg_platforms.eq(avg_platforms.min())][0]

print(f'Answer [2] : {maxGenre}, {minGenre}')

Answer [2] : Action, Puzzle


In [85]:
# [3] ttest_rel

from scipy.stats import ttest_rel

# select games supporting both of PC and X360
games = base1.groupby('Name').agg({'Platform': lambda x: list(set(x))})
games = games.assign(contain=games.Platform.apply(
    lambda x: 1 if 'PC' in x and 'X360' in x else 0))
games = games[games.contain == 1]

# filter data by Name in games
data3 = base1[base1.Name.isin(games.index)]

# Pair the samples explicitly: one row per game, one column per platform.
# ttest_rel compares element i of one sample with element i of the other, so
# the pairing must be by game and not by whatever row order the filters leave.
# pivot raises if a (Name, Platform) pair occurs more than once.
paired = (
    data3[data3.Platform.isin(['PC', 'X360'])]
    .pivot(index='Name', columns='Platform', values='Global_sales')
)

# perform paired t-test (ttest_rel) on PC vs X360 Global_sales
t_val, p_val = ttest_rel(paired['PC'], paired['X360'])

print(f'Answer [3] : {abs(t_val):.3f}')

Answer [3] : 7.772


In [86]:
# [4] LogisticRegression, get_dummies

from sklearn.linear_model import LogisticRegression

# create dummy variables for Platform (first level dropped)
data4 = pd.get_dummies(base1, columns=['Platform'], drop_first=True)

# create y value: 1 when Genre is 'Sports', else 0
data4 = data4.assign(sports=(data4.Genre == 'Sports').astype(int))

# set X variables by position: columns 4..7 plus columns 12 up to (but not
# including) the last column, which is the `sports` target added above.
# NOTE(review): presumably the four regional sales columns and the Platform
# dummies -- confirm against the CSV column order.
X_cols = list(data4.columns[4:8]) + list(data4.columns[12:-1])

# set train and test sets (XGRP 'A' = train, 'B' = test)
is_train = data4.XGRP == 'A'
is_test = data4.XGRP == 'B'
train_X = data4.loc[is_train, X_cols]
train_y = data4.loc[is_train, 'sports']
test_X = data4.loc[is_test, X_cols]
test_y = data4.loc[is_test, 'sports']

# train
lr = LogisticRegression(C=100000, random_state=1234, penalty='l2',
                        solver='newton-cg').fit(train_X, train_y)

# predict; the two probability columns are named neg / pos
pred = lr.predict_proba(test_X)
pred_df = pd.DataFrame(pred, columns=['neg', 'pos']) # to DataFrame

# calculate odds = p / (1 - p), vectorized. With NumPy division a probability
# of exactly 1 gives inf (still above the threshold) instead of failing inside
# a per-element Python lambda.
pred_df = pred_df.assign(odds=pred_df.pos / (1 - pred_df.pos))

# estimate: positive when odds >= 0.12
pred_df = pred_df.assign(esti=(pred_df.odds >= 0.12).astype(int))

# attach the actual labels; pred_df rows are in the same order as test_X,
# so the labels are added positionally
pred_df = pred_df.assign(sports=test_y.to_numpy())

# calculate accuracy as the share of rows where estimate equals the label
pred_df = pred_df.assign(true=(pred_df.esti == pred_df.sports).astype(int))
accuracy = pred_df.true.mean()

print(f'Answer [4] : {accuracy:.3f}')

Answer [4] : 0.383
