In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Location of the raw CSV; it is fetched again every time this cell runs.
url_game = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/13.csv'
data = pd.read_csv(filepath_or_buffer=url_game)

### [0]

In [3]:
# Keep only the rows whose 'Year' value is present.
base = data.loc[data['Year'].notna()]

In [4]:
# Order rows so that, within each (Name, Platform) pair, the latest Year comes
# last; drop_duplicates(keep='last') then retains exactly that row.
#
# `ascending` must be a real boolean. The previous version passed the strings
# 'True', which newer pandas versions reject during argument validation, and
# which older versions only accepted because any non-empty string is truthy
# (so even 'False' would have sorted ascending).
base = base.sort_values(by=['Name', 'Platform', 'Year'], ascending=True)
base = base.drop_duplicates(subset=['Name', 'Platform'], keep='last')

In [5]:
# Treat a missing regional sales figure as zero; other columns are untouched.
regional_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
base = base.fillna(value=dict.fromkeys(regional_cols, 0))

In [6]:
# Global_sales is the row-wise total of the four regional sales columns.
regional_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
base = base.assign(Global_sales=base[regional_cols].sum(axis='columns'))

In [7]:
# Total Global_sales per platform, then keep only the platforms whose total
# reaches the 20000 cut-off.
platforms = base.groupby('Platform')[['Global_sales']].sum()
platforms = platforms.loc[platforms['Global_sales'] >= 20000]

# Restrict the working set to rows belonging to the surviving platforms.
base = base.loc[base['Platform'].isin(platforms.index)]

### [1]

In [8]:
# The extremes and the median of Global_sales are the labelled reference rows.
sales = base['Global_sales']
max_sales, min_sales, med_sales = sales.max(), sales.min(), sales.median()


def _label_sales(x):
    """Return 'Type1' for the max/min value, 'Type2' for the median, else 'Not CP'."""
    if x == max_sales or x == min_sales:
        return 'Type1'
    if x == med_sales:
        return 'Type2'
    return 'Not CP'


d1 = base.assign(Type=sales.apply(_label_sales))

In [9]:
from sklearn.neighbors import KNeighborsClassifier

X_var = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

# Rows already labelled Type1/Type2 train the classifier; every 'Not CP' row
# is classified by it.
is_unlabelled = d1['Type'] == 'Not CP'
train = d1.loc[~is_unlabelled]
test = d1.loc[is_unlabelled]

# 1-nearest-neighbour: each test row receives the label of its closest
# training row in the four regional-sales dimensions.
model = KNeighborsClassifier(n_neighbors=1)
model.fit(train[X_var], train['Type'])
pred = pd.DataFrame({'outlier': model.predict(test[X_var])})

In [10]:
# Attach the predictions to the test rows by position: flatten the index so it
# lines up with pred's default 0..n-1 index, overwrite Type, then restore the
# original row labels from the 'index' column that reset_index() created.
test = (
    test.reset_index()
        .assign(Type=pred['outlier'])
        .set_index('index')
)

In [11]:
# Stack the labelled and the classified rows (train first, then test) and keep
# only those whose Type is 'Type2'.
d = pd.concat([train, test])
d = d.loc[d['Type'] == 'Type2']

In [12]:
# Mean Global_sales of the remaining rows, shown with three decimals.
mean_global_sales = d['Global_sales'].mean()
print(f"{mean_global_sales:.3f}")

74.201


### [2]

In [13]:
# One row per game name: the last Genre seen for that name and the number of
# distinct platforms it appears on.
d2 = (
    d.groupby('Name')
     .agg({'Genre': 'last', 'Platform': 'nunique'})
     .reset_index()
)

# Average platform count per genre, ordered from smallest to largest.
d2_genre = d2.groupby('Genre')[['Platform']].mean().sort_values('Platform')

In [14]:
# Read the genres at the two ends of d2_genre: the first index label becomes
# min_genre and the last becomes max_genre.
min_genre, max_genre = d2_genre.index[0], d2_genre.index[-1]

print(f"{max_genre}, {min_genre}")

Action, Puzzle


### [3]

In [15]:
# For every game name, collect the set of platforms it appears on.
by_name = d.groupby('Name')
d3 = by_name.agg({'Platform': lambda platforms: set(platforms)})

In [16]:
# Flag with 'Y' the names whose platform set contains both 'PC' and 'X360'.
required_platforms = {'PC', 'X360'}
d3 = d3.assign(
    check=d3['Platform'].map(
        lambda owned: 'Y' if required_platforms.issubset(owned) else 'N'
    )
)

In [17]:
# Keep only the names flagged 'Y' in the check column.
d3 = d3.loc[d3['check'] == 'Y']

In [18]:
# Build the two samples for the paired test: Global_sales of the same games on
# PC and on X360.
#
# A paired test compares element i of one sample with element i of the other,
# so both series must list the games in the same order. Taking the rows in the
# order they appear in `d` does not guarantee that (a game's PC row and X360
# row can sit at different relative positions), so each series is indexed by
# Name and sorted, and the alignment is verified explicitly.
d3_both = d[d.Name.isin(d3.index)]
d3_pc = (d3_both[d3_both.Platform == 'PC']
         .set_index('Name').Global_sales.sort_index())
d3_x360 = (d3_both[d3_both.Platform == 'X360']
           .set_index('Name').Global_sales.sort_index())

assert d3_pc.index.equals(d3_x360.index), \
    "PC and X360 samples must contain the same games in the same order"

In [19]:
from scipy.stats import ttest_rel

# Paired (related-samples) t-test between the PC and X360 sales series.
paired_result = ttest_rel(d3_pc, d3_x360)
t_val, p_val = paired_result.statistic, paired_result.pvalue

# Only the magnitude of the t statistic is printed.
print(f"{abs(t_val):.3f}")

7.772


### [4]

In [20]:
# One-hot encode Platform (dropping the first level) and add a binary target
# that is 1 for the 'Sports' genre and 0 otherwise.
d4 = pd.get_dummies(d, columns=['Platform'], drop_first=True)
d4 = d4.assign(target=(d4['Genre'] == 'Sports').astype('int64'))

# XGRP decides the split: 'A' rows are used for fitting, 'B' rows for scoring.
d4_train = d4.loc[d4['XGRP'] == 'A']
d4_test = d4.loc[d4['XGRP'] == 'B']

# Feature columns are picked by position; the trailing -1 excludes the last
# column, which is the 'target' just added.
# NOTE(review): presumably 4:8 are the four regional sales columns and 11:-1
# are the Platform_* dummies -- confirm against d4.columns, since this breaks
# silently if the CSV column order changes.
d4_X = d4.columns[4:8].tolist() + d4.columns[11:-1].tolist()

In [21]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression with a very large C (C is the inverse of
# the regularisation strength), fitted on the training split.
model = LogisticRegression(
    penalty='l2',
    C=100_000,
    solver='newton-cg',
    random_state=1234,
)
model.fit(d4_train[d4_X], d4_train['target'])

# Two probability columns per test row, named 'neg' and 'pos' in the order
# returned by predict_proba.
pred = pd.DataFrame(model.predict_proba(d4_test[d4_X]), columns=['neg', 'pos'])

In [22]:
# Convert the positive-class probability to odds and predict 1 whenever the
# odds reach the 0.12 threshold.
pred = pred.assign(odds=pred['pos'] / (1 - pred['pos']))
pred = pred.assign(esti=(pred['odds'] >= 0.12).astype('int64'))

# Bring in the true labels. reset_index() gives them a 0..n-1 index, which is
# what pred already has, so the index-on-index merge pairs rows by position.
fact = d4_test[['target']].reset_index()
pred = pred.merge(fact, left_index=True, right_index=True)

# 1 where the prediction matches the true label, 0 otherwise.
pred = pred.assign(accu=(pred['esti'] == pred['target']).astype('int64'))

In [23]:
# Share of correct predictions: accu is a 0/1 flag, so its mean is the accuracy.
accuracy = pred['accu'].mean()

In [24]:
# Show the accuracy with three decimals.
print(f"{float(accuracy):.3f}")

0.383
