In [54]:
import pandas as pd
import numpy as np
import math

In [55]:
# Download the population statistics CSV from GitHub into a DataFrame.
url_pop = (
    'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/R_pop_stat.csv'
)
data_pop = pd.read_csv(filepath_or_buffer=url_pop, encoding='utf-8')

### [0]

In [56]:
# Build the working frame: keep rows where AGE, INCOME and XGRP are all
# present, restrict AGE to the closed range [20, 50], then one-hot encode the
# categorical columns (drop_first=True drops each column's first level).
required_cols = ['AGE', 'INCOME', 'XGRP']
categorical_cols = ['SEX', 'SCHL', 'WORK']

base = data_pop.dropna(subset=required_cols, how='any')
base = base[base['AGE'].between(20, 50)]
base = pd.get_dummies(base, columns=categorical_cols, drop_first=True)

### [1]

In [57]:
# Flag INCOME outliers with the 3-sigma rule: values outside
# [mean - 3 * std, mean + 3 * std] get outlier = 1, everything else 0.
mean = base['INCOME'].mean()
std = base['INCOME'].std()  # pandas default: sample std (ddof=1)

ul = mean + 3 * std  # upper limit
ll = mean - 3 * std  # lower limit

# Vectorised comparison instead of a per-row Python lambda; the boolean mask
# is cast to int64 so the column keeps the same 0/1 integer encoding.
is_outlier = (base['INCOME'] > ul) | (base['INCOME'] < ll)
base = base.assign(outlier=is_outlier.astype('int64'))

In [58]:
# Report the spread (max - min) of INCOME among the rows flagged as outliers.
d1 = base.loc[base['outlier'].eq(1)]
outlier_income = d1['INCOME']
print(f"{outlier_income.max() - outlier_income.min()}")

3325000.0


### [2]

In [59]:
# Drop the flagged outliers; only rows with outlier == 0 are kept from here on.
base = base.loc[base['outlier'].eq(0)]

In [60]:
# Keep the rows whose INCOME is at or above the 75th percentile.
income_q3 = base['INCOME'].quantile(0.75)
d2 = base.loc[base['INCOME'].ge(income_q3)]

In [61]:
# Spearman rank correlation between AGE and INCOME within d2.
result = d2[['AGE', 'INCOME']].corr(method='spearman')

# Floor to three decimal places (rather than rounding) before printing.
age_income_corr = result.loc['INCOME', 'AGE']
floored = math.floor(age_income_corr * 1000) / 1000
print(f"{floored:.3f}")

0.115


### [3]

In [93]:
# Label each row 'Y' when INCOME is at least 25000, otherwise 'N'.
# A vectorised comparison plus a dict mapping replaces the per-row lambda.
is_high_income = base['INCOME'].ge(25000)
base = base.assign(high_income=is_high_income.map({True: 'Y', False: 'N'}))

In [94]:
# Attach the WORK column of data_pop to base, aligned on the row index
# (left join, so every row of base is kept).
work_raw = data_pop[['WORK']]
d3 = base.merge(work_raw, how='left', left_index=True, right_index=True)

In [95]:
# Contingency table of row counts: high_income (rows) by WORK (columns);
# combinations that never occur are filled with 0.
pivot = pd.pivot_table(d3, index='high_income', columns='WORK',
                       aggfunc='size', fill_value=0)

In [96]:
from scipy.stats import chi2_contingency

# Chi-square test on the high_income x WORK contingency table; only the
# p-value is printed (5 decimal places).
test_output = chi2_contingency(pivot)
chi2, p_val, dof, expected = test_output
print(f"{p_val:.5f}")

0.00044


P-value가 0.05보다 작으므로 귀무가설을 기각한다.
귀무가설은 '두 변수 간에 관계가 없다'이므로
직업과 소득수준은 관계가 있다고 할 수 있다.

### [4]

In [124]:
# Predictors are picked by column position: the first column of base plus the
# columns at positions 3..11.
# NOTE(review): this depends on the exact column order of base — confirm that
# these positions are the intended feature columns.
X_var = [base.columns[0], *base.columns[3:12]]

# Training rows are those with XGRP <= 7; the target is log10(INCOME).
is_train = base['XGRP'] <= 7
train_X = base.loc[is_train, X_var]
train_y = base.loc[is_train, 'INCOME'].map(math.log10)

In [125]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split; fit() returns the fitted
# estimator, which is kept as `model`.
regressor = LinearRegression()
model = regressor.fit(train_X, train_y)

In [130]:
# One hand-built observation, with values given in the same order as X_var.
new_row = [38, 0, 0, 0, 0, 0, 1, 0, 1, 0]
test_X = pd.DataFrame([new_row], columns=X_var)

# Take the single prediction, raise 10 to it, and round to the nearest 100.
result = model.predict(test_X)[0]
predicted_income = round(10 ** result, -2)
print(f"{predicted_income:.0f}")

47400


### [5]

In [132]:
# Hold-out rows are those with XGRP > 7; the target is log10(INCOME).
is_test = base['XGRP'] > 7
test_X = base.loc[is_test, X_var]
test_y = base.loc[is_test, 'INCOME'].map(math.log10)

In [134]:
# Predict the target for every hold-out row.
result = model.predict(X=test_X)

In [139]:
from sklearn.metrics import mean_squared_error

# Root mean squared error between test_y and the predictions, printed to
# two decimal places.
mse = mean_squared_error(y_true=test_y, y_pred=result)
rmse = np.sqrt(mse)

print(f"{rmse:.2f}")

0.29
