In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Raw-GitHub locations of the two CSV files that make up the Corolla dataset.
url_corolla_1 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/corolla_1.csv'
url_corolla_2 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/corolla_2.csv'

# Download both files in order: data1 comes from the first URL, data2 from the second.
data1, data2 = (pd.read_csv(url) for url in (url_corolla_1, url_corolla_2))

In [3]:
# Preview the first five rows of the first table.
data1.head(n=5)

Unnamed: 0,ID,Price,Age,KM,HP,CC,Weight
0,1,13500,23,46986,90,2000,1165
1,2,13750,23,72937,90,2000,1165
2,3,13950,24,41711,90,2000,1165
3,4,14950,26,48000,90,2000,1165
4,5,13750,30,38500,90,2000,1170


In [4]:
# Preview the first five rows of the second table.
data2.head(n=5)

Unnamed: 0,ID,FuelType,TransmissionType,Doors
0,43.0,Petrol,Manual,2doors
1,44.0,Diesel,Manual,2doors
2,45.0,Diesel,Manual,5doors
3,46.0,Diesel,Manual,5doors
4,47.0,Diesel,Manual,5doors


### [0] 전처리 (Preprocessing)

In [5]:
# Join the two tables on the shared ID key (pandas' default inner join), then
# discard every row that still contains a missing value in any column.
base = data1.merge(data2, on='ID').dropna()

### [1] Price by transmission type: two-sample t-test

In [6]:
# Build one Price sample per transmission group for the t-test that follows.
is_automatic = base['TransmissionType'] == 'Automatic'
is_manual = base['TransmissionType'] == 'Manual'

d1_auto = base.loc[is_automatic, 'Price']
d1_manual = base.loc[is_manual, 'Price']

In [7]:
from scipy.stats import ttest_ind

# Independent two-sample t-test on automatic vs. manual prices,
# assuming equal population variances (equal_var=True).
t_val, p_val = ttest_ind(d1_auto, d1_manual, equal_var=True)

# Report the p-value truncated (floored, not rounded) to three decimals.
p_val_truncated = math.floor(p_val * 1000) / 1000
print(f"{p_val_truncated}")

0.095


### [2] Price by number of doors: one-way ANOVA

In [8]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# One-way ANOVA of Price with Doors treated as a categorical factor.
model = ols(formula='Price ~ C(Doors)', data=base).fit()
result = anova_lm(model)

# Display the ANOVA table (df, sums of squares, F, p-value).
result

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Doors),3.0,276964100.0,92321380.0,6.652761,0.000189
Residual,991.0,13752260000.0,13877150.0,,


In [9]:
# The first row of the ANOVA table is the C(Doors) term; the second is the residual.
doors_row = result.iloc[0]
f_val, p_val = doors_row['F'], doors_row['PR(>F)']

# F is truncated to two decimals and the p-value to four (floored, not rounded).
print(f"f-value : {math.floor(f_val * 100) / 100}, p-value : {math.floor(p_val * 10000) / 10000}")

f-value : 6.65, p-value : 0.0001


### [3] Multiple linear regression on Price

In [10]:
# Pearson correlation of each numeric predictor (Age onward) with Price.
# The frame is restricted to numeric columns first: on pandas >= 2.0,
# DataFrame.corr no longer silently drops string columns and raises instead.
corr = base.select_dtypes(include='number').corr(method='pearson').loc['Price', 'Age':]

# Positions of the predictors whose absolute correlation with Price is at least 0.3.
idx = [pos for pos, r in enumerate(corr) if abs(r) >= 0.3]

# `idx` holds integer positions while `corr` is indexed by column name, so select
# explicitly by position; plain `corr[idx]` relies on a deprecated positional fallback.
X_var1 = corr.iloc[idx].index

In [11]:
# One-hot encode the last three columns of `base`, dropping the first level of each
# to avoid perfect collinearity with the intercept.
# dtype=int is pinned because pandas >= 2.0 emits bool dummies by default; mixed with
# the numeric columns they turn the design matrix into an object array, which
# statsmodels' OLS refuses to fit.
d3 = pd.get_dummies(base, columns=base.columns[-3:], drop_first=True, dtype=int)

In [12]:
# The dummy columns are exactly those that get_dummies added, i.e. the columns of d3
# that do not exist in the pre-encoding frame. Deriving them this way avoids a
# hard-coded positional offset that breaks if the column count of the input changes.
X_var2 = d3.columns[~d3.columns.isin(base.columns)]

# Final feature list: correlation-selected numeric predictors followed by all dummies.
X_var = list(X_var1) + list(X_var2)

train_X = d3[X_var]
train_y = d3['Price']

In [13]:
from statsmodels.api import add_constant, OLS

# Add the intercept column and fit ordinary least squares of Price on the features.
train_X = add_constant(train_X)
model = OLS(train_y, train_X)
ols_result = model.fit()

p_val = ols_result.pvalues
adj_r2 = ols_result.rsquared_adj

# Count the predictors significant at the 5% level. The intercept is excluded by
# name: slicing off "the first significant entry" would drop a real predictor
# whenever the intercept itself is not significant.
# errors='ignore' covers the case where add_constant skipped adding a 'const' column.
predictor_p_val = p_val.drop('const', errors='ignore')
n_of_effective_var = int((predictor_p_val < 0.05).sum())

# Adjusted R^2 is truncated (floored, not rounded) to three decimals.
print(f"{n_of_effective_var}, {math.floor(adj_r2 * 1000) / 1000:.3f}")

5, 0.860


### [4] Predicting the price of a new car

In [66]:
# Single observation to score with the fitted model. The values are positional and
# must follow the column order of train_X exactly.
# NOTE(review): the leading 1 presumably lines up with the intercept column and the
# trailing 0/1 values with the dummy columns — confirm against train_X.columns.
new_observation = [1, 5, 50000, 143, 1170, 1, 0, 0, 0, 1, 0]
test_X = pd.DataFrame([new_observation], columns=train_X.columns)

In [67]:
# Predicted Price for the single test observation.
pred = ols_result.predict(exog=test_X)

In [69]:
# `pred` holds exactly one prediction. Extract it as a scalar explicitly, because
# math.floor on a one-element Series/array relies on an implicit float() conversion
# that pandas and NumPy have deprecated. `.item()` still raises if there is more
# than one element.
pred_value = np.asarray(pred).item()

# Print the prediction floored to a whole number.
print(f"{math.floor(pred_value)}")

21053
