In [None]:
# 광고 데이터에서 Sales를 가장 잘 예측하는 feature를 확인

In [1]:
import numpy as np
import pandas as pd

In [3]:
# 1. Load the data
# Read the advertising CSV from the notebook-relative ./data directory.
df = pd.read_csv("./data/advertising.csv")
# Bare expression: show the loaded frame as the cell output.
df

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


In [4]:
# 2. Preprocessing: split the frame into features and target.
# Columns are selected by name rather than by position, so the split does not
# silently change if the CSV gains a column or its column order changes.
features = df.drop(columns="sales")  # every column except the target
target = df["sales"]  # target value = sales

In [None]:
# 3. 모델 학습

In [5]:
from sklearn.linear_model import LinearRegression

In [30]:
# Show the column labels of the feature frame.
features.columns

Index(['TV', 'radio', 'newspaper'], dtype='object')

In [31]:
# Preview the first rows of the feature frame.
features.head()

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [32]:
# Show the target Series (the sales column).
target

0      22.1
1      10.4
2       9.3
3      18.5
4      12.9
       ... 
195     7.6
196     9.7
197    12.8
198    25.5
199    13.4
Name: sales, Length: 200, dtype: float64

In [14]:
# One single-feature regression per advertising channel, written out by hand.
# LinearRegression.fit returns the fitted estimator, so creation and fitting
# can be chained into one statement per model.
model_1 = LinearRegression().fit(features[["TV"]], target)
model_2 = LinearRegression().fit(features[["radio"]], target)
model_3 = LinearRegression().fit(features[["newspaper"]], target)

In [15]:
# Same three single-feature models, built in a loop and keyed by column name.
models = dict()
for column in features:  # iterating a DataFrame yields its column labels
    models[column] = LinearRegression()
    # Double brackets keep the input two-dimensional (a one-column DataFrame).
    models[column].fit(features[[column]], target)

In [16]:
# List comprehension with a filter: squares of the odd numbers below 10.
# Shown as a reference for the dict comprehension that builds the models.
datas = [number * number for number in range(10) if number % 2 != 0]
datas

[1, 9, 25, 49, 81]

In [9]:
# Dict comprehension: map each feature name to a LinearRegression fitted on
# that single column against the target.
models = {
    name: LinearRegression().fit(features[[name]], target)
    for name in features
}

In [33]:
# Show the dict of fitted single-feature models.
models

{'TV': LinearRegression(),
 'radio': LinearRegression(),
 'newspaper': LinearRegression()}

In [None]:
# 4. 모델 성능 평가 : MAE

In [10]:
from sklearn.metrics import mean_absolute_error

In [11]:
# Score each single-feature model on the same rows it was fitted on and print
# its mean absolute error (MAE), rounded to two decimals.
for column in features.columns:
    pred = models[column].predict(features[[column]])
    # scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the
    # value is unchanged, but the documented order stays correct if this is
    # later swapped for an asymmetric metric such as r2_score.
    mae = mean_absolute_error(target, pred)
    print(column, np.round(mae, 2))

# Focus on the overall trend across the features rather than the exact values.

TV 2.55
radio 3.32
newspaper 4.15


In [None]:
# 5. 각 feature의 결정 계수 출력, 모델의 성능과 비교

In [13]:
# Pairwise correlation between all columns (pandas default method: Pearson).
df.corr()

Unnamed: 0,TV,radio,newspaper,sales
TV,1.0,0.054809,0.056648,0.782224
radio,0.054809,1.0,0.354104,0.576223
newspaper,0.056648,0.354104,1.0,0.228299
sales,0.782224,0.576223,0.228299,1.0


In [12]:
# Element-wise square of the correlation matrix. For a one-feature
# least-squares line with an intercept, the squared Pearson correlation with
# sales equals that model's coefficient of determination (R^2).
df.corr() ** 2

Unnamed: 0,TV,radio,newspaper,sales
TV,1.0,0.003004,0.003209,0.611875
radio,0.003004,1.0,0.125389,0.332032
newspaper,0.003209,0.125389,1.0,0.05212
sales,0.611875,0.332032,0.05212,1.0


In [19]:
# Fit a single LinearRegression on all three features at once.
from sklearn.linear_model import LinearRegression
# fit() returns the fitted estimator; the model is trained on the full data set.
model_all = LinearRegression().fit(features, target)

In [35]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [38]:
# Predict once on the same rows the model was fitted on (training data, not a
# held-out test set) and reuse the predictions for both metrics.
pred = model_all.predict(features)

# scikit-learn metrics take (y_true, y_pred).
mae = mean_absolute_error(target, pred)
mse = mean_squared_error(target, pred)

print('mean_absolute_error =',np.round(mae,2))
print('mean_squared_error =',np.round(mse,2))

# The data is deliberately not split into train and test sets: assuming the
# model will not be applied to new data, fit and predict run on the same rows
# just to look at the overall trend.

mean_absolute_error = 1.25
mean_squared_error = 2.78
