# 모델링 개요


## 0.환경준비

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action='ignore')

## 1.데이터 로딩

In [None]:
# Read the advertising CSV straight from its raw GitHub URL into a DataFrame.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/advertising.csv'
data = pd.read_csv(path)

# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


* TV, Radio, Newspaper 광고비용 지출에 따른 판매액 데이터
* 광고비용을 어떻게 지출하느냐에 따라 매출액은 어떻게 달라지는지

## 2.데이터 전처리

In [None]:
# Column the model has to predict.
target = 'Sales'

# y holds only the target column; x holds every other column as features.
y = data[target]
x = data.drop(columns=[target])

In [None]:
# Preview the feature table (the target column has been removed).
x.head()

In [None]:
# Preview the target values.
y.head()

In [None]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle: without it every run produces a different
# split, so the error metric computed from it cannot be reproduced or compared.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=20
)

## 3.학습

* 필요한 함수들 불러오기

In [None]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error

* 모델 선언하기

In [None]:
# Create an unfitted linear regression model with its default settings.
model = LinearRegression()

* 학습

In [None]:
# Fit the model using only the training split.
model.fit(x_train, y_train)

## 4.예측
* 학습의 결과로 모델이 만들어집니다.
* 모델이 얼마나 정확한지 검증해 봅시다.

In [None]:
# Predict the target for the held-out test features.
pred = model.predict(x_test)

In [None]:
# Display the predicted values.
pred

## 5.평가

In [None]:
# Mean absolute error between the held-out targets and the predictions.
mean_absolute_error(y_test, pred)

# 실습1 : Diamond Price 예측
* 위 코드를 보면서 그대로 따라해보기

## 1.데이터 로딩

In [None]:
# Read the diamonds CSV and keep a random sample of 2000 rows.
# The fixed random_state makes the same rows come back on every run.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/diamonds.csv'
data = pd.read_csv(path).sample(n=2000, random_state=20)
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
20244,1.52,Fair,H,VS2,62.1,66.0,8674,7.33,7.13,4.5
45013,0.54,Premium,G,VS2,62.0,58.0,1637,5.21,5.18,3.22
53351,0.7,Premium,E,VS2,62.0,60.0,2657,5.7,5.59,3.5
47212,0.59,Ideal,E,VS2,62.2,58.0,1838,5.35,5.39,3.34
1460,0.86,Premium,F,SI2,61.8,58.0,2980,6.12,6.15,3.79


## 2.데이터 전처리

* 분할1 : x, y

* 가변수화

In [None]:
# Column the model has to predict.
target = 'price'

# y holds only the target column; x holds every other column as features.
y = data[target]
x = data.drop(columns=[target])

In [None]:
# Preview the feature table (the target column has been removed).
x.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
20244,1.52,Fair,H,VS2,62.1,66.0,7.33,7.13,4.5
45013,0.54,Premium,G,VS2,62.0,58.0,5.21,5.18,3.22
53351,0.7,Premium,E,VS2,62.0,60.0,5.7,5.59,3.5
47212,0.59,Ideal,E,VS2,62.2,58.0,5.35,5.39,3.34
1460,0.86,Premium,F,SI2,61.8,58.0,6.12,6.15,3.79


In [None]:
# Preview the target values.
y.head()

Unnamed: 0,price
20244,8674
45013,1637
53351,2657
47212,1838
1460,2980


In [None]:
# Categorical columns that must become numeric indicator columns before fitting.
col_cat = ['cut','color','clarity']
# One indicator column per category level; drop_first=True removes the first
# level of each column so the remaining indicators are not redundant.
x = pd.get_dummies(x, columns = col_cat, drop_first = True)

In [None]:
# Preview the features after dummy encoding.
x.head()

Unnamed: 0,carat,depth,table,x,y,z,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
20244,1.52,62.1,66.0,7.33,7.13,4.5,False,False,False,False,...,True,False,False,False,False,False,False,True,False,False
45013,0.54,62.0,58.0,5.21,5.18,3.22,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
53351,0.7,62.0,60.0,5.7,5.59,3.5,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
47212,0.59,62.2,58.0,5.35,5.39,3.34,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
1460,0.86,61.8,58.0,6.12,6.15,3.79,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False


In [None]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle: without it every run produces a different
# split, so the error metric computed from it cannot be reproduced or compared.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=20
)

## 3.학습

In [None]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error

In [None]:
# Create an unfitted linear regression model with its default settings.
model = LinearRegression()

In [None]:
# Fit the model using only the training split.
model.fit(x_train, y_train)

## 4.예측


In [None]:
# Predict the target for the held-out test features.
pred = model.predict(x_test)

In [None]:
# Display the predicted values.
pred

array([ 2.42709832e+02,  1.78737881e+03,  6.09707981e+02,  2.21223117e+02,
       -6.03327941e+02,  2.76450443e+03,  2.24346756e+03,  9.94323576e+03,
        5.20350619e+02,  6.23078087e+03,  2.62521647e+03,  5.92636667e+03,
        1.21462853e+03,  4.92720318e+02,  1.44933026e+03,  5.29426324e+03,
        1.23199369e+04,  1.25063081e+04,  2.31725065e+03,  2.71741872e+03,
       -6.39187531e+02,  1.42827571e+04,  1.18000410e+04,  2.09094851e+03,
       -3.51271335e+01, -5.22135720e+02,  1.75316896e+03, -3.46416166e+01,
        4.33514997e+03,  1.12372463e+04,  4.37224623e+03,  3.88610462e+03,
        5.21563185e+02,  4.73623815e+03,  5.84948978e+03,  2.78585917e+03,
        7.95812594e+03,  2.42043324e+03,  1.13873045e+03,  6.12121082e+03,
        8.02300816e+03,  1.82953306e+03,  9.23828284e+01,  2.05561009e+03,
        1.79445111e+03,  4.59011760e+03,  7.58232466e+02,  4.71559149e+03,
        5.46373502e+03,  1.43158104e+04,  1.77720028e+03,  6.40162002e+03,
        1.09207304e+03,  

## 5.평가

In [None]:
# Mean absolute error between the held-out targets and the predictions.
mean_absolute_error(y_test, pred)

745.8526652014851

# 실습2 : 보스턴시 타운별 집값 예측
* 위 코드를 보면서 그대로 따라해보기

## 1.데이터 로딩

In [None]:
# Read the Boston housing CSV from its raw GitHub URL.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/boston.csv'
# Only these columns are loaded; 'medv' is used as the target further down.
cols = ['crim','indus','chas','nox','rm', 'dis', 'tax','ptratio','lstat','medv']
data = pd.read_csv(path, usecols = cols)
# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,crim,indus,chas,nox,rm,dis,tax,ptratio,lstat,medv
0,0.00632,2.31,0,0.538,6.575,4.09,296,15.3,4.98,24.0
1,0.02731,7.07,0,0.469,6.421,4.9671,242,17.8,9.14,21.6
2,0.02729,7.07,0,0.469,7.185,4.9671,242,17.8,4.03,34.7
3,0.03237,2.18,0,0.458,6.998,6.0622,222,18.7,2.94,33.4
4,0.06905,2.18,0,0.458,7.147,6.0622,222,18.7,5.33,36.2


## 2.데이터 전처리

In [None]:
# Column the model has to predict.
target = 'medv'

# y holds only the target column; x holds every other column as features.
y = data[target]
x = data.drop(columns=[target])

In [None]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle: without it every run produces a different
# split, so the error metric computed from it cannot be reproduced or compared.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=20
)

## 3.학습

In [None]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error

In [None]:
# Create an unfitted linear regression model with its default settings.
model = LinearRegression()

In [None]:
# Fit the model using only the training split.
model.fit(x_train, y_train)

## 4.예측


In [None]:
# Predict the target for the held-out test features.
pred = model.predict(x_test)

## 5.평가

In [None]:
# Mean absolute error between the held-out targets and the predictions.
mean_absolute_error(y_test, pred)

3.7513758831716473