In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [76]:

url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"


columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']
df = pd.read_csv(url, sep='\s+', names=columns, na_values='?')
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [80]:
# 결측치 제거
data=df.dropna()

In [81]:
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [83]:
data = pd.get_dummies(data, columns=['car name'])
data

KeyError: "None of [Index(['car name'], dtype='object')] are in the [columns]"

In [84]:
X=data.drop('mpg',axis=1)
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name_amc ambassador brougham,car name_amc ambassador dpl,car name_amc ambassador sst,...,car name_volvo 145e (sw),car name_volvo 244dl,car name_volvo 245,car name_volvo 264gl,car name_volvo diesel,car name_vw dasher (diesel),car name_vw pickup,car name_vw rabbit,car name_vw rabbit c (diesel),car name_vw rabbit custom
0,8,307.0,130.0,3504.0,12.0,70,1,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,8,350.0,165.0,3693.0,11.5,70,1,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,8,318.0,150.0,3436.0,11.0,70,1,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,8,304.0,150.0,3433.0,12.0,70,1,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,8,302.0,140.0,3449.0,10.5,70,1,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790.0,15.6,82,1,False,False,False,...,False,False,False,False,False,False,False,False,False,False
394,4,97.0,52.0,2130.0,24.6,82,2,False,False,False,...,False,False,False,False,False,False,True,False,False,False
395,4,135.0,84.0,2295.0,11.6,82,1,False,False,False,...,False,False,False,False,False,False,False,False,False,False
396,4,120.0,79.0,2625.0,18.6,82,1,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [85]:
X.dtypes

cylinders                          int64
displacement                     float64
horsepower                       float64
weight                           float64
acceleration                     float64
                                  ...   
car name_vw dasher (diesel)         bool
car name_vw pickup                  bool
car name_vw rabbit                  bool
car name_vw rabbit c (diesel)       bool
car name_vw rabbit custom           bool
Length: 308, dtype: object

In [88]:
y=data['mpg']
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 392, dtype: float64

In [90]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(X,
                                                  y,
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  random_state=12)
from sklearn.preprocessing import StandardScaler

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [94]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(X_train_scaled, y_train)
ly_preds = model.predict(X_test_scaled)

def mse(actual, predicted):
    sum_square_error = sum((a - p) ** 2 for a, p in zip(actual, predicted))
    mean_square_error = sum_square_error / len(actual)
    return mean_square_error

print('평균 제곱 오차', mse(ly_preds, y_test))

평균 제곱 오차 7.531539103616018e+27


In [95]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()
model.fit(X_train_scaled, y_train)

from sklearn.metrics import mean_squared_error

dy_preds = model.predict(X_test_scaled)
print('평균 제곱 오차', mean_squared_error(dy_preds, y_test))

평균 제곱 오차 14.800886075949366


In [96]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train_scaled, y_train)

from sklearn.metrics import mean_squared_error

ry_preds = model.predict(X_test_scaled)
print('평균 제곱 오차', mean_squared_error(ry_preds, y_test))

평균 제곱 오차 12.028137278481015
