In [4]:
#先導入資料處理會用到的模組
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

# 可視化模組
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# 機器學習模組
import sklearn

# 表示到小數第三位
%precision 3

'%.3f'

In [34]:
import requests, zipfile
import io

# Download the automobile data file (imports-85.data) from the UCI archive.
# NOTE(review): the URL is plain HTTP; consider HTTPS if the host supports it — confirm.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as CSV below.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.content

# The file has no header row, so the column labels are supplied explicitly
# while the downloaded bytes are parsed into a DataFrame.
auto = pd.read_csv(
    io.StringIO(res.decode('utf-8')),
    header=None,
    names=['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors',
           'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height',
           'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore',
           'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'],
)

In [35]:
# Report the (rows, columns) shape of the loaded frame
print(f'汽車資料的形式:{auto.shape}')

汽車資料的形式:(205, 26)


In [36]:
# Display the first five rows of the frame
auto.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [37]:
# Narrow the frame to price, engine-size and width, then count how many
# '?' placeholder entries each of those columns contains.
auto = auto.loc[:, ['price', 'engine-size', 'width']]
auto.isin(['?']).sum(axis=0)

price          4
engine-size    0
width          0
dtype: int64

In [38]:
# Convert every '?' placeholder to NaN, then discard any row that has a NaN
auto = auto.replace(to_replace='?', value=np.nan).dropna(axis=0, how='any')
print(f'汽車資料的形式:{auto.shape}')

汽車資料的形式:(201, 3)


In [39]:
# Show each column's dtype before any type conversion is applied
print(f'資料型別的確認（型別轉換前）\n{auto.dtypes}\n')

資料型別的確認（型別轉換前）
price           object
engine-size      int64
width          float64
dtype: object



In [41]:
# Convert the price column to a numeric dtype, then show the resulting dtypes
auto = auto.assign(price=lambda frame: pd.to_numeric(frame['price']))
print(f'資料型別的確認（型別轉換後）\n{auto.dtypes}')

資料型別的確認（型別轉換後）
price            int64
engine-size      int64
width          float64
dtype: object


In [42]:
# Inspect the pairwise correlation between the columns
# ('pearson' is the method pandas uses by default)
auto.corr(method='pearson')

Unnamed: 0,price,engine-size,width
price,1.0,0.872335,0.751265
engine-size,0.872335,1.0,0.729436
width,0.751265,0.729436,1.0


In [31]:
# Helper for splitting the data into training and test sets
from sklearn.model_selection import train_test_split

# Estimator used to build the multiple linear regression model
from sklearn.linear_model import LinearRegression

# price is the target variable; every remaining column is an explanatory variable
X = auto.drop('price', axis=1)
y = auto['price']

# Hold out half of the rows for testing; random_state is pinned to 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Create the multiple linear regression model and fit it on the training half
model = LinearRegression()
model.fit(X_train, y_train)

# Print the coefficient of determination for the training and test halves.
# '.3f' prints three decimals, matching the intercept line below; a bare '3f'
# would only set a minimum field width of 3 and print six decimals.
print('決定係數(train):{:.3f}'.format(model.score(X_train, y_train)))
print('決定係數(test):{:.3f}'.format(model.score(X_test, y_test)))
# Print the regression coefficients (labelled by feature name) and the intercept
print('\n迴歸係數\n{}'.format(pd.Series(model.coef_, index=X.columns)))
print('截距: {:.3f}'.format(model.intercept_))

決定係數(train):0.733358
決定係數(test):0.737069

迴歸係數
horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64
截距: -128409.046


In [45]:
# s1061443_李杰穎
# Restrict the frame to price, engine-size and width, then tally the '?'
# placeholder entries per column.
auto = auto.loc[:, ['price', 'engine-size', 'width']]
auto.isin(['?']).sum(axis='index')

price          0
engine-size    0
width          0
dtype: int64

In [46]:
# s1061443_李杰穎
# Turn '?' placeholders into NaN and remove the rows that contain NaN
auto = (
    auto
    .replace('?', np.nan)
    .dropna()
)
print(f'汽車資料的形式:{auto.shape}')

汽車資料的形式:(201, 3)


In [47]:
# s1061443_李杰穎
# Cast the price column to a numeric dtype and print the dtypes afterwards
auto = auto.assign(price=pd.to_numeric(auto['price'], errors='raise'))
print(f'資料型別的確認（型別轉換後）\n{auto.dtypes}')

資料型別的確認（型別轉換後）
price            int64
engine-size      int64
width          float64
dtype: object


In [48]:
# s1061443_李杰穎
# Helper for splitting the data into training and test sets
from sklearn.model_selection import train_test_split

# Estimator used to build the multiple linear regression model
from sklearn.linear_model import LinearRegression

# price is the target variable; every remaining column is an explanatory variable
X = auto.drop('price', axis=1)
y = auto['price']

# Hold out half of the rows for testing; random_state is pinned to 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Create the multiple linear regression model and fit it on the training half
model = LinearRegression()
model.fit(X_train, y_train)

# Print the coefficient of determination for the training and test halves.
# '.3f' prints three decimals, matching the intercept line below; a bare '3f'
# would only set a minimum field width of 3 and print six decimals.
print('決定係數(train):{:.3f}'.format(model.score(X_train, y_train)))
print('決定係數(test):{:.3f}'.format(model.score(X_test, y_test)))
# Print the regression coefficients (labelled by feature name) and the intercept
print('\n迴歸係數\n{}'.format(pd.Series(model.coef_, index=X.columns)))
print('截距: {:.3f}'.format(model.intercept_))

決定係數(train):0.783189
決定係數(test):0.778292

迴歸係數
engine-size     109.526787
width          1261.735518
dtype: float64
截距: -84060.643
