# Linear Regression 線性回歸
ref: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
- 糖尿病資料集 來源： (點開User Guide 有欄位介紹)
- https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html
1. Samples total：442  (樣本數442)               
2. Dimensionality：10  (維度10)
3. Features：real, -.2 < x < .2  (數值已經過正規化 介於-0.2~0.2間)
4. Targets：integer 25 - 346    (目標：找到介於25~346之間的整數)

In [12]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the diabetes dataset that ships with scikit-learn.
diabetes = datasets.load_diabetes()

# Features come from diabetes.data, the regression target from diabetes.target.
diabetes_X, diabetes_y = diabetes.data, diabetes.target
# print(diabetes_X.shape)
# print(diabetes_y)

# Optional single-feature variant: keep one column so the fit can be drawn in
# 2-D (the full data has 10 dimensions). Pairs with the plotting code at the
# bottom of this cell.
# NOTE(review): the original note calls this the BMI column, but per the
# load_diabetes feature order BMI appears to be index 2, not 3 -- confirm.
# diabetes_X = diabetes.data[:, None, 3]  # None inserts an axis; 3 is the column index
# print(diabetes_X.shape)

# Hold out 10% of the samples for testing; the seed keeps the split repeatable.
(
    diabetes_X_train,
    diabetes_X_test,
    diabetes_y_train,
    diabetes_y_test,
) = train_test_split(diabetes_X, diabetes_y, test_size=0.1, random_state=1)

# No scaling step here: the dataset is documented as already normalized.

# Create the estimator and fit it on the training split (fit() returns the
# estimator itself, so the two steps can be chained).
model = linear_model.LinearRegression().fit(diabetes_X_train, diabetes_y_train)

# Predict the held-out samples with the fitted model.
diabetes_y_pred = model.predict(diabetes_X_test)

# Fitted feature weights, one per input column.
print('Coefficients:', model.coef_)

# Mean squared error on the test split (lower is better).
print("Mean squared error:", mean_squared_error(diabetes_y_test, diabetes_y_pred))

# R^2 on the test split (closer to 1 is better).
print('Variance score:', r2_score(diabetes_y_test, diabetes_y_pred))

# Plotting (only meaningful with the single-feature variant above).
# plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')  # scatter points
# plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)  # fitted line

# plt.xticks(())
# plt.yticks(())
# plt.show()

Coefficients: [ -26.2759041  -231.92697391  566.49859849  303.08631061 -709.86807128
  428.67719627   82.12319677  165.60628133  729.33691844   42.58281733]
Mean squared error: 3319.6925871395583
Variance score: 0.31749491764735915


### 線性回歸 練習
- 房價資料集 來源： (共14個欄位,506筆資料)
- https://www.kaggle.com/c/boston-housing
1. 不是正規的csv檔案
2. 最後一欄是 房價中位數
3. 用前面所有欄位去預測房價中位數，再與最後一欄比對

In [13]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# The housing file is whitespace-delimited and has no header row.
# sep=r'\s+' is the documented replacement for delim_whitespace=True, which is
# deprecated since pandas 2.2.
df = pd.read_csv('./dataset/housing.csv', header=None, sep=r'\s+')

# X: every column except index 13 (the features).
X = df.drop([13], axis=1)

# y: column 13, the value the predictions are compared against.
y = df[13]

# 90% train / 10% test. A fixed seed makes the split (and therefore the
# printed metrics) reproducible, matching the other cells in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

# z-score normalization: learn mean and standard deviation from the training
# split only.
scaler = preprocessing.StandardScaler().fit(X_train)

# Standardize the training features with those statistics.
X_train = scaler.transform(X_train)

# Build the model.
model = linear_model.LinearRegression()

# Fit the model parameters on the training split.
model.fit(X_train, y_train)

# Key point: the test split is scaled with the TRAINING mean/std, never its own.
X_test = scaler.transform(X_test)

# Predict the held-out samples with the fitted model.
y_pred = model.predict(X_test)


print('Coefficients:',model.coef_)
print("Mean squared error:",mean_squared_error(y_test, y_pred))
print('Variance score:',r2_score(y_test, y_pred))


Coefficients: [-1.0687251   1.16543133  0.38751811  0.78964813 -2.01245202  2.52486535
  0.07708128 -3.00698459  2.58839007 -1.92592811 -2.08542166  0.85453073
 -4.08566478]
Mean squared error: 22.362210932709512
Variance score: 0.6540351800657843


# Polynomial Regression 多項式回歸 (變高維度)
* /ˌpɑ liˈnoʊ mi əl/
* ref: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

In [18]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing

def f(x):
    """Evaluate the cubic 2*x**3 - 2*x at x (a scalar or a NumPy array)."""
    cubic_term = 2*x*x*x
    linear_term = 2*x
    return cubic_term - linear_term


# These names are used below but are not imported by this cell's import block;
# importing them here lets the cell run without relying on an earlier cell.
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Build the data set: 100 evenly spaced points on [0, 10], shuffled in place,
# then reshaped into a single-feature column.
X = np.linspace(0, 10, 100)
np.random.shuffle(X)
X = X.reshape(-1,1)
y = f(X)
print('shape of X is:',X.shape)

# Show only the first 5 samples.
print(X[:5])
print('=============')

#------------- key step ----------------

# degree=2 expands each sample into its powers up to the second degree.
poly = PolynomialFeatures(degree=2).fit(X)
X_poly = poly.transform(X)

# First 5 expanded samples; each row holds the 0th, 1st and 2nd power of x.
print(X_poly[:5])

#---------- from here on identical to plain Linear Regression --------------

X_train,X_test,y_train,y_test=train_test_split(X_poly,y,test_size=0.3,random_state=1)  # random_state is the seed

# Learn the scaling statistics from the training split and apply them to it.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# Scale the test split with the training statistics.
X_test = scaler.transform(X_test)

y_pred = model.predict(X_test)

print('Coefficients:',model.coef_)
print("Mean squared error:",mean_squared_error(y_test, y_pred))
print('Variance score:',r2_score(y_test, y_pred))

shape of X is: (100, 1)
[[2.62626263]
 [6.26262626]
 [9.19191919]
 [5.25252525]
 [6.16161616]]
[[ 1.          2.62626263  6.89725538]
 [ 1.          6.26262626 39.22048771]
 [ 1.          9.19191919 84.49137843]
 [ 1.          5.25252525 27.58902153]
 [ 1.          6.16161616 37.96551372]]
Coefficients: [[   0.         -349.28647657  916.46178128]]
Mean squared error: 1307.6467148653887
Variance score: 0.9951790483339933


### 多項式回歸 練習
- 紅酒資料集 來源： (共12個欄位,1599筆資料；4898筆是同一來源的白酒資料)
- https://archive.ics.uci.edu/ml/datasets/wine+quality
1. 最後一欄 是紅酒的品質分數 0~10
2. 前面都是 紅酒的特性

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
# PolynomialFeatures is used below; importing it here lets this cell run
# without depending on an earlier cell having imported it.
from sklearn.preprocessing import PolynomialFeatures

df = pd.read_csv('./dataset/winequality-red.csv')

# Features: every column except 'quality'; target: the 'quality' column.
X = df.drop('quality',axis=1)
y = df['quality']

# Expand the features with all polynomial terms up to degree 2.
poly = PolynomialFeatures(degree=2).fit(X)
X = poly.transform(X)

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1)  # random_state is the seed

# Learn the scaling statistics from the training split and apply them to it.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# Scale the test split with the training statistics.
X_test = scaler.transform(X_test)

y_pred = model.predict(X_test)

print('Coefficients:',model.coef_)
print("Mean squared error:",mean_squared_error(y_test, y_pred))
print('Variance score:',r2_score(y_test, y_pred))

Coefficients: [-2.20439093e-11 -3.32036477e+01 -3.46762494e+01 -1.98569253e+01
 -1.40652656e+01 -6.85273381e+01 -7.74114997e+01  1.00727869e+02
 -2.50880009e+01 -7.93478035e+01  4.63945174e+01 -6.70038809e+00
 -1.07912073e+00 -4.28103408e-01 -1.80118094e-01 -5.27741073e-01
 -9.30638006e-01 -6.23191815e-01  5.34638897e-01  3.69394013e+01
 -1.63849368e+00  6.25798078e-01  6.86240624e-02 -1.07543978e-01
  3.25708843e-02 -6.49414077e-02  2.15989372e-01 -5.66509424e-02
  3.43575308e-01  3.51401508e+01 -1.05497188e+00 -2.55960941e-01
  8.98178254e-01 -3.09937110e-02  9.34787505e-02  8.98840857e-02
  1.48808583e-01 -3.20881226e-02  2.21970228e+01 -3.13608092e+00
 -3.12173913e-01  1.11046257e+00 -1.59627758e-01 -2.03532456e-01
  3.43111110e-02 -6.51049711e-04  1.79109779e+01 -2.69409190e+00
  1.29726359e-02 -3.65247752e-01  6.44755908e-02 -5.20131174e-02
 -1.18159029e-01  6.92374227e+01 -5.56852862e-01  1.09112468e-01
  5.56498217e-01 -1.63840021e-01 -3.70231215e-02  7.79292790e+01
 -5.4702046

# Logistic Regression   羅吉斯回歸
#### (雖然是回歸，但常用在分類)
ref: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

- 鳶尾花資料集 來源： (User Guide 可以看欄位簡介)
- https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html
1. Classes：3  (分3種花)
2. Samples per class：50  (每一種類50筆)
3. Samples total：150 (共150筆)
4. Dimensionality：4 (維度4，每一筆特徵有4個：花萼長寬、花瓣長寬)
5. 目標：用前面4欄的特性，去看應該是屬於哪一類別的花

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model  import LogisticRegression

# Load the iris data: feature matrix and class labels (three classes).
iris = datasets.load_iris()
X, y = iris.data, iris.target

print(X.shape)
print(y)  # three classes, so the labels are 0, 1, 2

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# z-score normalization: learn mean and standard deviation from the training
# split, then standardize the training features with them.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# Key point: the test split is scaled with the TRAINING mean/std.
X_test_nor = scaler.transform(X_test)  # nor = normalization

# Build the classifier and fit it on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out samples, then score the model on them.
y_pred = model.predict(X_test_nor)
accuracy = model.score(X_test_nor, y_test)

print(model.coef_)
print(y_pred)
print(accuracy)

(150, 4)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
[[-1.0169518   0.93320063 -1.71063824 -1.57995617]
 [ 0.64186897 -0.48170874 -0.2674308  -0.708515  ]
 [ 0.37508284 -0.4514919   1.97806903  2.28847117]]
[0 1 1 0 2 1 2 0 0 2 1 0 2 1 1 0 1 1 0 0 1 1 2 0 2 1 0 0 1 2 1 2 1 2 2 0 1
 0 1 2 2 0 1 2 1]
0.9555555555555556


### 羅吉斯回歸 練習
- Pima印第安人資料集 來源： (共9個欄位,768筆資料)
- https://www.kaggle.com/uciml/pima-indians-diabetes-database/data
1. 最後一欄 是否得糖尿病 (是或否 兩個類別 0 or 1)
2. 前面都是 病人的特徵 懷孕情況(pregnant)、體內胰島素含量(insulin)、BMI值(bmi)、年齡(age)...

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model  import LogisticRegression

pima = pd.read_csv('./dataset/pima-indians-diabetes.csv')

# Keep four predictors ('pregnant', 'insulin', 'bmi', 'age') plus the
# 'label' column that is being predicted.
df = pima[['pregnant', 'insulin', 'bmi', 'age', 'label']]

y = df['label']
X = df[['pregnant', 'insulin', 'bmi', 'age']]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Learn the scaling statistics from the training split, then apply them to
# both splits (the test split never contributes to the statistics).
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test_nor = scaler.transform(X_test)  # nor = normalization

# Build the classifier and fit it on the training split.
model = LogisticRegression().fit(X_train, y_train)

y_pred = model.predict(X_test_nor)
accuracy = model.score(X_test_nor, y_test)

print(y_pred)
print(accuracy)
print(model.coef_)
print(model.intercept_)  # intercept term

[0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 1 0 1 1 1 1 0 0
 0 0 0 0 0 1 1 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0
 0 0 0 1 0 0 1 1 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1]
0.7056277056277056
[[0.24527177 0.23789336 0.69148497 0.39595607]]
[-0.78491863]
