## Import modules

In [1]:
import numpy as np
import pandas as pd
import os, sys

In [37]:
from sklearn import datasets, linear_model, preprocessing, neighbors
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.svm import SVC

# import matplotlib.pyplot as plt

## Load data 

In [26]:
# Read the raw transaction table with pandas' pure-Python parsing engine.
df = pd.read_csv(
    'dataset_drawing.csv',
    engine='python',
)
# Shift the row labels up by one so that the first row is labelled 1.
df.index = df.index + 1

In [28]:
# Keep only the modelling columns (the last one, 價格分類, is the label) and
# take an explicit copy so that new_df is independent of the raw frame `df`.
new_df = df[
    [
        "城市代碼",
        "鄉鎮市區",
        "建物型態",
        "交易年",
        "交易月",
        "建物移轉總面積平方公尺",
        "樓層",
        "建物現況格局-房",
        "建物現況格局-廳",
        "建物現況格局-衛",
        "價格分類",
    ]
].copy()
new_df

Unnamed: 0,城市代碼,鄉鎮市區,建物型態,交易年,交易月,建物移轉總面積平方公尺,樓層,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,價格分類
1,A,文山區,住宅大樓(11層含以上有電梯),105,3,115.48,七層,2,2,1,6
2,A,文山區,套房(1房1廳1衛),105,3,41.01,二層,1,0,1,4
3,A,文山區,住宅大樓(11層含以上有電梯),105,3,219.08,九層,3,2,2,4
4,A,文山區,華廈(10層含以下有電梯),105,4,228.23,四層,4,2,2,5
5,A,萬華區,住宅大樓(11層含以上有電梯),105,3,80.31,九層,2,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...
1691175,Z,南竿鄉,華廈(10層含以下有電梯),109,9,94.60,五層,2,1,2,1
1691176,Z,南竿鄉,華廈(10層含以下有電梯),109,9,84.02,二層,2,1,2,0
1691177,Z,南竿鄉,華廈(10層含以下有電梯),109,9,84.02,三層,2,1,2,1
1691178,Z,南竿鄉,華廈(10層含以下有電梯),109,9,84.02,四層,2,1,2,1


### Feature engineering

In [31]:
# Nested mapping in the {column: {old value: new value}} form that
# DataFrame.replace accepts; it turns the 樓層 (floor) labels into integers.
# The first 21 labels map to 0..20 in order, so they are numbered by position.
# NOTE(review): labels from 十一 upward have no trailing 層 — presumably the
# floor strings are truncated to two characters upstream; confirm.
# NOTE(review): 全 -> 99 looks like a sentinel for "whole building"; confirm.
new_floor = {
    "樓層": {
        **{
            label: level
            for level, label in enumerate(
                [
                    "地下",
                    "一層", "二層", "三層", "四層", "五層",
                    "六層", "七層", "八層", "九層", "十層",
                    "十一", "十二", "十三", "十四", "十五",
                    "十六", "十七", "十八", "十九", "二十",
                ]
            )
        },
        "三十": 30,
        "全": 99,
    }
}

In [32]:
# Replace each distinct label in the nominal columns with an integer code.
# NOTE(review): the codes impose an arbitrary ordering on nominal categories,
# and the model cells standardise them like numeric features — consider
# one-hot encoding instead.
for categorical_col in ("城市代碼", "鄉鎮市區", "建物型態"):
    new_df[categorical_col] = pd.Categorical(new_df[categorical_col]).codes

# Translate the floor labels to integers via the nested new_floor mapping.
new_df = new_df.replace(to_replace=new_floor)
new_df

Unnamed: 0,城市代碼,鄉鎮市區,建物型態,交易年,交易月,建物移轉總面積平方公尺,樓層,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,價格分類
1,0,153,0,105,3,115.48,7,2,2,1,6
2,0,153,3,105,3,41.01,2,1,0,1,4
3,0,153,0,105,3,219.08,9,3,2,2,4
4,0,153,6,105,4,228.23,4,4,2,2,5
5,0,289,0,105,3,80.31,9,2,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...
1691175,21,72,6,109,9,94.60,5,2,1,2,1
1691176,21,72,6,109,9,84.02,2,2,1,2,0
1691177,21,72,6,109,9,84.02,3,2,1,2,1
1691178,21,72,6,109,9,84.02,4,2,1,2,1


## Supervised learning

In [33]:
# List the column names of the encoded frame (features plus the 價格分類 label).
new_df.columns

Index(['城市代碼', '鄉鎮市區', '建物型態', '交易年', '交易月', '建物移轉總面積平方公尺', '樓層', '建物現況格局-房',
       '建物現況格局-廳', '建物現況格局-衛', '價格分類'],
      dtype='object')

In [43]:
# Display the encoded frame once more before it is split into features and label.
new_df

Unnamed: 0,城市代碼,鄉鎮市區,建物型態,交易年,交易月,建物移轉總面積平方公尺,樓層,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,價格分類
1,0,153,0,105,3,115.48,7,2,2,1,6
2,0,153,3,105,3,41.01,2,1,0,1,4
3,0,153,0,105,3,219.08,9,3,2,2,4
4,0,153,6,105,4,228.23,4,4,2,2,5
5,0,289,0,105,3,80.31,9,2,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...
1691175,21,72,6,109,9,94.60,5,2,1,2,1
1691176,21,72,6,109,9,84.02,2,2,1,2,0
1691177,21,72,6,109,9,84.02,3,2,1,2,1
1691178,21,72,6,109,9,84.02,4,2,1,2,1


In [39]:
# Feature matrix: every column of the encoded frame except the label.
X = new_df.drop(columns=['價格分類'])

# Target vector: the 價格分類 (price class) label.
y = new_df["價格分類"]

### Linear Regression

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed (same value as
# the logistic regression cell) so the split, and therefore the metrics
# printed below, are reproducible after a kernel restart.
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both partitions, so no test-set statistics leak into training.
scaler = preprocessing.StandardScaler().fit(data_X_train)
data_X_train = scaler.transform(data_X_train)
data_X_test = scaler.transform(data_X_test)

# Ordinary least-squares fit. Note that the target is the integer price-class
# label, which is treated here as a numeric quantity.
model = linear_model.LinearRegression()
model.fit(data_X_train, data_y_train)

# Predict on the held-out rows.
data_y_pred = model.predict(data_X_test)

# One fitted coefficient per (standardised) feature column.
print('Coefficients: {}\n'.format(model.coef_))
# Mean squared error on the held-out rows (lower is better).
print("Mean squared error: {}".format((mean_squared_error(data_y_test, data_y_pred))))
# Coefficient of determination (R^2): 1 is a perfect prediction.
print('R2 score: {}'.format(r2_score(data_y_test, data_y_pred)))

### Logistic Regression

In [None]:
# 70/30 split; random_state is the seed value that makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Fit the scaler on the training rows and standardise them in one step.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)

# Logistic regression with scikit-learn's default settings.
model = LogisticRegression().fit(X_train, y_train)

# Standardise the held-out rows with the training statistics; the unscaled
# X_test is left untouched and the scaled copy is kept in X_test_nor.
X_test_nor = scaler.transform(X_test)
y_pred = model.predict(X_test_nor)
accuracy = accuracy_score(y_test, y_pred)

# Fitted coefficients, then the predicted labels, then the accuracy.
print(model.coef_)
print(y_pred)
print(accuracy)

### K-Nearest Neighbors

In [40]:
# Hold out 20% of the rows. The seed is fixed (same value as the logistic
# regression cell) so the accuracy and confusion matrix are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Standardise with training-set statistics only. Distance-based models such as
# k-NN are sensitive to feature scale, so both partitions are transformed.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Classify each held-out row by majority vote among its 3 nearest training rows.
model = neighbors.KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# accuracy: fraction of correct predictions; normalize=False gives the count.
accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
# Rows are true classes, columns are predicted classes.
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('confusion matrix: {}'.format(con_matrix))

number of correct sample: 192714
accuracy: 0.5697619413663833
confusion matrix: [[21125 17007  1750   400   157    40    22    12     4     1    16]
 [15326 93511 17658  2316   540   186    94    31    25     8    16]
 [ 3308 22498 47069  7035  1330   390   150    50    20     9    19]
 [ 1064  4980 10043 15275  3233   831   256    64    31    13    29]
 [  497  1752  3074  4772  7472  1686   543   178    64    16    39]
 [  187   763  1146  1789  2503  3424   955   354   130    39    64]
 [  112   405   486   826  1263  1356  1931   482   192    49    94]
 [   62   169   211   360   629   787   745  1080   232    82   127]
 [   37   107   152   169   305   406   401   373   620   113   159]
 [   15    62    68    89   151   209   216   196   178   240   174]
 [   74   149   136   125   215   257   244   249   185   126   967]]


### Naive Bayes

In [None]:
# Hold out 20% of the rows. The seed is fixed (same value as the logistic
# regression cell) so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Standardise with training-set statistics only.
# Alternative: MultinomialNB rejects negative feature values, so switching to
# it would also require preprocessing.MinMaxScaler() instead of StandardScaler.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Gaussian naive Bayes: models each feature as normally distributed per class.
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# accuracy: fraction of correct predictions; normalize=False gives the count.
accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
# Rows are true classes, columns are predicted classes.
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))

### SVM

In [None]:
# Hold out 20% of the rows. The seed is fixed (same value as the logistic
# regression cell) so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Standardise with training-set statistics only; kernel SVMs are scale-sensitive.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# The kernel can be changed to 'rbf', 'poly' or 'linear'.
# NOTE(review): scikit-learn documents that SVC fit time scales at least
# quadratically with the number of samples. With roughly 1.35M training rows
# (80% of the ~1.69M-row frame) this cell is unlikely to finish; consider
# LinearSVC / SGDClassifier or fitting on a subsample.
model = SVC(kernel='rbf', C=1.3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# accuracy: fraction of correct predictions; normalize=False gives the count.
accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
# Rows are true classes, columns are predicted classes.
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))

### Decision Tree

In [41]:
# Hold out 10% of the rows. The seed is fixed (same value as the logistic
# regression cell) so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)

# Standardise with training-set statistics only. Tree splits depend only on
# the ordering of feature values, so this is kept purely for consistency with
# the other model cells.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# scikit-learn permutes the features at every split, so the classifier needs
# its own seed for the fitted tree to be deterministic.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# accuracy: fraction of correct predictions; normalize=False gives the count.
accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
# Rows are true classes, columns are predicted classes.
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))

number of correct sample: 108215
accuracy: 0.6398786646010478
con_matrix: [[12125  6994   909   210    62    21    10     8     8     7    11]
 [ 6939 47289  8276  1347   357   143    76    31    23    11    26]
 [  922  8150 27162  3647   803   279   102    44    31    12    23]
 [  196  1240  3683 10100  1901   576   219    86    22    16    20]
 [   79   340   769  1916  5044  1085   400   156    67    30    32]
 [   44   145   242   540  1065  2577   665   255   109    54    45]
 [   20    76   126   222   399   640  1530   378   173    71    59]
 [   12    43    48    72   173   255   381   820   181    76   112]
 [    5    13    29    29    58    96   146   203   537   110   111]
 [    5     7    11    21    29    52    67    68   116   257   138]
 [   11    21    37    27    36    59    60    93   133   116   774]]
