## Import modules

In [1]:
import numpy as np
import pandas as pd
import os, sys

In [20]:
from sklearn import datasets, linear_model, preprocessing, neighbors
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.svm import SVC

# import matplotlib.pyplot as plt

## Load data 

In [2]:
# set the path
path = "dataset/"
dirs = os.listdir(path)

# load all city data & storage city code
df_list = []
city_codes = []
for file in dirs:
    df_city = pd.read_csv(path + file, engine='python')
    city_code = file[4]
    df_list.append(df_city)
    city_codes.append(city_code)

CPU times: total: 49.6 s
Wall time: 52.1 s


## Data processing

In [3]:
# add column "city code" 
for each_df, code in zip(df_list, city_codes):
    each_df["城市代碼"] = code

In [4]:
# concat all dataframes
df_origin = pd.concat(df_list, join="inner")
df_origin.index += 1

In [5]:
df = df_origin

#define function for classifying
def price_transform(df):
    value = df['每坪價格']
    if value < 10:
        res = 0
    elif value < 20:
        res = 1
    elif value < 30:
        res = 2
    elif value < 40:
        res = 3
    elif value < 50:
        res = 4
    elif value < 60:
        res = 5
    elif value < 70:
        res = 6
    elif value < 80:
        res = 7
    elif value < 90:
        res = 8
    elif value < 100:
        res = 9
    else:
        res = 10
    return res



# slice datetime
df['交易年月'] = df['交易年月日'].astype('str').str[:5].astype('int64')

# modify floor value
df['樓層'] = df.loc[:, ['移轉層次']]['移轉層次'].str[:2]  # to avoid chain assignment(SettingWithCopyWarning )
df = df.groupby("移轉層次").filter(lambda grp: len(grp) > 1000) # select count value > 1000

#create new column using the function above
df['每坪價格'] = (df['單價元平方公尺'] * 3.3058) / 10000
df['價格分類'] = df.apply(price_transform, axis=1)

df = df[["城市代碼", "鄉鎮市區", "交易標的", "建物移轉總面積平方公尺", "交易年月", "樓層", "建物型態", "建物現況格局-房", "建物現況格局-廳", "建物現況格局-衛", "單價元平方公尺", "每坪價格", "價格分類"]]
df

Unnamed: 0,城市代碼,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月,樓層,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,單價元平方公尺,每坪價格,價格分類
1,A,文山區,房地(土地+建物),35.56,10504,五層,套房(1房1廳1衛),1,1,1,161699.0,53.454455,5
2,A,文山區,房地(土地+建物),115.48,10503,七層,住宅大樓(11層含以上有電梯),2,2,1,195705.0,64.696159,6
3,A,文山區,房地(土地+建物),41.01,10503,二層,套房(1房1廳1衛),1,0,1,146306.0,48.365837,4
4,A,文山區,房地(土地+建物)+車位,219.08,10503,九層,住宅大樓(11層含以上有電梯),3,2,2,133285.0,44.061355,4
5,A,文山區,房地(土地+建物),83.37,10505,三層,華廈(10層含以下有電梯),3,2,2,95958.0,31.721796,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62934,H,中壢區,房地(土地+建物)+車位,195.72,11011,五層,住宅大樓(11層含以上有電梯),3,2,2,103442.0,34.195856,3
62935,H,中壢區,房地(土地+建物)+車位,195.82,11011,十一,住宅大樓(11層含以上有電梯),3,2,2,106798.0,35.305283,3
62936,H,中壢區,房地(土地+建物)+車位,155.83,11011,二層,華廈(10層含以下有電梯),3,2,2,66164.0,21.872495,2
62937,H,中壢區,房地(土地+建物)+車位,117.09,11011,八層,華廈(10層含以下有電梯),2,2,1,69246.0,22.891343,2


In [6]:
df = df.loc[~df["交易標的"].isin(["車位","土地"]) & df['樓層'].notna() & df['單價元平方公尺'].notna()]
df

Unnamed: 0,城市代碼,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月,樓層,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,單價元平方公尺,每坪價格,價格分類
1,A,文山區,房地(土地+建物),35.56,10504,五層,套房(1房1廳1衛),1,1,1,161699.0,53.454455,5
2,A,文山區,房地(土地+建物),115.48,10503,七層,住宅大樓(11層含以上有電梯),2,2,1,195705.0,64.696159,6
3,A,文山區,房地(土地+建物),41.01,10503,二層,套房(1房1廳1衛),1,0,1,146306.0,48.365837,4
4,A,文山區,房地(土地+建物)+車位,219.08,10503,九層,住宅大樓(11層含以上有電梯),3,2,2,133285.0,44.061355,4
5,A,文山區,房地(土地+建物),83.37,10505,三層,華廈(10層含以下有電梯),3,2,2,95958.0,31.721796,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62934,H,中壢區,房地(土地+建物)+車位,195.72,11011,五層,住宅大樓(11層含以上有電梯),3,2,2,103442.0,34.195856,3
62935,H,中壢區,房地(土地+建物)+車位,195.82,11011,十一,住宅大樓(11層含以上有電梯),3,2,2,106798.0,35.305283,3
62936,H,中壢區,房地(土地+建物)+車位,155.83,11011,二層,華廈(10層含以下有電梯),3,2,2,66164.0,21.872495,2
62937,H,中壢區,房地(土地+建物)+車位,117.09,11011,八層,華廈(10層含以下有電梯),2,2,1,69246.0,22.891343,2


In [None]:
# query needed columns
df = df.loc[~df["交易標的"].isin(["車位","土地"]) & df['移轉層次'].notna() & df['單價元平方公尺'].notna()]
# select column without garage and land
# ~df["交易標的"].isin(["車位","土地"])
# without NaN
# df['移轉層次'].notna()

# modify floor value
df_floor = df.loc[:, ['移轉層次']]  # to avoid chain assignment(SettingWithCopyWarning )
df_floor['移轉層次'] = df_floor['移轉層次'].str[:2]
df[]
# select count value > 1000
df = df.groupby("移轉層次").filter(lambda grp: len(grp) > 1000)
# floor_counts = df_test.groupby("移轉層次")["移轉層次"].transform(len)
# mas = floor_counts < 1000

df


## Categorical features

In [None]:
# types of columns
print(df.dtypes)
print("=======================================")

print(df.select_dtypes(include='O').keys())
print("=======================================")

o_type_columns = df[['城市代碼', '鄉鎮市區', '交易標的', '建物型態', '樓層']]
print(o_type_columns.count())
print("=======================================")

# unique values in each columns
for i in o_type_columns.columns:
    #prinfting unique values
    print(i ,':', len(o_type_columns[i].unique()))

In [None]:
# Checking Categorical columns
cat = df.select_dtypes(include='O').keys()
cat

In [None]:
o_type_columns = df[['城市', '鄉鎮市區', '交易標的', '建物型態', '移轉層次']]
o_type_columns.count()

In [None]:
# unique values in each columns
for i in o_type_columns.columns:
    #prinfting unique values
    print(i ,':', len(o_type_columns[i].unique()))

### Categorical encoding

In [None]:
print(df["樓層"].value_counts().head(50))
print(df["價格分類"].value_counts().head(50))

In [7]:
new_floor = {"樓層": 
             {"地下": -1, 
              "全" : 0,
              "一層" : 1, 
              "二層" : 2, 
              "三層" : 3, 
              "四層" : 4, 
              "五層" : 5, 
              "六層" : 6, 
              "七層" : 7, 
              "八層" : 8, 
              "九層" : 9, 
              "十層" : 10, 
              "十一" : 11, 
              "十二" : 12, 
              "十三" : 13, 
              "十四" : 14, 
              "十五" : 15, 
              "十六" : 16, 
              "十七" : 17, 
              "十八" : 18, 
              "十九" : 19, 
              "二十" : 20,
              # "三十" : 30
             }
            }

In [8]:
# transform categorical features
new_df = df
new_df["城市代碼"] = pd.Categorical(new_df["城市代碼"]).codes
new_df["鄉鎮市區"] = pd.Categorical(new_df["鄉鎮市區"]).codes
new_df["交易標的"] = pd.Categorical(new_df["交易標的"]).codes
new_df["建物型態"] = pd.Categorical(new_df["建物型態"]).codes
new_df = new_df.replace(new_floor)
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["城市代碼"] = pd.Categorical(new_df["城市代碼"]).codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["鄉鎮市區"] = pd.Categorical(new_df["鄉鎮市區"]).codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["交易標的"] = pd.Categorical(new_df["交易標的"]).codes
A value is trying to be set on a copy of a 

Unnamed: 0,城市代碼,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月,樓層,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,單價元平方公尺,每坪價格,價格分類
1,0,69,1,35.56,10504,5,3,1,1,1,161699.0,53.454455,5
2,0,69,1,115.48,10503,7,0,2,2,1,195705.0,64.696159,6
3,0,69,1,41.01,10503,2,3,1,0,1,146306.0,48.365837,4
4,0,69,2,219.08,10503,9,0,3,2,2,133285.0,44.061355,4
5,0,69,1,83.37,10505,3,6,3,2,2,95958.0,31.721796,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62934,5,8,2,195.72,11011,5,0,3,2,2,103442.0,34.195856,3
62935,5,8,2,195.82,11011,11,0,3,2,2,106798.0,35.305283,3
62936,5,8,2,155.83,11011,2,6,3,2,2,66164.0,21.872495,2
62937,5,8,2,117.09,11011,8,6,2,2,1,69246.0,22.891343,2


## Supervised learning

In [None]:
new_df.columns

In [9]:
new_df = new_df.drop(["單價元平方公尺",  "每坪價格"], axis = 1)

In [10]:
new_df

Unnamed: 0,城市代碼,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月,樓層,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,價格分類
1,0,69,1,35.56,10504,5,3,1,1,1,5
2,0,69,1,115.48,10503,7,0,2,2,1,6
3,0,69,1,41.01,10503,2,3,1,0,1,4
4,0,69,2,219.08,10503,9,0,3,2,2,4
5,0,69,1,83.37,10505,3,6,3,2,2,3
...,...,...,...,...,...,...,...,...,...,...,...
62934,5,8,2,195.72,11011,5,0,3,2,2,3
62935,5,8,2,195.82,11011,11,0,3,2,2,3
62936,5,8,2,155.83,11011,2,6,3,2,2,2
62937,5,8,2,117.09,11011,8,6,2,2,1,2


In [11]:
# load csv file
# df[0] = pd.Categorical(df[0]).codes
# df[8] = df[8].apply(lambda x: 0 if x> 8 else 1)


X = new_df.drop(['價格分類'], axis=1)
y = new_df["價格分類"]

### Linear Regression

In [12]:
# split
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(X, y, test_size=0.2)

# transform
scaler = preprocessing.StandardScaler().fit(data_X_train)
data_X_train = scaler.transform(data_X_train)

# linear regression
model = linear_model.LinearRegression()
model.fit(data_X_train, data_y_train)
          
# make predictions
data_X_test = scaler.transform(data_X_test)
data_y_pred = model.predict(data_X_test)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))
# The mean squared error
print("Mean squared error: {}".format((mean_squared_error(data_y_test, data_y_pred))))
# Explained variance score: 1 is perfect prediction
print('R2 score: {}'.format(r2_score(data_y_test, data_y_pred)))     

Coefficients: [-0.49877285 -0.25604049  0.01738981  0.0136404   0.04878497  0.16925354
 -0.18833658 -0.0196591  -0.0332216  -0.00161351]

Mean squared error: 2.980453524104724
R2 score: 0.12415035830949273


### Logistic Regression

In [15]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1) #random_state 種子值

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model=LogisticRegression()
model.fit(X_train,y_train)

X_test_nor = scaler.transform(X_test)
y_pred = model.predict(X_test_nor)
accuracy = accuracy_score(y_test, y_pred)

print(model.coef_)
#print prediction result
print(y_pred)
#print accuracy 
print(accuracy)

[[ 1.09686033e+00  3.91194503e-01 -6.62877789e-01  2.26319995e-02
  -1.65864569e+00 -2.98081436e-01  1.38860608e-01  1.91805174e+00
  -2.28448068e-02 -3.82754659e-02]
 [ 1.09295775e+00  3.29323307e-01  1.32388844e-01 -2.77168362e-01
  -3.95643380e-01 -1.88547409e-01  1.37418385e-01  2.63307266e+00
   6.48033772e-01  4.88831130e-02]
 [ 1.18490959e+00  3.64701665e-01  3.19490970e-01  2.71505635e-02
   2.15705903e-01 -1.29825665e-01  3.05652093e-02  2.63003682e+00
   6.50673127e-01  4.42867778e-02]
 [ 1.27303141e+00  2.04947277e-01  8.56028720e-02  3.51157271e-02
   2.43549716e-01 -9.93550259e-03 -1.41846557e-01  2.63342284e+00
   2.32867376e-02 -2.32243306e-02]
 [ 1.01894181e+00 -1.15062940e-02  1.25033081e-02 -7.62786607e-03
   2.45453028e-01 -2.17004137e-02 -2.20491638e-01 -1.48570335e+00
   4.32568191e-01 -1.15430718e-02]
 [ 5.53775283e-01 -9.19462429e-02 -3.39485334e-02  2.48263731e-02
   2.35983315e-01  7.06933221e-02 -1.80264018e-01 -2.01580340e+00
   3.14604216e-01 -1.39331232e-02

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### K-Nearest Neighbor 

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = neighbors.KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)


X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('confusion matrix: {}'.format(con_matrix))

number of correct sample: 241438
accuracy: 0.5842575361957608
confusion matrix: [[ 16368  13116   1641    381    129     51     19      4      3      0
       4]
 [ 11336 110217  21947   2805    699    259    112     29     20      3
      26]
 [  2806  26903  65879  10234   1807    557    204     83     24      9
      51]
 [   880   6002  13876  24009   5274   1354    408    122     57     23
      62]
 [   370   2019   3843   7067  12491   2868    775    270     97     39
     103]
 [   178    875   1399   2747   4014   5651   1380    482    173     66
     135]
 [    65    474    646   1068   1710   2166   2757    717    282     67
     218]
 [    30    191    316    438    829   1071   1058   1355    387    134
     243]
 [    24    106    141    194    429    579    575    580    758    153
     285]
 [    20     61     77     99    190    273    306    297    291    303
     264]
 [    58    151    189    191    274    402    403    387    295    177
    1650]]


### Naive Bayes

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = preprocessing.StandardScaler().fit(X_train)
#scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = GaussianNB()
#model = MultinomialNB()
model.fit(X_train, y_train)

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))

number of correct sample: 40412
accuracy: 0.0977932866936567
con_matrix: [[16755    35   155   195  4831  6760    87  2112    10     0   458]
 [49762    61  1005  3225 47580 30515   692 14092    70     0   868]
 [26187    31   834  4836 44469 23379   414  7952    77     0   535]
 [ 9641    15   402  1366 23669 11662   459  4039    77     0   560]
 [ 3643     9   215   211 13288  5851   378  5316   103     1   845]
 [ 1348     2    80    56  5526  3066   270  5939    89     0   767]
 [  577     2    40    11  2001  1566   232  4971    86     0   703]
 [  283     5    24     5   407   905   119  3714    66     0   571]
 [  168     2    11     4    70   480    41  2479    47     0   477]
 [   93     2     9     7    20   308    38  1340    23     0   353]
 [  350     6    16     4    62   610    32  1900    75     0  1049]]


### SVM

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

# we can change kernel to rbf, poly, linear
model = SVC(kernel='rbf', C=1.3)
model.fit(X_train, y_train)

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))

### Decision Tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))