## Import modules

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, preprocessing, neighbors
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
# import matplotlib.pyplot as plt

## Load data 

In [2]:
# need to prepare taipei_A & taipei_B csv 
df_a = pd.read_csv('data/all_A_taipei_A.csv')
df_b = pd.read_csv('data/all_A_taipei_B.csv')

  df_a = pd.read_csv('data/all_A_taipei_A.csv')


## Data processing

In [3]:
# combine two dataframe
df_concat = pd.concat([df_a, df_b], join="inner")
df_concat.index += 1

In [4]:
# select specific columns use fancy index
df_fi = df_concat[["鄉鎮市區", "交易標的", "土地移轉總面積平方公尺", "交易年月日", "移轉層次", "建物型態", "建物現況格局-房", "建物現況格局-廳", "建物現況格局-衛", "總價元"]]
df_fi.head(10)

Unnamed: 0,鄉鎮市區,交易標的,土地移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,總價元
1,文山區,房地(土地+建物),4.07,1050418,五層,套房(1房1廳1衛),1,1,1,5750000
2,文山區,房地(土地+建物),9.54,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,22600000
3,文山區,房地(土地+建物),11.53,1050331,二層,套房(1房1廳1衛),1,0,1,6000000
4,文山區,房地(土地+建物)+車位,52.84,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,29200000
5,文山區,房地(土地+建物),15.69,1050502,三層,華廈(10層含以下有電梯),3,2,2,8000000
6,文山區,房地(土地+建物)+車位,99.65,1050411,四層，五層,華廈(10層含以下有電梯),4,2,2,35000000
7,萬華區,房地(土地+建物)+車位,9.32,1050419,六層,華廈(10層含以下有電梯),2,2,1,12700000
8,萬華區,房地(土地+建物),19.23,1050314,九層,住宅大樓(11層含以上有電梯),2,1,1,12200000
9,萬華區,車位,0.15,1050422,一層,其他,0,0,0,1380000
10,萬華區,房地(土地+建物),14.04,1050408,一層，騎樓,套房(1房1廳1衛),1,1,1,7400000


### Delete useless columns (garage & land)

In [5]:
# delete garage and land rows
useless_columns = df_fi[(df_fi["交易標的"] == "車位") | (df_fi["交易標的"] == "土地")].index
df_main = df_fi.drop(useless_columns)
df_main

Unnamed: 0,鄉鎮市區,交易標的,土地移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,總價元
1,文山區,房地(土地+建物),4.07,1050418,五層,套房(1房1廳1衛),1,1,1,5750000
2,文山區,房地(土地+建物),9.54,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,22600000
3,文山區,房地(土地+建物),11.53,1050331,二層,套房(1房1廳1衛),1,0,1,6000000
4,文山區,房地(土地+建物)+車位,52.84,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,29200000
5,文山區,房地(土地+建物),15.69,1050502,三層,華廈(10層含以下有電梯),3,2,2,8000000
...,...,...,...,...,...,...,...,...,...,...
14234,文山區,房地(土地+建物)+車位,32.10,1100801,五層,住宅大樓(11層含以上有電梯),3,2,2,35600000
14235,文山區,房地(土地+建物)+車位,38.44,1100801,十七層,住宅大樓(11層含以上有電梯),3,2,3,45700000
14236,南港區,房地(土地+建物)+車位,11.20,1101113,五層,住宅大樓(11層含以上有電梯),1,0,1,16750000
14237,南港區,房地(土地+建物)+車位,17.15,1101117,三層,住宅大樓(11層含以上有電梯),2,1,1,25620000


In [6]:
df_main["移轉層次"] == None
df_main = df_main[df_main['移轉層次'].notna()]
df_main

Unnamed: 0,鄉鎮市區,交易標的,土地移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,總價元
1,文山區,房地(土地+建物),4.07,1050418,五層,套房(1房1廳1衛),1,1,1,5750000
2,文山區,房地(土地+建物),9.54,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,22600000
3,文山區,房地(土地+建物),11.53,1050331,二層,套房(1房1廳1衛),1,0,1,6000000
4,文山區,房地(土地+建物)+車位,52.84,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,29200000
5,文山區,房地(土地+建物),15.69,1050502,三層,華廈(10層含以下有電梯),3,2,2,8000000
...,...,...,...,...,...,...,...,...,...,...
14234,文山區,房地(土地+建物)+車位,32.10,1100801,五層,住宅大樓(11層含以上有電梯),3,2,2,35600000
14235,文山區,房地(土地+建物)+車位,38.44,1100801,十七層,住宅大樓(11層含以上有電梯),3,2,3,45700000
14236,南港區,房地(土地+建物)+車位,11.20,1101113,五層,住宅大樓(11層含以上有電梯),1,0,1,16750000
14237,南港區,房地(土地+建物)+車位,17.15,1101117,三層,住宅大樓(11層含以上有電梯),2,1,1,25620000


## Categorical features

In [7]:
# types of columns
df_main.dtypes

鄉鎮市區            object
交易標的            object
土地移轉總面積平方公尺    float64
交易年月日            int64
移轉層次            object
建物型態            object
建物現況格局-房         int64
建物現況格局-廳         int64
建物現況格局-衛         int64
總價元              int64
dtype: object

### Finding Unique Values

In [8]:
# Checking Categorical columns
cat = df_main.select_dtypes(include='O').keys()
cat

Index(['鄉鎮市區', '交易標的', '移轉層次', '建物型態'], dtype='object')

In [9]:
o_type_columns = df_main[['鄉鎮市區', '交易標的', '建物型態', '移轉層次']]
o_type_columns.count()

鄉鎮市區    220351
交易標的    220351
建物型態    220351
移轉層次    220351
dtype: int64

In [10]:
# unique values in each columns
for i in o_type_columns.columns:
    #prinfting unique values
    print(i ,':', len(o_type_columns[i].unique()))

鄉鎮市區 : 12
交易標的 : 3
建物型態 : 12
移轉層次 : 638


### Delete useless floor

In [11]:
# delete useless floor
useless_floor = df_main[df_main["移轉層次"].str.len() > 6].index
df_main = df_main.drop(useless_floor)
df_main

Unnamed: 0,鄉鎮市區,交易標的,土地移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,總價元
1,文山區,房地(土地+建物),4.07,1050418,五層,套房(1房1廳1衛),1,1,1,5750000
2,文山區,房地(土地+建物),9.54,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,22600000
3,文山區,房地(土地+建物),11.53,1050331,二層,套房(1房1廳1衛),1,0,1,6000000
4,文山區,房地(土地+建物)+車位,52.84,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,29200000
5,文山區,房地(土地+建物),15.69,1050502,三層,華廈(10層含以下有電梯),3,2,2,8000000
...,...,...,...,...,...,...,...,...,...,...
14234,文山區,房地(土地+建物)+車位,32.10,1100801,五層,住宅大樓(11層含以上有電梯),3,2,2,35600000
14235,文山區,房地(土地+建物)+車位,38.44,1100801,十七層,住宅大樓(11層含以上有電梯),3,2,3,45700000
14236,南港區,房地(土地+建物)+車位,11.20,1101113,五層,住宅大樓(11層含以上有電梯),1,0,1,16750000
14237,南港區,房地(土地+建物)+車位,17.15,1101117,三層,住宅大樓(11層含以上有電梯),2,1,1,25620000


In [14]:
# modify floor value
df_main["移轉層次"] = df_main["移轉層次"].str[:2]

# select count value > 500
df_main = df_main.groupby("移轉層次").filter(lambda grp: len(grp) > 500)
# floor_counts = df_test.groupby("移轉層次")["移轉層次"].transform(len)
# mas = floor_counts < 500

print(df_main["移轉層次"].value_counts().head())
df_main

四層    29196
三層    28295
二層    25935
五層    22109
一層    18736
Name: 移轉層次, dtype: int64


Unnamed: 0,鄉鎮市區,交易標的,土地移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,總價元
1,文山區,房地(土地+建物),4.07,1050418,五層,套房(1房1廳1衛),1,1,1,5750000
2,文山區,房地(土地+建物),9.54,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,22600000
3,文山區,房地(土地+建物),11.53,1050331,二層,套房(1房1廳1衛),1,0,1,6000000
4,文山區,房地(土地+建物)+車位,52.84,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,29200000
5,文山區,房地(土地+建物),15.69,1050502,三層,華廈(10層含以下有電梯),3,2,2,8000000
...,...,...,...,...,...,...,...,...,...,...
14234,文山區,房地(土地+建物)+車位,32.10,1100801,五層,住宅大樓(11層含以上有電梯),3,2,2,35600000
14235,文山區,房地(土地+建物)+車位,38.44,1100801,十七,住宅大樓(11層含以上有電梯),3,2,3,45700000
14236,南港區,房地(土地+建物)+車位,11.20,1101113,五層,住宅大樓(11層含以上有電梯),1,0,1,16750000
14237,南港區,房地(土地+建物)+車位,17.15,1101117,三層,住宅大樓(11層含以上有電梯),2,1,1,25620000


### Checking Unique Values

In [15]:
o_type_columns = df_main[['鄉鎮市區', '交易標的', '建物型態', '移轉層次']]
o_type_columns.count()

鄉鎮市區    215447
交易標的    215447
建物型態    215447
移轉層次    215447
dtype: int64

In [16]:
# unique values in each columns
for i in o_type_columns.columns:
    #prinfting unique values
    print(i ,':', len(o_type_columns[i].unique()))

鄉鎮市區 : 12
交易標的 : 3
建物型態 : 12
移轉層次 : 22


In [17]:
pd.Categorical?

[0;31mInit signature:[0m
[0mpd[0m[0;34m.[0m[0mCategorical[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mvalues[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcategories[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mordered[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m:[0m [0;34m'Dtype | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfastpath[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcopy[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Represent a categorical variable in classic R / S-plus fashion.

`Categoricals` can only take on only a limited, and usually fixed, number
of possible values (`categories`). In contrast to statistical categorical
variables, a `Categorical` might have an order, but nume

### Categorical encoding

In [42]:
new_floor = {"移轉層次": 
             {"地下": -1, 
              "全" : 0,
              "一層" : 1, 
              "二層" : 2, 
              "三層" : 3, 
              "四層" : 4, 
              "五層" : 5, 
              "六層" : 6, 
              "七層" : 7, 
              "八層" : 8, 
              "九層" : 9, 
              "十層" : 10, 
              "十一" : 11, 
              "十二" : 12, 
              "十三" : 13, 
              "十四" : 14, 
              "十五" : 15, 
              "十六" : 16, 
              "十七" : 17, 
              "十八" : 18, 
              "十九" : 19, 
              "二十" : 20
             }
            }

In [26]:
# transform categorical features
new_df = df_main
new_df["鄉鎮市區"] = pd.Categorical(new_df["鄉鎮市區"]).codes
new_df["交易標的"] = pd.Categorical(new_df["交易標的"]).codes
new_df["建物型態"] = pd.Categorical(new_df["建物型態"]).codes
new_df = new_df.replace(new_floor)
new_df

Unnamed: 0,鄉鎮市區,交易標的,土地移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,總價元
1,9,1,4.07,1050418,五層,4,1,1,1,5750000
2,9,1,9.54,1050327,七層,0,2,2,1,22600000
3,9,1,11.53,1050331,二層,4,1,0,1,6000000
4,9,2,52.84,1050327,九層,0,3,2,2,29200000
5,9,1,15.69,1050502,三層,8,3,2,2,8000000
...,...,...,...,...,...,...,...,...,...,...
14234,9,2,32.10,1100801,五層,0,3,2,2,35600000
14235,9,2,38.44,1100801,十七,0,3,2,3,45700000
14236,5,2,11.20,1101113,五層,0,1,0,1,16750000
14237,5,2,17.15,1101117,三層,0,2,1,1,25620000


## Supervised learning

In [None]:
new_df.columns

In [48]:
# load csv file
df = new_df
# df[0] = pd.Categorical(df[0]).codes
# df[8] = df[8].apply(lambda x: 0 if x> 8 else 1)


X = new_df.drop(['總價元'], axis=1)
y = df["總價元"]

### Linear Regression

In [68]:
# split
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# transform
scaler = preprocessing.StandardScaler().fit(data_X_train)
data_X_train = scaler.transform(data_X_train)

# linear regression
model = linear_model.LinearRegression()
model.fit(data_X_train, data_y_train)
          
# make predictions
data_X_test = scaler.transform(data_X_test)
data_y_pred = model.predict(data_X_test)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))
# The mean squared error
print("Mean squared error: {}".format((mean_squared_error(data_y_test, data_y_pred))))
# Explained variance score: 1 is perfect prediction
print('R2 score: {}'.format(r2_score(data_y_test, data_y_pred)))     

Coefficients: [-1864040.3617868   7552636.82361336 36713374.39095541   301725.58456425
  5944069.08941728 -1509813.73072436  -802396.55200514 -5097617.63217554
  4032146.95089259]

Mean squared error: 3025086979674106.0
R2 score: 0.6119830266840013


### Decision Tree

In [None]:
# kernel will die
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))