## Import modules

In [1]:
import numpy as np
import pandas as pd
import sys

In [2]:
# scikit-learn
from sklearn import datasets, linear_model, preprocessing, neighbors
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# import matplotlib.pyplot as plt

## Load data 

In [3]:
# need to prepare taipei_A & taipei_B csv 
df_a = pd.read_csv('dataset//all_A_台北市_A.csv', engine='python')
df_b = pd.read_csv('dataset//all_A_台北市_B.csv', engine='python')

## Data processing

In [4]:
# concat two dataframe
df_concat = pd.concat([df_a, df_b], join="inner")
df_concat.index += 1

In [5]:
# slice datetime
df_concat['交易年月'] = df_concat['交易年月日'].astype('str').str[:5].astype('int64')

In [28]:
# select specific columns with fancy index
df_fi = df_concat[["鄉鎮市區", "交易標的", "建物移轉總面積平方公尺", "交易年月日", "移轉層次", "建物型態", "建物現況格局-房", "建物現況格局-廳", "建物現況格局-衛", "單價元平方公尺"]]
df_fi.head(10)

Unnamed: 0,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,單價元平方公尺
1,文山區,房地(土地+建物),35.56,1050418,五層,套房(1房1廳1衛),1,1,1,161699.0
2,文山區,房地(土地+建物),115.48,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,195705.0
3,文山區,房地(土地+建物),41.01,1050331,二層,套房(1房1廳1衛),1,0,1,146306.0
4,文山區,房地(土地+建物)+車位,219.08,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,133285.0
5,文山區,房地(土地+建物),83.37,1050502,三層,華廈(10層含以下有電梯),3,2,2,95958.0
6,文山區,房地(土地+建物)+車位,228.23,1050411,四層，五層,華廈(10層含以下有電梯),4,2,2,153354.0
7,萬華區,房地(土地+建物)+車位,101.03,1050419,六層,華廈(10層含以下有電梯),2,2,1,125693.0
8,萬華區,房地(土地+建物),80.31,1050314,九層,住宅大樓(11層含以上有電梯),2,1,1,151911.0
9,萬華區,車位,1.41,1050422,一層,其他,0,0,0,
10,萬華區,房地(土地+建物),36.4,1050408,一層，騎樓,套房(1房1廳1衛),1,1,1,203297.0


In [29]:
# remove garage & land & NaN columns
con1 = ~(df_fi["交易標的"] == "車位")
con2 = ~(df_fi["交易標的"] == "土地")
con3 = df_fi['移轉層次'].notna()
con4 = df_fi['單價元平方公尺'].notna()
df_main = df_fi[con1 & con2 & con3 & con4]
df_main

Unnamed: 0,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,單價元平方公尺
1,文山區,房地(土地+建物),35.56,1050418,五層,套房(1房1廳1衛),1,1,1,161699.0
2,文山區,房地(土地+建物),115.48,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,195705.0
3,文山區,房地(土地+建物),41.01,1050331,二層,套房(1房1廳1衛),1,0,1,146306.0
4,文山區,房地(土地+建物)+車位,219.08,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,133285.0
5,文山區,房地(土地+建物),83.37,1050502,三層,華廈(10層含以下有電梯),3,2,2,95958.0
...,...,...,...,...,...,...,...,...,...,...
14234,文山區,房地(土地+建物)+車位,193.16,1100801,五層,住宅大樓(11層含以上有電梯),3,2,2,202351.0
14235,文山區,房地(土地+建物)+車位,225.70,1100801,十七層,住宅大樓(11層含以上有電梯),3,2,3,218117.0
14236,南港區,房地(土地+建物)+車位,76.80,1101113,五層,住宅大樓(11層含以上有電梯),1,0,1,271073.0
14237,南港區,房地(土地+建物)+車位,108.84,1101117,三層,住宅大樓(11層含以上有電梯),2,1,1,282860.0


In [14]:
# delete garage and land rows
useless_columns = df_fi[(df_fi["交易標的"] == "車位") | (df_fi["交易標的"] == "土地")].index
df_main = df_fi.drop(useless_columns)
# delete Nan-rows
df_main = df_main[df_main['移轉層次'].notna()]
df_main = df_main[df_main['單價元平方公尺'].notna()]
df_main

Unnamed: 0,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,單價元平方公尺
1,文山區,房地(土地+建物),35.56,1050418,五層,套房(1房1廳1衛),1,1,1,161699.0
2,文山區,房地(土地+建物),115.48,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,195705.0
3,文山區,房地(土地+建物),41.01,1050331,二層,套房(1房1廳1衛),1,0,1,146306.0
4,文山區,房地(土地+建物)+車位,219.08,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,133285.0
5,文山區,房地(土地+建物),83.37,1050502,三層,華廈(10層含以下有電梯),3,2,2,95958.0
...,...,...,...,...,...,...,...,...,...,...
14234,文山區,房地(土地+建物)+車位,193.16,1100801,五層,住宅大樓(11層含以上有電梯),3,2,2,202351.0
14235,文山區,房地(土地+建物)+車位,225.70,1100801,十七層,住宅大樓(11層含以上有電梯),3,2,3,218117.0
14236,南港區,房地(土地+建物)+車位,76.80,1101113,五層,住宅大樓(11層含以上有電梯),1,0,1,271073.0
14237,南港區,房地(土地+建物)+車位,108.84,1101117,三層,住宅大樓(11層含以上有電梯),2,1,1,282860.0


In [40]:
# delete useless floor
useless_floor = df_main["移轉層次"].str.len() > 6
df_main = df_main[~useless_floor]
# df_main = df_main.drop(useless_floor)


# modify floor value
df_main["移轉層次"] = df_main["移轉層次"].str[:2]

# select count value > 500
df_main = df_main.groupby("移轉層次").filter(lambda grp: len(grp) > 500)
# floor_counts = df_test.groupby("移轉層次")["移轉層次"].transform(len)
# mas = floor_counts < 500

# print(df_main["移轉層次"].value_counts().head())

df_main

Unnamed: 0,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,單價元平方公尺
1,文山區,房地(土地+建物),35.56,1050418,五層,套房(1房1廳1衛),1,1,1,161699.0
2,文山區,房地(土地+建物),115.48,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,195705.0
3,文山區,房地(土地+建物),41.01,1050331,二層,套房(1房1廳1衛),1,0,1,146306.0
4,文山區,房地(土地+建物)+車位,219.08,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,133285.0
5,文山區,房地(土地+建物),83.37,1050502,三層,華廈(10層含以下有電梯),3,2,2,95958.0
...,...,...,...,...,...,...,...,...,...,...
14234,文山區,房地(土地+建物)+車位,193.16,1100801,五層,住宅大樓(11層含以上有電梯),3,2,2,202351.0
14235,文山區,房地(土地+建物)+車位,225.70,1100801,十七,住宅大樓(11層含以上有電梯),3,2,3,218117.0
14236,南港區,房地(土地+建物)+車位,76.80,1101113,五層,住宅大樓(11層含以上有電梯),1,0,1,271073.0
14237,南港區,房地(土地+建物)+車位,108.84,1101117,三層,住宅大樓(11層含以上有電梯),2,1,1,282860.0


### add new Label

In [58]:
type()

True

In [43]:
df_main['單價元平方公尺'].mean()*3.3058

647108.5771817899

In [62]:
#create new column 'Good' using the function above
df_main['每坪價格'] = (df_main['單價元平方公尺'] * 3.3058) / 10000

#view DataFrame 
df_main

Unnamed: 0,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,單價元平方公尺,每坪價格
1,文山區,房地(土地+建物),35.56,1050418,五層,套房(1房1廳1衛),1,1,1,161699.0,53.454455
2,文山區,房地(土地+建物),115.48,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,195705.0,64.696159
3,文山區,房地(土地+建物),41.01,1050331,二層,套房(1房1廳1衛),1,0,1,146306.0,48.365837
4,文山區,房地(土地+建物)+車位,219.08,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,133285.0,44.061355
5,文山區,房地(土地+建物),83.37,1050502,三層,華廈(10層含以下有電梯),3,2,2,95958.0,31.721796
...,...,...,...,...,...,...,...,...,...,...,...
14234,文山區,房地(土地+建物)+車位,193.16,1100801,五層,住宅大樓(11層含以上有電梯),3,2,2,202351.0,66.893194
14235,文山區,房地(土地+建物)+車位,225.70,1100801,十七,住宅大樓(11層含以上有電梯),3,2,3,218117.0,72.105118
14236,南港區,房地(土地+建物)+車位,76.80,1101113,五層,住宅大樓(11層含以上有電梯),1,0,1,271073.0,89.611312
14237,南港區,房地(土地+建物)+車位,108.84,1101117,三層,住宅大樓(11層含以上有電梯),2,1,1,282860.0,93.507859


In [64]:
#define function for classifying players based on points
def transform_price(df):
    value = df['每坪價格']
    if value < 10:
        res = 0
    elif value < 20:
        res = 1
    elif value < 30:
        res = 2
    elif value < 40:
        res = 3
    elif value < 50:
        res = 4
    elif value < 60:
        res = 5
    elif value < 70:
        res = 6
    elif value < 80:
        res = 7
    elif value < 90:
        res = 8
    elif value < 100:
        res = 9
    else:
        res = 10
    return res

df_main['價格分類'] = df_main.apply(transform_price, axis=1)
df_main

Unnamed: 0,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,單價元平方公尺,每坪價格,價格分類
1,文山區,房地(土地+建物),35.56,1050418,五層,套房(1房1廳1衛),1,1,1,161699.0,53.454455,5
2,文山區,房地(土地+建物),115.48,1050327,七層,住宅大樓(11層含以上有電梯),2,2,1,195705.0,64.696159,6
3,文山區,房地(土地+建物),41.01,1050331,二層,套房(1房1廳1衛),1,0,1,146306.0,48.365837,4
4,文山區,房地(土地+建物)+車位,219.08,1050327,九層,住宅大樓(11層含以上有電梯),3,2,2,133285.0,44.061355,4
5,文山區,房地(土地+建物),83.37,1050502,三層,華廈(10層含以下有電梯),3,2,2,95958.0,31.721796,3
...,...,...,...,...,...,...,...,...,...,...,...,...
14234,文山區,房地(土地+建物)+車位,193.16,1100801,五層,住宅大樓(11層含以上有電梯),3,2,2,202351.0,66.893194,6
14235,文山區,房地(土地+建物)+車位,225.70,1100801,十七,住宅大樓(11層含以上有電梯),3,2,3,218117.0,72.105118,7
14236,南港區,房地(土地+建物)+車位,76.80,1101113,五層,住宅大樓(11層含以上有電梯),1,0,1,271073.0,89.611312,8
14237,南港區,房地(土地+建物)+車位,108.84,1101117,三層,住宅大樓(11層含以上有電梯),2,1,1,282860.0,93.507859,9


In [67]:
df_main = df_main.drop(['單價元平方公尺', '每坪價格'], axis=1)

### Checking categorical features

In [32]:
# types of columns
print(df_main.dtypes)
print("====================")

# finding categorical columns
print(df_main.select_dtypes(include='O').keys())
print("====================")

# sum of specific columns
o_type_columns = df_main[['鄉鎮市區', '交易標的', '建物型態', '移轉層次']]
print(o_type_columns.count())
print("====================")

# unique values in each columns
for i in o_type_columns.columns:
    #prinfting unique values
    print(i ,':', len(o_type_columns[i].unique()))

鄉鎮市區            object
交易標的            object
建物移轉總面積平方公尺    float64
交易年月日            int64
移轉層次            object
建物型態            object
建物現況格局-房         int64
建物現況格局-廳         int64
建物現況格局-衛         int64
單價元平方公尺        float64
dtype: object
Index(['鄉鎮市區', '交易標的', '移轉層次', '建物型態'], dtype='object')
鄉鎮市區    222335
交易標的    222335
建物型態    222335
移轉層次    222335
dtype: int64
鄉鎮市區 : 12
交易標的 : 3
建物型態 : 9
移轉層次 : 636


### Categorical encoding

In [68]:
new_floor = {"移轉層次": 
             {"地下": -1, 
              "全" : 0,
              "一層" : 1, 
              "二層" : 2, 
              "三層" : 3, 
              "四層" : 4, 
              "五層" : 5, 
              "六層" : 6, 
              "七層" : 7, 
              "八層" : 8, 
              "九層" : 9, 
              "十層" : 10, 
              "十一" : 11, 
              "十二" : 12, 
              "十三" : 13, 
              "十四" : 14, 
              "十五" : 15, 
              "十六" : 16, 
              "十七" : 17, 
              "十八" : 18, 
              "十九" : 19, 
              "二十" : 20
             }
            }

In [69]:
# Encoding categorical features
new_df = df_main
new_df["鄉鎮市區"] = pd.Categorical(new_df["鄉鎮市區"]).codes
new_df["交易標的"] = pd.Categorical(new_df["交易標的"]).codes
new_df["建物型態"] = pd.Categorical(new_df["建物型態"]).codes
new_df = new_df.replace(new_floor)
new_df

Unnamed: 0,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,價格分類
1,9,1,35.56,1050418,5,3,1,1,1,5
2,9,1,115.48,1050327,7,0,2,2,1,6
3,9,1,41.01,1050331,2,3,1,0,1,4
4,9,2,219.08,1050327,9,0,3,2,2,4
5,9,1,83.37,1050502,3,6,3,2,2,3
...,...,...,...,...,...,...,...,...,...,...
14234,9,2,193.16,1100801,5,0,3,2,2,6
14235,9,2,225.70,1100801,17,0,3,2,3,7
14236,5,2,76.80,1101113,5,0,1,0,1,8
14237,5,2,108.84,1101117,3,0,2,1,1,9


In [70]:
new_df.corr()

Unnamed: 0,鄉鎮市區,交易標的,建物移轉總面積平方公尺,交易年月日,移轉層次,建物型態,建物現況格局-房,建物現況格局-廳,建物現況格局-衛,價格分類
鄉鎮市區,1.0,-0.031539,-0.016388,0.019183,0.000983,-0.057385,0.090245,0.099327,0.045833,-0.119175
交易標的,-0.031539,1.0,0.145373,-0.009438,0.315216,-0.078837,0.056533,0.092752,0.083537,0.163276
建物移轉總面積平方公尺,-0.016388,0.145373,1.0,-0.014552,0.044649,0.017114,0.078225,0.016106,0.083621,0.028039
交易年月日,0.019183,-0.009438,-0.014552,1.0,0.027294,-0.043745,0.027558,0.054167,0.019728,0.102395
移轉層次,0.000983,0.315216,0.044649,0.027294,1.0,-0.291592,-0.058372,-0.020273,-0.017398,0.211793
建物型態,-0.057385,-0.078837,0.017114,-0.043745,-0.291592,1.0,-0.066734,-0.101465,-0.035753,-0.000695
建物現況格局-房,0.090245,0.056533,0.078225,0.027558,-0.058372,-0.066734,1.0,0.61253,0.747789,-0.122617
建物現況格局-廳,0.099327,0.092752,0.016106,0.054167,-0.020273,-0.101465,0.61253,1.0,0.496509,-0.078678
建物現況格局-衛,0.045833,0.083537,0.083621,0.019728,-0.017398,-0.035753,0.747789,0.496509,1.0,-0.024361
價格分類,-0.119175,0.163276,0.028039,0.102395,0.211793,-0.000695,-0.122617,-0.078678,-0.024361,1.0


In [None]:
# write csv for spark
new_df.to_csv('taipei_output.csv')

## Supervised learning

In [None]:
new_df.columns

In [71]:
X = new_df.drop(['價格分類'], axis=1)
y = new_df["價格分類"]

### Linear Regression

In [87]:
# split
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(X, y, test_size=0.2)

# transform
scaler = preprocessing.StandardScaler().fit(data_X_train)
data_X_train = scaler.transform(data_X_train)

# linear regression
model = linear_model.LinearRegression()
model.fit(data_X_train, data_y_train)
          
# make predictions
data_X_test = scaler.transform(data_X_test)
data_y_pred = model.predict(data_X_test)

# make confusion matrix
# con_matrix = confusion_matrix(data_y_test, data_y_pred)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))
# The mean squared error
print("Mean squared error: {}".format((mean_squared_error(data_y_test, data_y_pred))))
# Explained variance score: 1 is perfect prediction
print('R2 score: {}'.format(r2_score(data_y_test, data_y_pred)))
# print('confusion matrix: {}'.format(con_matrix))

Coefficients: [-0.23386439  0.25089944  0.02038476  0.23995669  0.40592593  0.11122738
 -0.43300643 -0.04936964  0.28531013]

Mean squared error: 4.6844224152465825
R2 score: 0.10170605434510405


### Logistic regression

In [None]:
# split data into training data and testing data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1) #random_state 種子值

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

# build model 
model=LogisticRegression()
model.fit(X_train,y_train)

X_test_nor = scaler.transform(X_test)
y_pred = model.predict(X_test_nor)

accuracy = accuracy_score(y_test, y_pred)
con_matrix = confusion_matrix(y_test, y_pred)

# evaluate model
print('accuracy: {}'.format(accuracy))
print('confusion matrix: {}'.format(con_matrix))

### Random Forest

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)


model = RandomForestClassifier(max_depth=7, random_state=0)
model.fit(X_train, y_train)


X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))

### Support Vector Machine

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1) #random_state 種子值

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = SVC(kernel='rbf')
model.fit(X_train, y_train) 

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))


### Decision Tree

In [None]:
# kernel will die
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))