## Import modules

In [None]:
import os, sys

import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, preprocessing, neighbors
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# import matplotlib.pyplot as plt

## Load data 

In [None]:
# Folder that is scanned for the per-city CSV files.
path = "dataset/"

# os.listdir() returns entries in arbitrary order and also lists non-data
# entries (for example a ".ipynb_checkpoints" folder), so keep only CSV files
# and sort them to make the load order reproducible across runs.
dirs = sorted(name for name in os.listdir(path) if name.lower().endswith(".csv"))
if not dirs:
    raise FileNotFoundError("no CSV files found in {!r}".format(path))

# Load every city file and remember the city code taken from its file name.
df_list = []
city_codes = []
for file in dirs:
    df_city = pd.read_csv(os.path.join(path, file))
    # The fifth character of the file name is used as the city code
    # (presumably a fixed naming scheme -- verify against the dataset).
    city_code = file[4]
    df_list.append(df_city)
    city_codes.append(city_code)

## Data processing

In [None]:
# Tag every per-city frame with the code extracted from its file name,
# stored in the "城市" column.
for city_frame, frame_code in zip(df_list, city_codes):
    city_frame["城市"] = frame_code

In [None]:
# Stack the per-city frames, keeping only the columns they all share.
# ignore_index=True gives every row a unique label; without it each city's
# 0..n labels repeat, and later label-based operations such as df.drop(...)
# would hit unrelated rows that happen to share a label.
df = pd.concat(df_list, join="inner", ignore_index=True)
# Shift to 1-based row labels.
df.index += 1

In [None]:
# Columns kept for the rest of the notebook; "總價元" is the column that is
# later used as the prediction target.
selected_columns = [
    "城市",
    "鄉鎮市區",
    "交易標的",
    "土地移轉總面積平方公尺",
    "交易年月日",
    "移轉層次",
    "建物型態",
    "建物現況格局-房",
    "建物現況格局-廳",
    "建物現況格局-衛",
    "總價元",
]
df = df[selected_columns]
df

### Drop garage and land transactions (rows)

In [None]:
# Row filter (no columns are removed here):
#   - exclude transactions whose "交易標的" is "車位" or "土地"
#   - exclude rows where "移轉層次" is missing
is_garage_or_land = df["交易標的"].isin(["車位", "土地"])
has_floor = df["移轉層次"].notna()
df = df.loc[~is_garage_or_land & has_floor]

## Categorical features

In [None]:
# Show the dtype of every column; the object-dtype columns are the ones
# inspected as categorical features in the following cells.
df.dtypes

### Finding Unique Values

In [None]:
# Names of the object-dtype columns, i.e. the categorical candidates.
object_frame = df.select_dtypes(include='O')
cat = object_frame.columns
cat

In [None]:
# Sub-frame holding only the categorical columns, followed by the number of
# non-missing values in each of them.
categorical_names = ['城市', '鄉鎮市區', '交易標的', '建物型態', '移轉層次']
o_type_columns = df[categorical_names]
o_type_columns.count()

In [None]:
# Print how many distinct values (missing values included) each categorical
# column contains.
for column_name in o_type_columns.columns:
    distinct_total = o_type_columns[column_name].nunique(dropna=False)
    print(column_name, ':', distinct_total)

### Drop rows with overlong floor descriptions

In [None]:
# Drop rows whose floor description ("移轉層次") is longer than 6 characters.
# The rows are removed with a boolean mask rather than df.drop(index_labels):
# dropping by label removes *every* row carrying that label, which deletes
# unrelated rows whenever the index contains duplicates.
floor_too_long = df["移轉層次"].str.len() > 6
# Kept for reference: the index labels of the rows that are removed.
useless_floor = df.index[floor_too_long]
df = df[~floor_too_long]
df

In [None]:
# Shorten every floor label to its first two characters.
# NOTE(review): a label such as "二十一層" would be shortened to "二十" and so
# become indistinguishable from "二十層" -- confirm whether such labels occur.
df["移轉層次"] = df["移轉層次"].str.slice(stop=2)

# Keep only the floor labels that occur in more than 1000 rows.
floor_counts = df["移轉層次"].value_counts()
frequent_floors = floor_counts[floor_counts > 1000].index
df = df[df["移轉層次"].isin(frequent_floors)]

print(df["移轉層次"].value_counts().head())
df

### Checking Unique Values

In [None]:
# Rebuild the categorical sub-frame from the filtered data and show the number
# of non-missing values per column.
categorical_names = ['城市', '鄉鎮市區', '交易標的', '建物型態', '移轉層次']
o_type_columns = df[categorical_names]
o_type_columns.count()

In [None]:
# Print the number of distinct values (missing values included) per
# categorical column after filtering.
for column_name in o_type_columns.columns:
    distinct_total = o_type_columns[column_name].nunique(dropna=False)
    print(column_name, ':', distinct_total)

In [None]:
# Number of rows per floor label, ordered from most to least frequent.
df["移轉層次"].value_counts().sort_values(ascending=False)

In [None]:
# TODO(review): leftover IPython help lookup (`obj?` opens the docstring pager);
# it is not plain Python and can be removed from the finished notebook.
pd.Categorical?

### Categorical encoding

In [None]:
# Replacement table in the nested {column: {old: new}} form: it maps the
# two-character floor labels in "移轉層次" to integers. "地下" becomes -1,
# "全" becomes 0, the labels for floors 1-20 become their floor number, and
# "三十" becomes 30.
new_floor = {
    "移轉層次": {
        "地下": -1,
        "全": 0,
        **{
            label: number
            for number, label in enumerate(
                [
                    "一層", "二層", "三層", "四層", "五層",
                    "六層", "七層", "八層", "九層", "十層",
                    "十一", "十二", "十三", "十四", "十五",
                    "十六", "十七", "十八", "十九", "二十",
                ],
                start=1,
            )
        },
        "三十": 30,
    }
}

In [None]:
# Encode the categorical features as integers.
# Work on a copy: `new_df = df` would only create a second name for the same
# object, so the encoding would silently overwrite the cleaned frame `df`.
new_df = df.copy()

# pd.Categorical(...).codes numbers the categories in sorted order and uses -1
# for missing values.
for column in ["城市", "鄉鎮市區", "交易標的", "建物型態"]:
    new_df[column] = pd.Categorical(new_df[column]).codes

# Floor labels have a natural order, so they are mapped through the explicit
# `new_floor` table instead of arbitrary category codes.
new_df = new_df.replace(new_floor)
new_df

## Supervised learning

In [None]:
# Column labels of the encoded frame that is split into features and target below.
new_df.columns

In [None]:
# From here on `df` refers to the encoded frame.
df = new_df

# Target: the "總價元" column. Features: every other column.
target_column = "總價元"
y = df[target_column]
X = new_df.drop(columns=[target_column])

### Linear Regression

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)

# Standardise the features with statistics learned from the training rows only
# and apply the same scaling to the held-out rows.
scaler = preprocessing.StandardScaler()
data_X_train = scaler.fit_transform(data_X_train)
data_X_test = scaler.transform(data_X_test)

# Fit the linear model and predict the held-out targets.
model = linear_model.LinearRegression().fit(data_X_train, data_y_train)
data_y_pred = model.predict(data_X_test)

# Evaluate on the held-out rows (an R2 score of 1 is a perfect prediction).
linear_mse = mean_squared_error(data_y_test, data_y_pred)
linear_r2 = r2_score(data_y_test, data_y_pred)

print('Coefficients: {}\n'.format(model.coef_))
print("Mean squared error: {}".format(linear_mse))
print('R2 score: {}'.format(linear_r2))

### Decision Tree

In [None]:
# Decision tree on the same target as the linear model above.
# The target is a price that the linear-regression cell already treats as a
# regression problem, so a regressor is used here. A classifier would treat
# every distinct price as its own class, and the confusion matrix alone would
# need (number of classes)^2 cells -- which is what exhausted the kernel's memory.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Scaling is fitted on the training rows only and reused for the test rows.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fixed seed so repeated runs build the same tree.
model = DecisionTreeRegressor(random_state=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Same metrics as the linear-regression cell so the two models are comparable.
print("Mean squared error: {}".format(mean_squared_error(y_test, y_pred)))
print('R2 score: {}'.format(r2_score(y_test, y_pred)))