## Import modules

In [1]:
import numpy as np
import pandas as pd
import os, sys

## Load data 

In [2]:
# set the path
path = "dataset_all/"
dirs = os.listdir(path)

# load all city data & storage city code
df_list = []
city_codes = []
for file in dirs:
    df_city = pd.read_csv(path + file, engine='python')
    city_code = file[4]
    df_list.append(df_city)
    city_codes.append(city_code)

## Data processing

In [3]:
# add column "city code" 
for each_df, code in zip(df_list, city_codes):
    each_df["城市代碼"] = code


# concat all dataframes
df = pd.concat(df_list, join="inner")
df.index += 1
    

In [4]:
# slice datetime
date_string = df['交易年月日'].astype('str')
df['交易年'] = date_string.str[:3].astype('int64')
df['交易月'] = date_string.str[3:5].astype('int64')

# modify floor value
df['樓層'] = df.loc[:, ['移轉層次']]['移轉層次'].str[:2]  # to avoid chain assignment(SettingWithCopyWarning )


# define function for classifying
def price_transform(df):
    value = df['每坪價格']
    if value < 10:
        res = 0
    elif value < 20:
        res = 1
    elif value < 30:
        res = 2
    elif value < 40:
        res = 3
    elif value < 50:
        res = 4
    elif value < 60:
        res = 5
    elif value < 70:
        res = 6
    elif value < 80:
        res = 7
    elif value < 90:
        res = 8
    elif value < 100:
        res = 9
    else:
        res = 10
    return res

#create new column using the function above
df['每坪價格'] = (df['單價元平方公尺'] * 3.3058) / 10000
df['價格分類'] = df.apply(price_transform, axis=1)

df

Unnamed: 0.1,Unnamed: 0,鄉鎮市區,交易標的,土地位置建物門牌,土地移轉總面積平方公尺,都市土地使用分區,非都市土地使用分區,非都市土地使用編定,交易年月日,交易筆棟數,...,車位類別,車位總價元,備註,編號,城市代碼,交易年,交易月,樓層,每坪價格,價格分類
1,1,文山區,房地(土地+建物),臺北市文山區景後街９５號五樓之１,4.07,商,,,1050418,土地1建物1車位0,...,,0,,RPTNMLNJNHLFFAA47CA,A,105,4,五層,53.454455,5
2,2,文山區,房地(土地+建物),臺北市文山區景文街76號7樓,9.54,商,,,1050327,土地1建物1車位0,...,,0,,RPUOMLPJNHLFFAA87CA,A,105,3,七層,64.696159,6
3,3,文山區,房地(土地+建物),臺北市文山區木柵路四段33巷9號2樓之3,11.53,住,,,1050331,土地1建物1車位0,...,,0,,RPVOMLPJNHLFFAA97CA,A,105,3,二層,48.365837,4
4,4,文山區,房地(土地+建物)+車位,臺北市文山區興隆路三段181巷72號9樓,52.84,住,,,1050327,土地3建物1車位1,...,坡道平面,0,,RPVNMLNKNHLFFAA77CA,A,105,3,九層,44.061355,4
5,5,文山區,房地(土地+建物),臺北市文山區忠順街二段５８號三樓,15.69,住,,,1050502,土地1建物1車位0,...,,0,親友、員工或其他特殊關係間之交易。,RPPNMLOKNHLFFAA27CA,A,105,5,三層,31.721796,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,478,北竿鄉,土地,里段157-1地號,30.46,,,,1101029,土地1建物0車位0,...,,0,,RPPNMLTLIIGGFAZ07DA,Z,110,10,,0.135538,0
479,479,北竿鄉,土地,橋仔段401-1地號,51.91,,,,1101103,土地1建物0車位0,...,,0,,RPPNMLNJJIGGFAZ98DA,Z,110,11,,1.239675,0
480,480,南竿鄉,土地,津沙段353地號,20.94,,,,1101103,土地1建物0車位0,...,,0,,RPQNMLNJJIGGFAZ09DA,Z,110,11,,1.388436,0
481,481,東引鄉,土地,東引西段610-1地號,44.05,,,,1101108,土地3建物0車位0,...,,0,,RPQNMLTJJIGGFAZ07DA,Z,110,11,,5.628455,0


### Write new csv

In [6]:
# output new csv
df.to_csv('dataset.csv', index = False)

In [None]:
# types of columns
print(df.dtypes)
print("=======================================")

print(df.select_dtypes(include='O').keys())
print("=======================================")

o_type_columns = df[['城市代碼', '鄉鎮市區', '交易標的', '建物型態', '樓層']]
print(o_type_columns.count())
print("=======================================")

# unique values in each columns
for i in o_type_columns.columns:
    #prinfting unique values
    print(i ,':', len(o_type_columns[i].unique()))