# README

We perform a train/test split, apply some simple preprocessing steps, and create a feature mapper.

The preprocessed train and test sets are saved to the `data` folder.

The fitted mapper is saved to the `model` folder.

In [None]:
import os
# Switch the working directory so the relative paths used below
# ('data/...', 'model/...') resolve against it; presumably this is also what
# makes the `utils` package importable — confirm against the repo layout.
# NOTE(review): the path is relative, so re-running this cell applies the
# change again starting from the new working directory.
os.chdir('../../seminar_1')
import pandas as pd
from utils.helper_functions import missing_values_table, process_datetime, save_pickle
import warnings
# Silences every warning for the rest of the session, not just known-noisy ones.
warnings.filterwarnings('ignore')

# Load the raw dataset (path is relative to the working directory set above).
data = pd.read_csv('data/kc_house_data.csv')

# Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
train, test = train_test_split(
    data,
    random_state=0,
    test_size=0.3,
)

# Very basic preprocessing steps

In [None]:
# Inspect the training split for missing values before preprocessing
# (presumably a per-column summary — see utils.helper_functions to confirm).
missing_values_table(train)

In [None]:
def preprocess_df(df):
    """Parse the ``date`` column and derive datetime features from it.

    The input is copied before any column is assigned, so the caller's
    DataFrame is never modified. This matters because the frames passed in
    are subsets produced by ``train_test_split``; assigning a column on them
    directly would mutate the caller's object and trigger pandas'
    chained-assignment warning.

    Args:
        df: DataFrame containing a ``date`` column that ``pd.to_datetime``
            can parse.

    Returns:
        Whatever ``process_datetime(df, 'date')`` returns for the copied
        frame (presumably the frame with extra ``date_*`` feature columns —
        verify against the helper).
    """
    # Work on a private copy instead of writing into the caller's frame.
    df = df.copy()

    # Convert the raw date values to pandas datetimes.
    df['date'] = pd.to_datetime(df['date'])

    # Derive the datetime-based features from the parsed column.
    return process_datetime(df, 'date')

In [None]:
# Run the same preprocessing on both splits (train first, then test).
train_preprocessed, test_preprocessed = map(preprocess_df, (train, test))

# Mapper

In [None]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns that are standardised. Each column must appear exactly once:
# repeating a column makes the mapper emit redundant copies of the same
# scaled feature.
numerical_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

# Columns that are one-hot encoded. The `date_*` columns are expected to be
# present in the preprocessed frame (presumably added by `process_datetime`).
categorical_features = [
    'waterfront',
    'view',
    'condition',
    'date_month',
    'date_dow',
    'date_quarter',
    'date_isweeknd',
    'date_month_interval',
]

# One (columns, transformer) pair per feature. A fresh transformer instance is
# created for every column so that each one keeps its own fitted state.
# `handle_unknown='ignore'` lets the encoder accept categories at transform
# time that were not seen during fitting.
mapper = DataFrameMapper(
    [([column], StandardScaler()) for column in numerical_features]
    + [
        ([column], OneHotEncoder(handle_unknown='ignore'))
        for column in categorical_features
    ],
    df_out=True,
)

# Fit on the training split only, so no test-set statistics leak into the
# scalers or encoders.
mapper = mapper.fit(train_preprocessed)

In [None]:
# Persist the fitted mapper, then both preprocessed splits (without the index).
save_pickle('model/mapper.pkl', mapper)
for csv_path, frame in (
    ('data/train.csv', train_preprocessed),
    ('data/test.csv', test_preprocessed),
):
    frame.to_csv(csv_path, index=False)