# Import libraries

In [1]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format

# To use datetime
from datetime import datetime

# For clustering location
from sklearn.cluster import KMeans

# Load data

In [2]:
# Read the raw CSV, forcing the `price` column to an integer dtype.
raw_data = pd.read_csv('kc_house_data.csv', dtype={'price': int})

# Build the working frame as a separate copy whose `date` column is parsed to
# datetime; `raw_data` keeps the values exactly as they were loaded.
data = raw_data.assign(date=pd.to_datetime(raw_data['date']))

In [3]:
# Hold out 15% of the rows as the test split; the fixed random_state keeps the
# partition identical from one run to the next.
train_set, test_set = train_test_split(
    data,
    test_size=0.15,
    random_state=42,
)

# Functions

In [4]:
def dummy_date(df, split_date=datetime(2014, 10, 16)):
    """Replace the ``date`` column with a binary ``recent_sells`` indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a datetime-like ``date`` column. It is not modified.
    split_date : datetime, optional
        Cut-off date; rows dated strictly after it get ``recent_sells = 1``,
        every other row gets 0. Defaults to 16 October 2014, the value that
        used to be hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, without ``date`` and with the
        ``recent_sells`` column appended.
    """
    # reset_index returns a new frame, so the caller's data stays untouched
    df_dummied = df.reset_index(drop=True)

    # One vectorised comparison replaces the former row-by-row .loc loop.
    # Missing dates compare as False and therefore map to 0, as before.
    df_dummied['recent_sells'] = (df_dummied['date'] > split_date).astype('int64')
    df_dummied = df_dummied.drop(columns=['date'])

    print('Date function was applied:')
    print('--> It dummified old feature "date" and created new feature "recent_sells"\n')

    return df_dummied

In [5]:
# To remove clear outliers & dummify variable
def clean_bedrooms(df):
    """Drop implausible bedroom counts and dummify ``bedrooms``.

    Rows with more than 10 bedrooms or exactly 0 bedrooms are removed. The
    ``bedrooms`` column is then replaced by ``beds_4more``, which is 1 for
    houses with more than 3 bedrooms and 0 otherwise (1 to 3 bedrooms is the
    reference category).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``bedrooms`` column. It is not modified; the
        previous implementation reset its index in place.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, without ``bedrooms`` and with
        ``beds_4more`` appended.
    """
    bedrooms = df['bedrooms']
    is_outlier = (bedrooms > 10) | (bedrooms == 0)
    n_outliers = int(is_outlier.sum())

    # Boolean filtering followed by reset_index yields an independent frame
    df_cleaned = df.loc[~is_outlier].reset_index(drop=True)

    # Vectorised dummy; missing values compare as False and map to 0
    df_cleaned['beds_4more'] = (df_cleaned['bedrooms'] > 3).astype('int64')
    df_cleaned = df_cleaned.drop(columns=['bedrooms'])

    print('Bedroom function was applied:')
    print(f'--> {n_outliers} outliers were removed')
    print('--> It dummified old feature "bedrooms" and created new feature "beds_4more"\n')

    return df_cleaned

In [6]:
def clear_basement(df):
    """Replace ``sqft_basement`` with a binary ``has_basement`` indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``sqft_basement`` column. It is not modified; the
        previous implementation reset its index and added ``has_basement`` to
        it in place.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, without ``sqft_basement`` and
        with ``has_basement`` appended (1 where the basement area is > 0).
    """
    # reset_index returns a new frame, so the caller's data stays untouched
    df_cleaned = df.reset_index(drop=True)

    # Vectorised dummy; missing areas compare as False and map to 0
    df_cleaned['has_basement'] = (df_cleaned['sqft_basement'] > 0).astype('int64')
    df_cleaned = df_cleaned.drop(columns=['sqft_basement'])

    print('Basement function was applied:')
    print('--> "sqft_basement" feature was dummified into "has_basement"\n')

    return df_cleaned

In [7]:
# Remove every feature whose share of strictly positive values is above 0% but below 10%
# (features with no positive value at all are kept)

def remove_highly_imbalanced_features(df, threshold=0.1):
    """Drop columns in which only a small share of rows is strictly positive.

    A column is dropped when at least one of its values is > 0 but the share
    of such rows is below ``threshold``. Columns with no positive value at
    all are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    threshold : float, optional
        Upper bound (exclusive) on the share of positive rows for a column to
        be dropped. Defaults to 0.1, the value previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the dropped columns.
    """
    n_rows = len(df)
    to_drop = []

    if n_rows > 0:
        # Only numeric/boolean columns can be compared with 0; datetime or
        # string columns used to make the comparison raise a TypeError.
        candidates = df.select_dtypes(include=['number', 'bool']).columns
        for column in candidates:
            positive_share = (df[column] > 0).sum() / n_rows
            if 0 < positive_share < threshold:
                to_drop.append(column)

    df_removed = df.drop(to_drop, axis = 1)

    print('Remove imbalance features function was applied:')
    print(f'--> The following features were dropped: {to_drop}\n')

    return df_removed

In [8]:
class Outliers():
    """IQR-based outlier detection and removal for one DataFrame column.

    A row is an outlier for attribute ``att`` when its value lies above
    ``Q3 + 1.5 * IQR`` or below ``Q1 - 1.5 * IQR``. After ``find`` or
    ``find_remove`` has run, the instance keeps the inspected frame in
    ``df_`` and the outlier rows in ``df_remove``.
    """

    def __init__(self):
        """Create a detector; no state exists until ``find``/``find_remove`` runs."""
        return None

    def _flag(self, df, att):
        """Store ``df`` and its rows outside the 1.5*IQR fences of ``att``; return those rows."""
        self.df_ = df

        q1, q3 = df[att].quantile([0.25, 0.75])
        cst = (q3 - q1) * 1.5

        # Rule:
        high_out = q3 + cst
        low_out = q1 - cst

        # Outliers:
        self.df_remove = df[(df[att] > high_out) | (df[att] < low_out)]
        return self.df_remove

    def find(self, df, att):
        """Detect the outliers of ``df[att]`` and print a summary if any exist.

        Nothing is printed when no outlier is found. Returns ``None``.
        """
        count_out = len(self._flag(df, att))

        if count_out > 0:
            # The ratio is computed only here: an empty frame has no outliers,
            # so the division can no longer hit a zero row count.
            ratio_out = count_out / len(df)
            print('There are {} outliers found for the attribute {}.\n-->This represent {:.2%} of the entire set\n'.format(count_out, att, ratio_out))

    def show(self):
        """Return the outlier rows found by the last detection."""
        return self.df_remove

    def remove(self):
        """Return the last inspected frame without its outlier rows."""
        return self.df_.drop(self.df_remove.index)

    def find_remove(self, df, att):
        """Detect the outliers of ``df[att]`` and return ``df`` without them (no printing)."""
        return df.drop(self._flag(df, att).index)

In [9]:
# Shared module-level detector instance; remove_all_outliers() calls its
# find_remove() method for every column it inspects.
outliers = Outliers() 

In [10]:
def remove_all_outliers(df):
    """Remove IQR outliers column by column using the shared ``outliers`` detector.

    A column is selected when IQR filtering on the incoming frame would drop
    at least one row; ``lat`` and ``long`` are never selected. The selected
    columns are then filtered one after another, each pass working on the
    frame left by the previous one.

    Returns the filtered frame with a fresh 0..n-1 index.
    """
    n_before = len(df)

    # Selection is done against the untouched input frame
    flagged = [
        col
        for col in df.columns
        if len(outliers.find_remove(df, col)) < len(df)
        and col not in ('lat', 'long')
    ]

    # Sequential filtering: the fences are recomputed on the shrinking frame
    for col in flagged:
        df = outliers.find_remove(df, col)

    df_rem = df.reset_index(drop = True)
    n_removed = n_before - len(df)

    print('Remove all outliers function was applied:')
    print(f'--> {n_removed} outliers were removed\n')

    return df_rem

In [11]:
def cluster_long_lat(df):
    """Replace ``lat``/``long``/``zipcode`` with two KMeans-based location dummies.

    A 3-cluster KMeans (``random_state=42``) is fitted on the ``lat`` and
    ``long`` columns of ``df``. Rows labelled 2 get ``loc_north_west = 1`` and
    rows labelled 0 get ``loc_north_east = 1``; the remaining cluster is the
    reference category with both dummies at 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``lat``, ``long`` and ``zipcode`` columns. It is not
        modified; the previous implementation added the dummy columns to it.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``lat``, ``long`` and ``zipcode`` and with the two
        dummy columns appended. The index of ``df`` is preserved.
    """
    X_lat_long = df[['lat', 'long']].values
    kmeans = KMeans(n_clusters = 3, init = 'k-means++', random_state = 42)
    y_kmeans = kmeans.fit_predict(X_lat_long)

    # NOTE(review): labels 2 and 0 are taken to be the north-west and
    # north-east groups. KMeans label numbers are not tied to geography, and
    # the pipeline fits a separate model on each frame it receives, so confirm
    # the mapping holds for every split.
    df_clustered = df.drop(['lat', 'long', 'zipcode'], axis = 1)

    # Assigning the label arrays is positional, so it is correct for any index.
    # The former ``df.loc[i, ...]`` loop treated positions as index labels and
    # mislabelled (or appended) rows unless the index was exactly 0..n-1.
    df_clustered['loc_north_west'] = (y_kmeans == 2).astype('int64')
    df_clustered['loc_north_east'] = (y_kmeans == 0).astype('int64')

    print('Kmeans cluster was applied to long & lat features:')
    print('--> Zipcode, lat, and long features were dropped and two new one-hot encoded features were created\n')

    return df_clustered

# Preprocessing pipeline

In [12]:
def preprocessing_pipeline(df, d_date = True, bed = True, loc = True, 
                           basement = True, remove_imba = True, remove_out = True):
    """Run the enabled preprocessing steps in a fixed order, then de-duplicate on ``id``.

    Step order (each step runs only when its flag is truthy):
    ``dummy_date`` (d_date) -> ``clean_bedrooms`` (bed) -> ``clear_basement``
    (basement) -> ``remove_highly_imbalanced_features`` (remove_imba) ->
    ``remove_all_outliers`` (remove_out) -> ``cluster_long_lat`` (loc).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; it must contain an ``id`` column plus whatever columns the
        enabled steps read. It is not modified.
    d_date, bed, loc, basement, remove_imba, remove_out : bool, optional
        Switches for the individual steps; all default to True.

    Returns
    -------
    pandas.DataFrame
        The processed frame in which, for every duplicated ``id``, only the
        last row is kept.
    """
    # Each flag sits next to the step it controls, so no index bookkeeping
    # is needed to keep the two in sync.
    steps = [
        (d_date, dummy_date),
        (bed, clean_bedrooms),
        (basement, clear_basement),
        (remove_imba, remove_highly_imbalanced_features),
        (remove_out, remove_all_outliers),
        (loc, cluster_long_lat),
    ]

    # Work on a private copy so that a step operating in place cannot alter
    # the caller's frame (e.g. the train/test split objects).
    df = df.copy()

    for enabled, func in steps:
        if enabled:
            df = func(df)

    df1 = df.drop_duplicates(subset='id', keep='last')
    print(f'{len(df)-len(df1)} duplicates were removed')

    return df1

# Results

In [13]:
# Training split: apply the date, bedroom, basement and location steps, but
# keep imbalanced features and outliers in the data.
train_set_preprocesed = preprocessing_pipeline(
    train_set,
    d_date=True,
    bed=True,
    loc=True,
    basement=True,
    remove_imba=False,
    remove_out=False,
)
train_set_preprocesed.shape

Date function was applied:
--> It dummified old feature "date" and created new feature "recent_sells"

Bedroom function was applied:
--> 10 outliers were removed
--> It dummified old feature "bedrooms" and created new feature "beds_4more"

Basement function was applied:
--> "sqft_basement" feature was dummified into "has_basement"

Kmeans cluster was applied to long & lat features:
--> Zipcode, lat, and long features were dropped and two new one-hot encoded features were created

128 duplicates were removed


(18233, 20)

In [14]:
# Test split: same step selection as the training split (imbalanced features
# and outliers are kept).
test_set_preprocesed = preprocessing_pipeline(
    test_set,
    d_date=True,
    bed=True,
    loc=True,
    basement=True,
    remove_imba=False,
    remove_out=False,
)
test_set_preprocesed.shape

Date function was applied:
--> It dummified old feature "date" and created new feature "recent_sells"

Bedroom function was applied:
--> 5 outliers were removed
--> It dummified old feature "bedrooms" and created new feature "beds_4more"

Basement function was applied:
--> "sqft_basement" feature was dummified into "has_basement"

Kmeans cluster was applied to long & lat features:
--> Zipcode, lat, and long features were dropped and two new one-hot encoded features were created

4 duplicates were removed


(3233, 20)

# Export data

In [15]:
# Write both preprocessed splits to CSV; index=False keeps the row index out
# of the files.
train_set_preprocesed.to_csv(
    'train_data_preprocessed_with_outliers_and_imba.csv',
    index=False,
)
test_set_preprocesed.to_csv(
    'test_set_preprocesed_with_outliers_and_imba.csv',
    index=False,
)