In [1]:
import random   as rnd
import pandas   as pd
import numpy    as np
import pandas   as pd
import matplotlib
import matplotlib.pyplot as plt

from sklearn.linear_model    import LinearRegression
from sklearn                 import preprocessing
from sklearn.preprocessing   import MinMaxScaler, StandardScaler

In [2]:
def ocean_prox_flatten(data):
    """One-hot encode ``ocean_proximity`` into five 0/1 indicator columns.

    The frame is modified in place: the indicator columns are appended in the
    order ``1h_ocean``, ``island``, ``inland``, ``near_ocean``, ``near_bay``
    and the original ``ocean_proximity`` column is then dropped. A value that
    matches none of the five labels gets 0 in every indicator column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``ocean_proximity`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # (new column name, category label) pairs, listed in output column order.
    indicator_specs = (
        ('1h_ocean', '<1H OCEAN'),
        ('island', 'ISLAND'),
        ('inland', 'INLAND'),
        ('near_ocean', 'NEAR OCEAN'),
        ('near_bay', 'NEAR BAY'),
    )

    proximity = data.ocean_proximity.values
    for column, label in indicator_specs:
        data[column] = [int(value == label) for value in proximity]

    data.drop(columns=['ocean_proximity'], inplace=True)
    return data

In [3]:
def deal_with_missing_values(data):
    """Impute missing ``total_bedrooms`` from ``total_rooms`` by linear regression.

    A single-feature ``LinearRegression`` (``total_bedrooms ~ total_rooms``)
    is fitted on the rows where ``total_bedrooms`` is present, and its
    predictions are written into the rows where it is missing. The frame is
    modified in place. When no value is missing the frame is returned
    untouched and no model is fitted.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``total_rooms`` and ``total_bedrooms`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    missing = data['total_bedrooms'].isna()

    # Nothing to impute: return before touching the model, because
    # scikit-learn estimators reject an input with zero samples.
    if not missing.any():
        return data

    known = ~missing
    known_rooms = data.loc[known, 'total_rooms'].to_numpy().reshape(-1, 1)
    known_beds = data.loc[known, 'total_bedrooms'].to_numpy().reshape(-1, 1)

    model = LinearRegression()
    model.fit(known_rooms, known_beds)

    missing_rooms = data.loc[missing, 'total_rooms'].to_numpy().reshape(-1, 1)
    predicted_beds = model.predict(missing_rooms)

    # Write through one .loc call on the frame itself; the chained form
    # ``data.total_bedrooms.loc[...] = ...`` can end up updating a temporary
    # object instead of ``data``. ravel() keeps the values 1-D even when only
    # a single row is missing.
    data.loc[missing, 'total_bedrooms'] = predicted_beds.ravel()

    return data

In [4]:
def deal_with_missing_values_median(data):
    """Fill missing ``total_bedrooms`` values with the column median.

    The median is computed over the non-missing entries of
    ``total_bedrooms``. The column is replaced on the frame that was passed
    in, so the caller's frame is updated.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``total_bedrooms`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    median_bedrooms = data['total_bedrooms'].median()
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the column selection, which may operate on a temporary copy.
    data['total_bedrooms'] = data['total_bedrooms'].fillna(median_bedrooms)
    return data

In [5]:
def normalize(data):
    """Rescale every column of ``data`` with a freshly fitted ``MinMaxScaler``.

    The scaler is created with its default settings, fitted on the whole
    frame, and the transformed values are written back into the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all accepted by ``MinMaxScaler``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    data.loc[:, :] = MinMaxScaler().fit_transform(data)
    return data

In [6]:
def encode(data):
    """Label-encode every column of ``data`` independently.

    Each column is replaced by the integer codes that ``LabelEncoder``
    assigns to its values. The encoder is refitted for every column, so the
    codes of one column are unrelated to those of another. Columns are
    replaced on the frame that was passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns should be converted to integer labels.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    lab_enc = preprocessing.LabelEncoder()
    # Iterate over columns, not over ``data.values``: the latter yields rows,
    # which would rank each value against the other features of the same row.
    for column in data.columns:
        data[column] = lab_enc.fit_transform(data[column].values)
    return data

In [7]:
def create_new_features(data):
    """Derive per-household ratios and drop the raw count columns.

    Adds ``avg_rooms`` (``total_rooms / households``) and
    ``people_per_house`` (``population / households``), then removes
    ``total_rooms``, ``households`` and ``population``. The input frame is
    left unmodified; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``total_rooms``, ``households`` and ``population`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the two ratio columns appended and the three raw
        columns removed.
    """
    households = data["households"]
    # assign() builds a new frame, so the caller's frame does not end up with
    # the ratio columns added while still holding the raw columns.
    return data.assign(
        avg_rooms=data["total_rooms"] / households,
        people_per_house=data["population"] / households,
    ).drop(columns=["total_rooms", "households", "population"])

In [8]:
def remove_outliers(data, max_house_value=500001, max_population=25000):
    """Keep only rows below the house-value and population thresholds.

    A row is kept when ``median_house_value < max_house_value`` and
    ``population < max_population``. Both comparisons are strict. The input
    frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``median_house_value`` and ``population`` columns.
    max_house_value : number, optional
        Exclusive upper bound for ``median_house_value`` (default 500001).
    max_population : number, optional
        Exclusive upper bound for ``population`` (default 25000).

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that satisfy both conditions.
    """
    keep = (
        (data['median_house_value'] < max_house_value)
        & (data['population'] < max_population)
    )
    return data.loc[keep, :]

In [9]:
def preprocess(data, normalized, encoded, dataframe,
               holdout_size=1000, random_state=None):
    """Run the preprocessing pipeline and split off a random holdout set.

    Works on a copy of ``data``. Steps, in order: one-hot encode
    ``ocean_proximity``, impute ``total_bedrooms``, derive the per-household
    features, optionally scale, optionally label-encode, then sample
    ``holdout_size`` rows as the holdout set and use the remaining rows as
    the main set. ``median_house_value`` is the target in both sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; it is copied and not modified.
    normalized : bool
        When true, apply ``normalize`` to the processed frame.
    encoded : bool
        When true, apply ``encode`` to the processed frame.
    dataframe : bool
        When true, return pandas objects. Otherwise return NumPy arrays, with
        the targets reshaped to a single column.
    holdout_size : int, optional
        Number of rows sampled into the holdout set (default 1000).
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` gives a
        different holdout on every call; pass a seed for a reproducible split.

    Returns
    -------
    tuple
        ``(X, y, X_h, y_h)``: features and target of the main set, followed
        by features and target of the holdout set.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when ``holdout_size`` exceeds the
        number of available rows.
    """
    p_data = data.copy()

    p_data = ocean_prox_flatten(p_data)
    p_data = deal_with_missing_values(p_data)
    # remove_outliers is currently not applied in this pipeline.
    p_data = create_new_features(p_data)

    # NOTE(review): scaling/encoding run before the holdout is split off and
    # on a frame that still contains median_house_value, so the holdout rows
    # and the target take part in fitting the scaler/encoder.
    if normalized:
        p_data = normalize(p_data)
    if encoded:
        p_data = encode(p_data)

    holdout = p_data.sample(n=holdout_size, random_state=random_state)
    p_data = p_data.drop(holdout.index)

    y = p_data.median_house_value
    X = p_data.drop(columns=['median_house_value'])

    y_h = holdout.median_house_value
    X_h = holdout.drop(columns=['median_house_value'])

    if not dataframe:
        y = y.values.reshape(-1, 1)
        X = X.values
        y_h = y_h.values.reshape(-1, 1)
        X_h = X_h.values

    return X, y, X_h, y_h