In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from geopy import distance


In [2]:
def total_bedrooms_fillna(X):

    op_for_nans = X[['total_bedrooms', 'ocean_proximity']].loc[
        X['total_bedrooms'].isna()]['ocean_proximity'].value_counts()
    op_for_nans = op_for_nans.index.to_list()

    for val in op_for_nans:
        median = X[X['ocean_proximity'] == val]['total_bedrooms'].median()
        X.loc[X['ocean_proximity']==val,'total_bedrooms'] =  \
            X[X['ocean_proximity']==val]['total_bedrooms'].fillna(median)

    return X

In [3]:
outlier_cols = ['total_rooms','total_bedrooms', 'population',
                'households', 'median_income']

In [4]:
def iqr_removal(X):
    def outlier_treatment(datacolumn):
        sorted(datacolumn)
        q1, q3 = np.percentile(datacolumn , [25,75])
        iqr = q3 - q1
        lower_range = q1 - (1.5 * iqr)
        upper_range = q3 + (1.5 * iqr)
        return lower_range,upper_range

    for col in outlier_cols:
        lower_range, upper_range = outlier_treatment(X[col])
        outliers = X.loc[(X[col] > upper_range) | (X[col] < lower_range)]
        outliers_indexes = outliers.index
        X = X.drop(outliers_indexes)

    return X

In [5]:
def log_trans(X):
    outlier_cols = ['total_rooms','total_bedrooms', 'population',
                'households', 'median_income']
    outlier_cols.remove('total_rooms')
    for col in outlier_cols:
        X[col] = np.log(X[col])

    return X

In [6]:
def feature_add(X):
    X["rooms_per_household"]=X["total_rooms"]/X["households"]
    X["bedrooms_per_room"]=X["total_bedrooms"]/X["total_rooms"]
    X["population_per_household"]=X["population"]/X["households"]

    return X

In [7]:
def big_city_dist(X):
    cities_coords = pd.read_csv('cal_cities_lat_long.csv')

    cities_pop = pd.read_excel('1990 to 2000 Population Changes in California Cities and Counties (XLS).xlsx')
    cities_pop = cities_pop.drop(range(0, 12), axis=0)
    cities_pop = cities_pop.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], 1)

    cities_pop.columns = ['City', 'Population']

    cities_pop = cities_pop.dropna()

    counties_indexes = cities_pop[cities_pop['City'].str.contains("County")].index
    cities_pop = cities_pop.drop(counties_indexes)


    cities_pop = cities_pop.sort_values('City')
    cities_coords.columns = ['City', 'Latitude', 'Longitude']
    cities_coords = cities_coords.sort_values('City')


    cities_pop['City'] = cities_pop['City'].transform(lambda x: x.str.replace(' city', ''))
    cities_pop['City'] = cities_pop['City'].transform(lambda x: x.str.replace(' town', ''))

    cities_pop = cities_pop.loc[cities_pop['Population'] > 400000]

    cities_coords = cities_coords.loc[cities_coords['City'].isin(cities_pop['City'])]


    X['nearest_big_city'] = np.nan
    X['nearest_big_city_dist'] = np.nan


    for house_index, house_row in X.iterrows():
        house_lat_long = (house_row['latitude'], house_row['longitude'])

        closest_city = None
        for city_index, city_row in cities_coords.iterrows():
            city_lat_long = (round(city_row['Latitude'], 6), round(city_row['Longitude'], 6))
            dist = distance.distance(house_lat_long, city_lat_long).kilometers

            if closest_city is None:
                closest_city = (city_row['City'], dist)
            elif dist < closest_city[1]:
                closest_city = (city_row['City'], dist)

        X.loc[house_index, 'nearest_big_city'] = closest_city[0]
        X.loc[house_index, 'nearest_big_city_dist'] = closest_city[1]

    return X

In [8]:
def high_corr_feat_drop(X):
    X = X.drop(['population', 'total_bedrooms', 'total_rooms', 'households'], 1)

    return X

In [9]:
def onehot_enc(X):
    df_cat = ['ocean_proximity', 'nearest_big_city']

    ohe = OneHotEncoder()

    enc_df = pd.DataFrame(ohe.fit_transform(
        X[df_cat]).toarray())

    pca = PCA(n_components=7)
    enc_df = pca.fit_transform(enc_df)

    enc_df = pd.DataFrame(enc_df)
    print(enc_df.columns.tolist())
    enc_df.columns = [f'pc{i}' for i in range(1, enc_df.shape[1] + 1)]
    print('New:')
    print(enc_df.columns.tolist())
    enc_df = enc_df.reset_index(drop=True)
    X = X.reset_index(drop=True)

    X = pd.concat([X, enc_df], axis=1)

    X = X.drop(df_cat, axis=1)
    print('X columns:')
    print(X.columns.tolist())
    return X

In [10]:
def drop_unimportant_feat(X):
    X = X.drop(['pc2', 'pc3', 'pc5'], 1)

    return X

In [11]:
df = pd.read_csv('train_df.csv')

In [12]:
# couldnt do this in pipeline bcs it deleted rows only in X and not y
print(df.shape)
df = iqr_removal(df)
df = df.loc[df['median_house_value'] < 500001, :]
print(df.shape)

(16512, 10)
(13808, 10)


In [13]:
X_train = df.drop('median_house_value', 1)
y_train = df['median_house_value']

X_test = pd.read_csv('test_df.csv')
y_test = pd.read_csv('test_answers.csv')['median_house_value']

outlier_incomes = X_test.loc[~(X_test['median_income'] == 15.0001)]
X_test = outlier_incomes
y_test = y_test.loc[outlier_incomes.index]

In [14]:
pipeline = Pipeline([
    ('bedrooms_fillna', FunctionTransformer(total_bedrooms_fillna)),
    ('log_trans', FunctionTransformer(log_trans)),
    ('feature_add', FunctionTransformer(feature_add)),
    ('big_city_dist', FunctionTransformer(big_city_dist)),
    ('high_corr_feat_drop', FunctionTransformer(high_corr_feat_drop)),
    ('onehot_enc', FunctionTransformer(onehot_enc)),
    ('drop_unimportant_feat', FunctionTransformer(drop_unimportant_feat)),
    ('standard_scaler', StandardScaler()),
    ('R', RandomForestRegressor(random_state=42))
])


In [15]:
pipeline.fit(X_train, y_train)

[0, 1, 2, 3, 4, 5, 6]
New:
['pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7']
X columns:
['longitude', 'latitude', 'housing_median_age', 'median_income', 'rooms_per_household', 'bedrooms_per_room', 'population_per_household', 'nearest_big_city_dist', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7']


Pipeline(steps=[('bedrooms_fillna',
                 FunctionTransformer(func=<function total_bedrooms_fillna at 0x00000242EDBA7550>)),
                ('log_trans',
                 FunctionTransformer(func=<function log_trans at 0x00000242F5392550>)),
                ('feature_add',
                 FunctionTransformer(func=<function feature_add at 0x00000242F5392D30>)),
                ('big_city_dist',
                 FunctionTransformer(func=<function big_city_dist at 0x0000...
                ('high_corr_feat_drop',
                 FunctionTransformer(func=<function high_corr_feat_drop at 0x00000242F53D3790>)),
                ('onehot_enc',
                 FunctionTransformer(func=<function onehot_enc at 0x00000242EDBDA160>)),
                ('drop_unimportant_feat',
                 FunctionTransformer(func=<function drop_unimportant_feat at 0x00000242F5392790>)),
                ('standard_scaler', StandardScaler()),
                ('R', RandomForestRegressor(random_state

In [16]:
pipeline.score(X_test, y_test)


[0, 1, 2, 3, 4, 5, 6]
New:
['pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7']
X columns:
['longitude', 'latitude', 'housing_median_age', 'median_income', 'rooms_per_household', 'bedrooms_per_room', 'population_per_household', 'nearest_big_city_dist', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7']


0.7817295324754643

In [17]:
# Feature Importances, results dropped in pipeline:

# sns.set(font_scale= 1)
# a4_dims = (11.7, 8.27)
# fig, ax = plt.subplots(figsize=a4_dims)
# 
# features = ['longitude', 'latitude', 'housing_median_age',
#             'median_income', 'rooms_per_household', 'bedrooms_per_room',
#             'population_per_household', 'nearest_big_city_dist',
#             'pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7']
# importances = pipeline['R'].feature_importances_
# indices = np.argsort(importances)
# 
# plt.title('Feature Importances')
# plt.barh(range(len(indices)), importances[indices], color='b', align='center')
# plt.yticks(range(len(indices)), [features[i] for i in indices])
# plt.xlabel('Relative Importance')
# plt.show()