In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
import preprocessing as pp

In [4]:
# Load the housing table (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('../data_exploration/housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# Count of distinct values in each column of the raw frame.
df.nunique()

longitude               844
latitude                862
housing_median_age       52
total_rooms            5926
total_bedrooms         1923
population             3888
households             1815
median_income         12928
median_house_value     3842
ocean_proximity           5
dtype: int64

In [6]:
# Frame-level preparation, applied before the feature/target split.
# NOTE(review): pp.Drop_target comes from the local `preprocessing` module;
# judging by the step name it removes rows based on the target - confirm there.
pipeline_df = Pipeline(steps=[('drop_rows', pp.Drop_target())])
df2 = pipeline_df.fit_transform(df)

In [7]:
# Average number of rows sharing each distinct target value.
len(df2) / df2['median_house_value'].nunique()

5.1223639677167405

In [8]:
# Only the two coordinates are used as features; the target is the median
# house value of the same rows.
target = 'median_house_value'
X = df2[['longitude', 'latitude']]
y = df2[target]

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [9]:
# Partition the training columns by dtype; each group gets its own pipeline.
numerical_cols = X_train.select_dtypes(include=np.number).columns
categorical_cols = X_train.select_dtypes(exclude=np.number).columns

# Numeric features: median imputation, then standardisation.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Non-numeric features: most-frequent imputation, then one-hot encoding with
# unknown categories ignored and a 0.12 minimum-frequency threshold.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', min_frequency=0.12)),
    ]
)

In [10]:
# Route each column group through its matching pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# NOTE(review): the preprocessor is built but never applied - the two lines
# below are disabled, so the models that follow are fit on the raw columns.
# Confirm this is intentional.
#X_train = preprocessor.fit_transform(X_train)
#X_test = preprocessor.transform(X_test)

In [11]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

In [12]:
# Mean cross-validated score of a 5-neighbour KNN regressor on the training
# split (default folds and default scorer).
cross_val_score(
    KNeighborsRegressor(n_neighbors=5),
    X_train,
    y_train,
).mean()

0.7510339151384587

In [13]:
from sklearn.metrics import mean_absolute_error

In [14]:
# Fit the 5-neighbour regressor on the training coordinates and score it on
# the held-out split.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
preds = knn.predict(X_test)

# Mean absolute error, in the same units as median_house_value.
mean_absolute_error(y_true=y_test, y_pred=preds)

31177.392630241422

In [15]:
# Display the training features (only longitude and latitude were kept).
X_train

Unnamed: 0,longitude,latitude
776,-122.10,37.64
15010,-117.03,32.75
18139,-122.01,37.34
3454,-118.41,34.32
10595,-117.79,33.68
...,...,...
13712,-117.22,34.07
20613,-121.55,39.09
10330,-117.76,33.83
11371,-117.98,33.71


K-means

In [16]:
# Coordinates plus target, kept together in one frame.
df3 = df2.loc[:, ['longitude', 'latitude', 'median_house_value']]

In [17]:
# Same test fraction and seed as the earlier feature/target split.
# NOTE(review): X3_train / X3_test are not referenced by the cells visible
# after this one - confirm they are still needed.
X3_train, X3_test = train_test_split(
    df3,
    test_size=0.2,
    random_state=0,
)

In [18]:
from sklearn.cluster import KMeans

In [19]:
# 3842 equals the number of distinct median_house_value entries reported by
# df.nunique() on the raw frame.
# NOTE(review): presumably chosen so there is roughly one cluster per distinct
# price - confirm the intent.
N_PRICE_CLUSTERS = 3842

# k-means starts from randomly chosen centroids; without a fixed seed the
# cluster ids, the per-cluster medians and the final MAE change on every run.
# The seed matches the one used for the train/test split.
km = KMeans(n_clusters=N_PRICE_CLUSTERS, random_state=0)
km.fit(X_train)

In [20]:
# Cluster id assigned to every training row, in training-row order.
train_labels = pd.Series(km.labels_)

# Pair each training price with its cluster id. The price index is reset so
# both series align positionally; the unnamed label series arrives as column 0
# and is renamed to "label".
df_price_cluster = pd.concat(
    [y_train.reset_index(drop=True), train_labels],
    axis=1,
).rename(columns={0: "label"})
df_price_cluster

Unnamed: 0,median_house_value,label
0,206700.0,3402
1,187300.0,3109
2,344200.0,3149
3,203400.0,2111
4,326100.0,1906
...,...,...
15735,129200.0,292
15736,53400.0,9
15737,300700.0,1835
15738,229600.0,385


In [21]:
# Assign every held-out point to its closest fitted cluster.
# NOTE(review): this rebinds `preds`, which earlier held the KNN price
# predictions; from here on it contains cluster ids, not prices.
preds = km.predict(X_test)

In [22]:
def myfunc(pred):
    """Return the median training house value of the cluster labelled ``pred``.

    Reads the module-level ``df_price_cluster`` frame (columns ``label`` and
    ``median_house_value``). A label with no matching rows yields NaN.
    """
    in_cluster = df_price_cluster['label'] == pred
    cluster_prices = df_price_cluster.loc[in_cluster, 'median_house_value']
    return cluster_prices.median()

In [23]:
# Median training price per cluster, computed once. The previous approach
# called myfunc for every test row, rescanning the whole training frame each
# time, and np.vectorize also raised on an empty input.
cluster_medians = df_price_cluster.groupby('label')['median_house_value'].median()

# Kept so any later cell that still calls vfunc keeps working.
vfunc = np.vectorize(myfunc)

# Look up the median for each predicted cluster id; ids with no training rows
# map to NaN, exactly as myfunc would return for them.
y_pred = cluster_medians.reindex(preds).to_numpy()

In [24]:
# Mean absolute error of the cluster-median predictions on the held-out split.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

34905.794409148664