In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
import preprocessing as pp

In [4]:
df_train = pd.read_csv('housing_train.csv')
df_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.16,33.84,36.0,2444.0,432.0,1199.0,424.0,4.1538,218800.0,<1H OCEAN
1,-121.83,37.34,26.0,1848.0,339.0,1952.0,327.0,4.087,182500.0,<1H OCEAN
2,-118.01,34.12,32.0,1937.0,332.0,922.0,340.0,3.94,278400.0,INLAND
3,-116.31,33.73,19.0,12467.0,2508.0,4086.0,1761.0,3.2846,131900.0,INLAND
4,-118.17,33.92,36.0,2447.0,503.0,1532.0,498.0,4.3667,171800.0,<1H OCEAN


In [5]:
target = 'median_house_value'
y = df_train[target]
X = df_train.drop(columns=target)[['longitude','latitude']]

In [6]:
categorical_cols = X.select_dtypes(exclude=np.number).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', min_frequency = 0.12))
])

numerical_cols = X.select_dtypes(include=np.number).columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',  StandardScaler())
])

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

X_train = preprocessor.fit_transform(X)
#X_test = preprocessor.transform(X_test)

In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

In [9]:
cross_val_score(KNeighborsRegressor(5), X_train, y).mean()

0.7833861391694519

In [10]:
from sklearn.metrics import mean_absolute_error,r2_score

In [11]:
df_test = pd.read_csv('housing_test.csv')
y_test = df_test[target]
X_test = df_test.drop(columns=target)[['longitude','latitude']]
pp_X_test = preprocessor.transform(X_test)

In [12]:
knn = KNeighborsRegressor(5)
knn.fit(X,y)
preds = knn.predict(X_test)
score = mean_absolute_error(y_test, preds)
print('MAE:', score)
r2score = r2_score(y_test, preds)
print('R2:', r2score)

MAE: 33281.80750968992
R2: 0.7922743555665422


K-mean

In [13]:
X3_train = df_train[['longitude','latitude','median_house_value']]
X3_test = df_test[['longitude','latitude','median_house_value']]

In [14]:
from sklearn.cluster import KMeans

In [15]:
km = KMeans(n_clusters=3842)
km.fit(X)

In [16]:
train_labels = pd.Series(km.labels_)

df_price_cluster = pd.DataFrame(pd.concat([y.reset_index(drop=True),train_labels],axis=1))
df_price_cluster.rename(columns={0: "label"}, inplace=True)
df_price_cluster

Unnamed: 0,median_house_value,label
0,218800.0,967
1,182500.0,2433
2,278400.0,609
3,131900.0,135
4,171800.0,2031
...,...,...
16507,193200.0,178
16508,187500.0,2247
16509,232200.0,2461
16510,206300.0,2454


In [17]:
preds = km.predict(X_test)

In [18]:
def myfunc(pred):
    return df_price_cluster[df_price_cluster['label']==pred]['median_house_value'].median()

In [19]:
vfunc = np.vectorize(myfunc)
y_pred = vfunc(preds)

In [20]:
mean_absolute_error(y_test, y_pred)

37781.842781007756