In [1]:
import pandas as pd
import numpy as np


In [2]:
from google.colab import drive, files

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
train_df = pd.read_csv('/content/drive/My Drive/housin/housing.csv')
test_df = pd.read_csv('/content/drive/My Drive/housin/housing.csv')

In [5]:
train_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [6]:
train_df = train_df.drop(columns=['ocean_proximity'])
test_df = test_df.drop(columns=['ocean_proximity'])

In [7]:
test_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [8]:
X_train, y_train = train_df.to_numpy()[:, :-1], train_df.to_numpy()[:, -1]
X_test, y_test = test_df.to_numpy()[:, :-1], test_df.to_numpy()[:, -1]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((20640, 8), (20640,), (20640, 8), (20640,))

In [9]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from copy import deepcopy

std_scaler = StandardScaler().fit(X_train[:, :2])
min_max_scaler = MinMaxScaler().fit(X_train[:, 2:])



In [10]:
from sklearn.impute import SimpleImputer # Importing SimpleImputer

def preprocessor(X):
  A = np.copy(X)
  # Impute NaNs using the mean before scaling
  imputer = SimpleImputer(strategy='mean')
  A = imputer.fit_transform(A)
  A[:, :2] = std_scaler.transform(A[:, :2])
  A[:, 2:] = min_max_scaler.transform(A[:, 2:])
  return A

the preprocessor function takes your data, applies standardization to the first two columns, applies min-max scaling to the remaining columns, and returns the transformed data. This preprocessing can improve the performance of your machine learning model by ensuring that features are on similar scales and distributions

In [11]:
preprocessor(X_test)

array([[-1.32783522,  1.05254828,  0.78431373, ...,  0.00894083,
         0.02055583,  0.53966842],
       [-1.32284391,  1.04318455,  0.39215686, ...,  0.0672104 ,
         0.18697583,  0.53802706],
       [-1.33282653,  1.03850269,  1.        , ...,  0.01381765,
         0.02894261,  0.46602805],
       ...,
       [-0.8237132 ,  1.77823747,  0.31372549, ...,  0.0281398 ,
         0.07104095,  0.08276438],
       [-0.87362627,  1.77823747,  0.33333333, ...,  0.02068444,
         0.05722743,  0.09429525],
       [-0.83369581,  1.75014627,  0.29411765, ...,  0.03879032,
         0.08699227,  0.13025338]])

In [12]:
X_test

array([[-1.2223e+02,  3.7880e+01,  4.1000e+01, ...,  3.2200e+02,
         1.2600e+02,  8.3252e+00],
       [-1.2222e+02,  3.7860e+01,  2.1000e+01, ...,  2.4010e+03,
         1.1380e+03,  8.3014e+00],
       [-1.2224e+02,  3.7850e+01,  5.2000e+01, ...,  4.9600e+02,
         1.7700e+02,  7.2574e+00],
       ...,
       [-1.2122e+02,  3.9430e+01,  1.7000e+01, ...,  1.0070e+03,
         4.3300e+02,  1.7000e+00],
       [-1.2132e+02,  3.9430e+01,  1.8000e+01, ...,  7.4100e+02,
         3.4900e+02,  1.8672e+00],
       [-1.2124e+02,  3.9370e+01,  1.6000e+01, ...,  1.3870e+03,
         5.3000e+02,  2.3886e+00]])

In [13]:
preprocess_transformer = FunctionTransformer(preprocessor)
preprocess_transformer

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

p1 = Pipeline([('Scaler', preprocess_transformer),
               ('Linear Regression', LinearRegression())])

p1

In [15]:
from sklearn.metrics import mean_absolute_error

def fit_and_print(p, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
  p.fit(X_train, y_train)
  train_preds = p.predict(X_train)
  test_preds = p.predict(X_test)
  print('Training error:' +str(mean_absolute_error(train_preds, y_train)))
  print('Test error:' +str(mean_absolute_error(test_preds, y_test)))

In [16]:
fit_and_print(p1)

Training error:50909.49169564505
Test error:50909.49169564505


In [17]:
from sklearn.neighbors import KNeighborsRegressor as KNR

p2 = Pipeline([('Scaler', preprocess_transformer),
               ('KNN Regression', KNR(n_neighbors=7))])

fit_and_print(p2)

Training error:29738.53952796235
Test error:29738.53952796235


In [19]:
from sklearn.ensemble import RandomForestRegressor as RFR

p3 = Pipeline([('Scaler', preprocess_transformer),
               ('Random Forest', RFR(n_estimators=10, max_depth=7))])

fit_and_print(p3)

Training error:41686.5371424645
Test error:41686.5371424645
