Ref: Machine Learning Notebooks, 3rd edition https://github.com/ageron/handson-ml3

- Observation #1 -- The distribution was not normal, so we applied a log transform and that increased our accuracy. 

- Artist
- Album
- Track -- this table might have all the features/attributes

- Insight #1 -- X is correlated with Y


# Deliverables
1. Presentation (Notebook)
2. DagHub repo link
3. Docker repo link
4. GitHub repo link
5. API link
6. Streamlit link

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load the raw housing data from the working directory.
housing = pd.read_csv("housing.csv")

# Bucket median income into five categories; the column exists only so the
# train/test split can be stratified on it.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# 80/20 split, stratified on the income bucket, with a fixed seed.
train, test = train_test_split(housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

# Remove the helper column by rebinding to a new frame rather than mutating in
# place: the split results are derived from `housing`, and in-place edits on
# derived frames can raise SettingWithCopyWarning and make re-runs fragile.
train = train.drop(columns="income_cat")
test = test.drop(columns="income_cat")

In [13]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

class Preprocessor(BaseEstimator, TransformerMixin):
    """Impute, standardise and one-hot encode a housing feature DataFrame.

    Numeric columns are median-imputed and then standardised; categorical
    columns are replaced by one-hot indicator columns. Any other column
    present in the input is carried through unchanged.
    """

    # Numeric feature columns: median-imputed, then standardised.
    numerical_columns = [
        'longitude',
        'latitude',
        'housing_median_age',
        'total_rooms',
        'total_bedrooms',
        'population',
        'households',
        'median_income',
    ]
    # Categorical feature columns: replaced by one-hot indicator columns.
    categorical_columns = [
        'ocean_proximity'
    ]

    def fit(self, X, y=None):
        """Learn imputation, scaling and encoding parameters from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in ``numerical_columns`` and
            ``categorical_columns``.
        y : ignored
            Accepted only for scikit-learn pipeline compatibility.

        Returns
        -------
        Preprocessor
            ``self``, so the call can be chained.
        """
        # Create and fit the median imputer, keeping its output for the scaler.
        self.imputer = SimpleImputer(strategy='median')
        imputed_cols = self.imputer.fit_transform(X[self.numerical_columns])

        # Fit the scaler on the *imputed* values: transform() scales imputed
        # data, so the scaler must learn its mean/variance from the same thing.
        self.scaler = StandardScaler()
        self.scaler.fit(imputed_cols)

        # Categories unseen during fit are encoded as all-zero rows.
        self.onehot = OneHotEncoder(handle_unknown='ignore')
        self.onehot.fit(X[self.categorical_columns])

        return self

    def transform(self, X):
        """Return a transformed copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in ``numerical_columns`` and
            ``categorical_columns``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the numeric columns imputed and scaled, the
            categorical columns dropped, and one integer indicator column
            appended per category learned in ``fit``.
        """
        # Impute, then scale the imputer's output directly so the scaler sees
        # the same kind of input it was fitted on.
        imputed_cols = self.imputer.transform(X[self.numerical_columns])
        scaled_cols = self.scaler.transform(imputed_cols)
        onehot_cols = self.onehot.transform(X[self.categorical_columns])

        # Work on a copy so the caller's frame is left untouched.
        transformed_df = X.copy()
        transformed_df[self.numerical_columns] = scaled_cols

        # Drop existing categorical columns and replace with one hot equivalent
        transformed_df = transformed_df.drop(self.categorical_columns, axis=1)
        transformed_df[self.onehot.get_feature_names_out()] = onehot_cols.toarray().astype(int)

        return transformed_df


from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
# Separate the target column from the features for both splits.
y_train = train['median_house_value']
X_train = train.drop('median_house_value', axis=1)

y_test = test['median_house_value']
X_test = test.drop('median_house_value', axis=1)

# Preprocessing + model in one pipeline. The forest is seeded (same seed as the
# train/test split) so that re-running the notebook reproduces the same model.
rfg = make_pipeline(Preprocessor(), RandomForestRegressor(random_state=42))

In [18]:
# Fit the whole pipeline (preprocessing + random forest) on the training split.
# NOTE(review): the recorded output for this cell is a NameError about 'rgf',
# which does not match the code here ('rfg') -- it looks like a stale output
# from an earlier typo; re-run the cell to refresh it.
rfg.fit(X_train, y_train)

NameError: name 'rgf' is not defined

In [35]:
import json
# Pretty-print the first test row as JSON.
print(
    json.dumps(
        X_test.iloc[0].to_dict(),
        indent=2,
    )
)

{
  "longitude": -118.39,
  "latitude": 34.12,
  "housing_median_age": 29.0,
  "total_rooms": 6447.0,
  "total_bedrooms": 1012.0,
  "population": 2184.0,
  "households": 960.0,
  "median_income": 8.2816,
  "ocean_proximity": "<1H OCEAN"
}


In [19]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error on the training split (in-sample error).
y_pred_train = rfg.predict(X_train)
mean_absolute_error(y_true=y_train, y_pred=y_pred_train)

11856.918169815892

In [20]:
# Mean absolute error on the held-out test split.
y_pred_test = rfg.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

30883.678573158915

In [26]:
from dill import dump, load

# Serialise the fitted pipeline to 'rfg_model.pkl' with dill.
with open('rfg_model.pkl', 'wb') as model_file:
    dump(rfg, model_file)

In [28]:
# Read the pipeline back from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('rfg_model.pkl', 'rb') as model_file:
    reloaded_model = load(model_file)

In [30]:
# Round-trip check: score the reloaded pipeline on the test split so the
# result can be compared with the in-memory model's test MAE.
y_pred_test = reloaded_model.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

30883.678573158915

In [41]:
# Example request body: one record keyed by feature name.
payload = {
  "longitude": -118.39,
  "latitude": 34.12,
  "housing_median_age": 29.0,
  "total_rooms": 6447.0,
  "total_bedrooms": 1012.0,
  "population": 2184.0,
  "households": 960.0,
  "median_income": 8.2816,
  "ocean_proximity": "<1H OCEAN"
}

# Wrap the record in a list so pandas builds a one-row frame whose columns are
# the payload keys -- the pipeline expects a DataFrame, not a bare dict.
df = pd.DataFrame([payload])
a = reloaded_model.predict(df)


In [43]:
# Inspect the type of the first element of the prediction result.
type(a[0])

numpy.float64