In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

import wrangle as wr
import regression as regr

To reduce chances of getting a Singular matrix, we can use regularization: multiply the diagonal matrix by some number, normally `zero point value`

In [2]:
# load dataset
data = '../data/cars.csv'
df = pd.read_csv(data)
df = wr.rename_columns(df)
df_train, df_val, df_test = wr.split_data(df)
y_train, y_val, y_test = wr.get_target_vars(df_train, df_val, df_test)

The  preparation function below will return a singular matrix (cell 24 in `03_categorical_vars.ipynb`)

In [3]:
base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
def prepare_X(df: pd.DataFrame) -> pd.DataFrame:
    '''
    df: dataframe for the baseline model
    cols: numeric column names
    '''
    df = df.copy()
    features = base.copy()

    df['age'] = 2017 - df.year
    features.append('age')
    
    # got through top-5 car brands
    for v in list(df.make.value_counts().head(5).index):
        feature = 'is_make_%s' % v
        df[feature] = (df['make'] == v).astype(int)
        features.append(feature)
    
    # go through number of doors
    for v in [2, 3, 4]:
        feature = 'num_doors_%s' % v
        df[feature] = (df['number_of_doors'] == v).astype(int)
        features.append(feature)

    # top-3 transmission
    for v in df.transmission_type.value_counts().head(3):
        feature = 'transmission_%s' % v 
        df[feature] = (df.transmission_type == v).astype('uint8')
        features.append(feature)

    # top-4 engine fuel type
    for v in df.engine_fuel_type.value_counts().head(4):
        feature = 'engine_fuel_%s' % v 
        df[feature] = (df.engine_fuel_type == v).astype('uint8')
        features.append(feature)

    df_new = df[features]
    df_new = df_new.fillna(0)
    X = df_new.values
    return X

In [4]:
# copy paste train model/calculate score from cell 25 of categrical vars file
X_train = prepare_X(df_train)
w0, w = regr.train_linear_regression(X_train, y_train)

y_pred = w0 + X_train.dot(w)
print('train', regr.rmse(y_train, y_pred))

X_val = prepare_X(df_val)
y_pred_val = w0 + X_val.dot(w)
print('validation', regr.rmse(y_val, y_pred_val))



LinAlgError: Singular matrix

In [5]:
def train_linear_regression_reg(X_train: np.array, y_train: np.array, r:int=0.01):
    # fit on train
    # added regularization
    '''
    Parameters:
        X_train: 2-D array of features
        y_train: 1-D array of target variable
    The function calculates weights for linear regression equation.
    Returns:
        w[0] -> float, bias (y-intersect)
        w[1:] -> array of weights (floats)
    '''
    # add 1 to the beginning of every vector in features
    X = np.insert(X_train, 0, np.ones(len(X_train)), axis = 1)
    # get gram matrix
    XTX = X.T.dot(X)
    # regularization
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg
    # inverse XTX
    XTX_inv = np.linalg.inv(XTX)
    # calculate weights
    w = XTX_inv.dot(X.T).dot(y_train)
    bias = w[0]
    weights = w[1:]

    return bias, weights

In [7]:
# same as with an error, but with regularization
X_train = prepare_X(df_train)
w0, w = train_linear_regression_reg(X_train, y_train)

y_pred = w0 + X_train.dot(w)
print('train', regr.rmse(y_train, y_pred))

X_val = prepare_X(df_val)
y_pred_val = w0 + X_val.dot(w)
print('validation', regr.rmse(y_val, y_pred_val))

train 0.5026765950096316
validation 0.5076003111192242


Regularization helps to improve the score. In the last model of the previous file RMSE score was way too high.

In [8]:
# copy / paste from 03_categorical_vars.ipynb

base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
def prepare_X(df: pd.DataFrame) -> pd.DataFrame:
    '''
    df: dataframe for the baseline model
    cols: numeric column names
    '''
    df = df.copy()
    features = base.copy()

    df['age'] = 2017 - df.year
    features.append('age')
    
    # go through number of doors
    for v in [2, 3, 4]:
        feature = 'num_doors_%s' % v
        df[feature] = (df['number_of_doors'] == v).astype('uint8')
        features.append(feature)

    categorical_vars = ['make',  'engine_fuel_type', 'transmission_type', 'driven_wheels', 'market_category', 'vehicle_size', 'vehicle_style']
    categories = {}

    for c in categorical_vars:
        categories[c] = list(df[c].value_counts().head().index)

    for c, values in categories.items():
        for v in values:
            df['%s_%s' % (c, v)] = (df[c] == v).astype('uint8')
            features.append('%s_%s' % (c, v))


    df_new = df[features]
    df_new = df_new.fillna(0)
    X = df_new.values
    return X

In [9]:
# cope paste train model/calculate score
X_train = prepare_X(df_train)
w0, w = regr.train_linear_regression(X_train, y_train)

y_pred = w0 + X_train.dot(w)
print('train', regr.rmse(y_train, y_pred))

X_val = prepare_X(df_val)
y_pred_val = w0 + X_val.dot(w)
print('validation', regr.rmse(y_val, y_pred_val))

train 912.5690053266312
validation 1427.9090319413954


Now let's add a regularization

In [10]:
# cope paste train model/calculate score
X_train = prepare_X(df_train)
w0, w = train_linear_regression_reg(X_train, y_train)

y_pred = w0 + X_train.dot(w)
print('train', regr.rmse(y_train, y_pred))

X_val = prepare_X(df_val)
y_pred_val = w0 + X_val.dot(w)
print('validation', regr.rmse(y_val, y_pred_val))

train 0.45664176729583617
validation 0.45935602351429405


The RMSE score significally improved

### Tune the model

In [13]:
for r in [0.0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]:
    X_train = prepare_X(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r = r)

    X_val = prepare_X(df_val)
    y_pred_val = w0 + X_val.dot(w)
    score = regr.rmse(y_val, y_pred_val)

    print(r, w0, score)
    

0.0 -8.271740080240131e+16 1427.9090319413954
1e-05 2.9492300146084087 0.4593257840345944
0.0001 6.238283384441287 0.45932602468209155
0.001 6.295212133272979 0.45932859929368697
0.01 6.2727424567851795 0.45935602351429405
0.1 6.078293836119147 0.4597472961194152
1 5.3187316820049295 0.46388171086208285
10 4.189946611709698 0.4815516830146857


In [15]:
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

for r in [0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    print('%6s' %r, regr.rmse(y_val, y_pred))

 1e-06 0.4593257343441555
0.0001 0.45932602468209155
 0.001 0.45932859929368697
  0.01 0.45935602351429405
   0.1 0.4597472961194152
     1 0.46388171086208285
     5 0.4727830448928472
    10 0.4815516830146857


In [18]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression_reg(X_train, y_train, r=0.000001)

X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)
print('validation:', regr.rmse(y_val, y_pred))

X_test = prepare_X(df_test)
y_pred = w_0 + X_test.dot(w)
print('test:', regr.rmse(y_test, y_pred))

validation: 0.4593257343441555
test: 0.47395909187296353


### Using the model