In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw car-price data. The path is the Kaggle input mount, so it only resolves inside that environment.
# The file is read as ISO-8859-1 (presumably it is not valid UTF-8 — TODO confirm).
df = pd.read_csv("/kaggle/input/chapter-2-carpricedata/data.csv", encoding="ISO-8859-1")

In [None]:
# Preview the first few rows to check that the CSV was parsed as expected.
df.head()

In [None]:
# Normalize the column names: lowercase them and replace spaces with underscores (snake_case).
df.columns = df.columns.str.lower().str.replace(' ', '_')
# Display the renamed columns to confirm the result.
df.columns

In [None]:
# Next, make the string *values* consistent as well (lowercase, snake_case).
# Step 1: inspect the dtypes to see which columns hold strings (dtype 'object').
df.dtypes

In [None]:
# Boolean mask of the object-dtype columns. It is not the last expression of the cell, so it is not displayed.
df.dtypes == 'object'
# Keep only the names (index labels) of the columns whose dtype is 'object'.
# NOTE(review): assumes string columns are stored with dtype 'object' — verify for the pandas version in use.
strings = list(df.dtypes[df.dtypes == 'object'].index)
strings

In [None]:
# Demonstrate the normalization on a single column ('make'); the result is displayed but not stored.
df['make'].str.lower().str.replace(' ', '_')

In [None]:
# Apply the same normalization to every string column, overwriting each column on df.
for col in strings:
    df[col] = df[col].str.lower().str.replace(' ', '_')
    

# Display the first rows to verify the cleaned values.
df.head()

In [None]:
# Re-check the dtypes after the string cleanup.
df.dtypes

**Data analysis**

In [None]:
# For every column, print its name, a sample of its distinct values and the number of distinct values.
for col in df.columns:
    print(col)
    # first five distinct values, in the order unique() returns them
    print(df[col].unique()[:5])
    # number of distinct values in this column (nunique() leaves NaN out by default)
    print(df[col].nunique())
    print()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Histogram of the prices below 100,000 only; prices at or above that cutoff are left out of the plot.
sns.histplot(df.msrp[df.msrp < 100000], bins=50)

In [None]:
 # log(x + 1) written out by hand for x = 0, 1, 10, 1000, 100000; the +1 keeps the log defined at x = 0.
 np.log([0+ 1, 1+ 1, 10 + 1, 1000 + 1, 100000+ 1])

In [None]:
# np.log1p computes the same log(1 + x) in a single call.
np.log1p([0,1,10,1000,100000])

In [None]:
# Apply the log1p transform to the price column.
price_logs = np.log1p(df.msrp)
price_logs
# The log transform is used to compress the long tail of the price distribution.

In [None]:
sns.histplot(price_logs, bins=50)
# After the transform the histogram should look much closer to a bell shape — check the plot to confirm.

In [None]:
df.isnull().sum()
# Number of missing values per column. Keep these in mind for when we train the model.

**Setting up the Validation Framework**

In [None]:
# Split sizes: 20% validation, 20% test, and the remainder for training.
n = len(df)
n_val = int(n * .2)
n_test = int(n * .2)
# Computing n_train by subtraction makes the three parts add up to n even though int() truncates.
n_train = n - n_val - n_test
# Sanity check: the two printed numbers should be equal.
print(n, n_val + n_test + n_train)

In [None]:
# Array of row positions 0, 1, ..., n-1 (n elements in total).
idx = np.arange(n)
idx

In [None]:
# Next we shuffle the index array we just created.
# To make the results reproducible, set the random seed first.
np.random.seed(2)


In [None]:
# Shuffle the row positions in place.
np.random.shuffle(idx)
idx
# The shuffled order may differ from the one shown in the video, e.g. because of a different NumPy version.

In [None]:
# Slice the shuffled positions into train / validation / test parts and pull those rows from df.
# iloc selects rows by integer position, which is exactly what the idx array holds.
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_val+n_train]]
df_test = df.iloc[idx[n_val+n_train:]]
# Shuffling first is important: without it each part would be a sequential slice of the file.
df_train

In [None]:
# After the shuffled selection the index labels are in random order.
# reset_index(drop=True) renumbers the rows from 0 and discards the old labels.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# These three frames are the source of the feature matrices X.


In [None]:
# Define the target y for each split: log1p of the msrp column, taken as plain NumPy arrays.
y_train = np.log1p(df_train.msrp.values)
y_val = np.log1p(df_val.msrp.values)
y_test = np.log1p(df_test.msrp.values)


In [None]:
# Remove msrp from the three frames: the target must not be available as a feature.
# NOTE: del changes the frames in place, so running this cell a second time raises KeyError.
del df_train['msrp']
del df_test['msrp']
del df_val['msrp']

In [None]:
# Look at a single training row (position 10) as an example record.
df_train.iloc[10]

In [None]:
# Example feature vector for a single car (three feature values).
xi = [453, 11, 86]
# Example bias term w0 and weights w, one weight per feature.
w0 = 7.17
w = [0.01, 0.04, 0.002]

In [None]:
def linear_regression(xi):
    """Predict for one feature vector: the bias plus the weighted sum of the features.

    Reads the notebook-level bias ``w0`` and weight list ``w``; ``w[j]`` is
    applied to ``xi[j]``.
    """
    prediction = w0
    for j, feature in enumerate(xi):
        prediction = prediction + w[j] * feature
    return prediction
        

In [None]:
# Evaluate the toy model on the example vector: 7.17 + 4.53 + 0.44 + 0.172 = 12.312.
linear_regression(xi)

In [None]:
# expm1 is the inverse of log1p: it turns the value 12.312 from the log scale back into a price.
np.expm1(12.312)

In [None]:
# Round trip: log1p of that price gives back (approximately) 12.312.
np.log1p(222347.2221101062)

Linear Regression Vector Form

In [None]:
def dot(xi, w):
    """Return the dot product of ``xi`` and ``w`` as a float sum.

    ``w`` is indexed at every position of ``xi``, so it must be at least as
    long as ``xi``; any extra elements of ``w`` are ignored.
    """
    # start from 0.0 so an empty xi yields a float zero
    return sum((xi[j] * w[j] for j in range(len(xi))), 0.0)
        

In [None]:
def linear_regression(xi):
    """Predict for one feature vector using the ``dot`` helper.

    Returns the notebook-level bias ``w0`` plus the dot product of ``xi``
    with the notebook-level weight list ``w``.
    """
    # dot() takes both vectors; the weights have to be passed explicitly
    return w0 + dot(xi, w)
        

In [None]:
# Put the bias in front of the weights so that bias and weights form a single vector.
w_new = [w0] + w
w_new

In [None]:
def linear_regression(xi):
    """Predict for one feature vector with the bias folded into the weights.

    A leading 1 is put in front of ``xi`` so that it lines up with the bias
    stored as the first element of the notebook-level ``w_new``.
    """
    return dot([1] + xi, w_new)
        

In [None]:
# Predict for the example vector again, this time through the combined weight vector w_new.
linear_regression(xi)

In [None]:
# Re-define the example vector, bias, weights and combined weight vector (same values as before).
xi = [453, 11, 86]
w0 = 7.17
w =  [0.01, 0.04, 0.002]
w_new = [w0] + w

In [None]:
# Three example rows; each one starts with 1, the entry that multiplies the bias.
x1 = [1, 148, 24, 1385]
x2 = [1, 132, 25, 2031]
x10 = [1, 453, 11, 86]
# Stack the rows into a 2-D array with one row per example.
X = [x1, x2, x10]
X = np.array(X)
X

In [None]:
# Matrix-vector product: one prediction per row of X.
X.dot(w_new)

In [None]:
def linear_regression(X):
    """Return ``X.dot(w_new)``: the product of ``X`` with the notebook-level weights.

    For a 2-D ``X`` whose rows start with a 1 for the bias, this is one
    prediction per row.
    """
    predictions = X.dot(w_new)
    return predictions

In [None]:
# Predictions for all rows of the example matrix at once.
linear_regression(X)

Solving for W - Normal Equation

In [None]:
def train_linear_regression(x, y):
    """Placeholder: does nothing and returns None."""
    return None

In [None]:


# Small example feature matrix: nine rows, three feature values per row.
X = [
[148, 24, 1385],
[132, 25, 2031],
[453, 11, 86],
[158, 24, 185],
[172, 25, 201],
[413, 11, 86],
[38, 54, 185],
[142, 25, 431],
[453, 31, 86]
]
# Convert the nested list into a 2-D NumPy array.
X = np.array(X)
X

In [None]:
# we need to include the BiasTerm into our matrix
# ones = np.ones(X.shape[0])
# ones

In [None]:
# this allows us to create new matrix by stacking columns
# X = np.column_stack([ones, X])
# X

In [None]:
# Get the GRAM Matrix
# XTX = X.T.dot(X)
# XTX

In [None]:
# now we get the inverse
# XTX_inv = np.linalg.inv(XTX)
# XTX_inv

In [None]:
# Example targets: nine values, one per row of the example matrix X.
y = [ 10000,20000,15000, 20050,10000,20000,15000, 25000, 12000]

In [None]:
# w
# w_full = XTX_inv.dot(X.T).dot(y)
# w_full

In [None]:
# w0 = w_full[0]
# w = w_full[1:]
# w0, w

In [None]:
def train_linear_regression(X, y):
    """Fit a linear regression with the normal equation.

    A column of ones is put in front of ``X`` so that the first solved
    coefficient plays the role of the bias term.

    Args:
        X: NumPy array of features, one row per observation.
        y: target values, one per row of ``X``.

    Returns:
        A tuple ``(w0, w)``: the bias and the array of feature weights.

    Raises:
        numpy.linalg.LinAlgError: if the Gram matrix cannot be inverted.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    # normal equation: w = (X^T X)^-1 X^T y
    gram = design.T.dot(design)
    gram_inverse = np.linalg.inv(gram)
    coefficients = gram_inverse.dot(design.T).dot(y)

    bias, weights = coefficients[0], coefficients[1:]
    return bias, weights


In [None]:
# Fit the example data; the result is the pair (bias, weights).
train_linear_regression(X, y)

Time to train the car price baseline model

In [None]:
# Only the last expression of a cell is displayed, so this dtypes line produces no output here.
df_train.dtypes
# List the columns available for building the baseline feature set.
df_train.columns

In [None]:
# Columns used as the baseline feature set.
base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg',
       'popularity']
# Display just those columns of the training frame.
df_train[base]

In [None]:
# Before training, count the missing values in each baseline feature.
df_train[base].isnull().sum()

In [None]:
# With 0 in place of a missing value, that feature adds nothing (weight * 0) to the row's prediction.
# The filled frame is not stored here; this only confirms that no nulls remain after fillna(0).
df_train[base].fillna(0).isnull().sum()

In [None]:
# Baseline feature matrix: the base columns with missing values replaced by 0, as a NumPy array.
X_train = df_train[base].fillna(0).values
X_train, y_train

In [None]:
# Fit the baseline model; w0 is the bias and w holds one weight per feature column.
w0, w = train_linear_regression(X_train, y_train)
w0, w

In [None]:
# Predictions for the training rows themselves, on the same log1p scale as y_train.
y_pred = w0 + X_train.dot(w)
y_pred

In [None]:
# Overlay the two distributions: predictions in red, actual training targets in green.
sns.histplot(y_pred, color="red", alpha=0.5, bins=50)
sns.histplot(y_train, color="green", alpha=0.5, bins=50)

RMSE

In [None]:
def rmse(y, y_pred):
    """Root mean squared error between the targets ``y`` and the predictions ``y_pred``.

    Both arguments are expected to be NumPy arrays of matching shape.
    """
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())
    

In [None]:
# RMSE of the baseline model on the training data.
rmse(y_train, y_pred)

Validating our model

In [None]:
# Baseline feature columns (same list as before); prepare_X reads this notebook-level name.
base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg',
       'popularity']


In [None]:
# create a func to prepare our X values from the DF
def prepare_X(df):
    """Build the feature matrix from the baseline columns.

    Selects the columns named in the notebook-level ``base`` list, replaces
    missing values with 0 and returns the result as a NumPy array.
    """
    return df[base].fillna(0).values
    

In [None]:
# Fit on the training split.
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)
# Prepare the validation split in exactly the same way, then predict on it.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)


In [None]:
# RMSE on the validation split.
rmse(y_val, y_pred)

In [None]:
# Validation predictions (red) against the actual validation targets (green).
sns.histplot(y_pred, color="red", alpha=0.5, bins=50)
sns.histplot(y_val, color="green", alpha=0.5, bins=50)

Feature Engineering 

In [None]:
# Age of each car relative to 2017.
# NOTE(review): 2017 is presumably the most recent year in the data — confirm.
2017 - df_train.year

In [None]:
# create a func to prepare our X values from the DF
def prepare_X(df):
    """Build the feature matrix: the baseline columns plus the car's age.

    Works on a copy, so the caller's frame is left untouched. Missing values
    are replaced with 0 and the result is returned as a NumPy array.
    """
    frame = df.copy()
    frame['age'] = 2017 - frame.year  # age relative to 2017
    selected = base + ['age']
    return frame[selected].fillna(0).values
    

In [None]:
# Rebuild the training matrix with the current prepare_X.
X_train = prepare_X(df_train)

In [None]:
# Fit on the training split with the current prepare_X.
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)
# Prepare the validation split the same way, predict and report the validation RMSE.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

In [None]:
# Validation predictions (red) against the actual validation targets (green) for the refitted model.
sns.histplot(y_pred, color="red", alpha=0.5, bins=50)
sns.histplot(y_val, color="green", alpha=0.5, bins=50)

Categorical variables

In [None]:
# for v in [2, 3, 4]:
#     df_train['num_doors_%s' % v] = (df_train.number_of_doors == v).astype(int)

In [None]:
def prepare_X(df):
    """Build the feature matrix: baseline columns, age and door-count indicators.

    Works on a copy, so the caller's frame is left untouched. Missing values
    are replaced with 0 and the result is returned as a NumPy array.
    """
    frame = df.copy()
    frame['age'] = 2017 - frame.year  # age relative to 2017
    selected = base + ['age']

    # one 0/1 indicator column for each door count
    for doors in (2, 3, 4):
        column = 'num_doors_%s' % doors
        frame[column] = (frame.number_of_doors == doors).astype(int)
        selected.append(column)

    return frame[selected].fillna(0).values
    

In [None]:
# Display the feature matrix produced by the current prepare_X.
prepare_X(df_train)

In [None]:
# Fit on the training split with the current prepare_X.
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)
# Prepare the validation split the same way, predict and report the validation RMSE.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

In [None]:
# Find the most frequent makes. Only the last expression is displayed; the first two lines show nothing.
df.make.value_counts()
df.make.value_counts().head()
# head() keeps the top five entries; .index holds their make names.
list(df.make.value_counts().head().index)

In [None]:
# Names of the five most frequent makes in df.
makes = list(df.make.value_counts().head().index)
makes

In [None]:
def prepare_X(df):
    """Build the feature matrix: baseline columns, age, door-count and make indicators.

    The make indicators cover the values in the notebook-level ``makes``
    list. Works on a copy, so the caller's frame is left untouched. Missing
    values are replaced with 0 and the result is returned as a NumPy array.
    """
    frame = df.copy()
    frame['age'] = 2017 - frame.year  # age relative to 2017
    selected = base + ['age']

    # one 0/1 indicator column for each door count
    for doors in (2, 3, 4):
        column = 'num_doors_%s' % doors
        frame[column] = (frame.number_of_doors == doors).astype(int)
        selected.append(column)

    # one 0/1 indicator column for each make listed in `makes`
    for make in makes:
        column = 'make_%s' % make
        frame[column] = (frame.make == make).astype(int)
        selected.append(column)

    return frame[selected].fillna(0).values
    

In [None]:
# Display the feature matrix produced by the current prepare_X.
prepare_X(df_train)

In [None]:
# Fit on the training split with the current prepare_X.
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)
# Prepare the validation split the same way, predict and report the validation RMSE.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

In [None]:
# Categorical columns that will be turned into 0/1 indicator features.
categories = [
    'make', 'model', 'engine_fuel_type', 'driven_wheels', 'market_category',
    'vehicle_size', 'vehicle_style']

In [None]:
# For each categorical column, remember its five most frequent values.
# NOTE(review): the counts come from the full df, validation and test rows included — confirm this is intended.
categorical = {}
for c in categories:
    categorical[c] = list(df[c].value_counts().head().index)
 

In [None]:
def prepare_X(df):
    """Build the feature matrix: baseline columns, age and categorical indicators.

    Indicator columns are added for the door counts 2, 3 and 4 and for every
    (column, value) pair stored in the notebook-level ``categorical`` dict.
    Works on a copy, so the caller's frame is left untouched. Missing values
    are replaced with 0 and the result is returned as a NumPy array.
    """
    frame = df.copy()

    frame['age'] = 2017 - frame['year']  # age relative to 2017
    selected = base + ['age']

    # one 0/1 indicator column for each door count
    for doors in (2, 3, 4):
        column = 'num_doors_%d' % doors
        frame[column] = (frame.number_of_doors == doors).astype(int)
        selected.append(column)

    # one 0/1 indicator column for each remembered value of each categorical column
    for name, values in categorical.items():
        for value in values:
            column = '%s_%s' % (name, value)
            frame[column] = (frame[name] == value).astype(int)
            selected.append(column)

    return frame[selected].fillna(0).values
    

In [None]:
# Fit on the training split with the current prepare_X.
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)
# Prepare the validation split the same way, predict and report the validation RMSE.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

Regularization

In [None]:
# A 5x5 identity matrix scaled by 0.001: small values on the diagonal, zeros elsewhere.
np.eye(5) * 0.001

In [None]:
# Example matrix whose last two columns are almost identical: they differ only in the final row (5 vs 5.001).
arr = [
    [4,4,4],
    [3,5,5],
    [5,1,1],
    [5,4,4],
    [7,5,5],
    [4,5,5.001]
]
A = np.array(arr) 
A

In [None]:
# XTX = [
#     [1.01, 2, 2],
#     [2, 1.01, 1],
#     [2,1,1.01]
# ]

In [None]:
# xtx_inv = np.linalg.inv(XTX)
# xtx_inv

In [None]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a linear regression with the normal equation plus a diagonal regularization term.

    A column of ones is put in front of ``X`` so that the first solved
    coefficient plays the role of the bias term. ``r`` is added to every
    diagonal entry of the Gram matrix (the bias entry included) before the
    matrix is inverted.

    Args:
        X: NumPy array of features, one row per observation.
        y: target values, one per row of ``X``.
        r: amount added to the diagonal of the Gram matrix.

    Returns:
        A tuple ``(w0, w)``: the bias and the array of feature weights.

    Raises:
        numpy.linalg.LinAlgError: if the regularized Gram matrix cannot be inverted.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    # normal equation with regularization: w = (X^T X + r I)^-1 X^T y
    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])
    gram_inverse = np.linalg.inv(gram)
    coefficients = gram_inverse.dot(design.T).dot(y)

    bias, weights = coefficients[0], coefficients[1:]
    return bias, weights


In [None]:
# Fit on the training split, this time with the regularized trainer (r = 0.001).
X_train = prepare_X(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=0.001)
# Prepare the validation split the same way, predict and report the validation RMSE.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

Tuning the model

In [None]:
# Try several regularization strengths and compare the validation RMSE of each.
# The feature matrices do not depend on r, so they are built once, outside the loop.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# candidate values in increasing order, so the printed rows are easy to compare
for r in [0.0, 0.0000001, 0.0001, 0.001, 0.1, 1, 10]:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    # one row per candidate: strength, fitted bias, validation RMSE
    print(r, w0, score)

In [None]:
# Re-fit with the selected regularization strength r = 0.001 and report its validation RMSE.
r = 0.001
X_train = prepare_X(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=r)
# Prepare the validation split the same way and score the predictions.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)

print(r, w0, score)

Using the Model

In [None]:
# Combine the training and validation rows into df_full_train; the final model is fitted on it
# before being scored on df_test.
df_full_train = pd.concat([df_train, df_val])
# Renumber the rows from 0, since concat keeps each part's own index labels.
df_full_train = df_full_train.reset_index(drop=True)

In [None]:
# Feature matrix for the combined train + validation rows.
X_full_train = prepare_X(df_full_train)
X_full_train

In [None]:
# Targets in the same order as the rows of df_full_train: training values first, then validation values.
y_full_train = np.concatenate([y_train, y_val])

In [None]:
r = 0.001
# Final model: fit on the combined train + validation data.
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=r)
# Prepare the held-out test split the same way and evaluate on it.
X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)

print(r, w0, score)
# A test score close to the validation score is a good sign that the model generalizes well.

In [None]:
# Now predict the price of a single car: take one test row (position 20) and turn it into a plain dict.
car = dict(df_test.iloc[20])
car

In [None]:
# prepare_X expects a DataFrame, so wrap the dictionary in a one-row frame.
df_small = pd.DataFrame([car])
df_small

In [None]:
# Feature matrix with a single row for that car.
X_small = prepare_X(df_small)

In [None]:
# Predict for the one-row matrix, then take the single element out of the result array.
y_pred = w0 + X_small.dot(w)
y_pred = y_pred[0]

In [None]:
# y_pred is on the log1p scale used for the targets;
# expm1 inverts that transform to give the price itself.
np.expm1(y_pred)