# OLS (Ordinary Least Squares)
- OLS is a regression algorithm that relies on core linear algebra principles. 
- The goal of the algoirthm is to find a linear relationship between features and target variables.
- The algorithm does this by minimizing the sum of squares of the differences between observed and predicted target values.

## OLS is best used when
- The relationship between variables is believed to be linear.
- There is few outliers in the dataset. 

## Formula is 
$ y = X\beta + \epsilon $
- Beta is the vector of coefficients estimated from data(we will calculate this below)
- epsilon is the residuals(error) of results.



In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Data
- The dataset used is a dataset from Kaggle on laptop prices scraped from amazon
- For this dive in to attempt to seek the most linear relationship, we will just be looking at Apple laptops so we need to do some preprocessing.

In [2]:
df = pd.read_csv('/Users/nicholasmcgourthy/Downloads/amazon_laptop_prices_v01.csv')

In [3]:
df.head()

Unnamed: 0,brand,model,screen_size,color,harddisk,cpu,ram,OS,special_features,graphics,graphics_coprocessor,cpu_speed,rating,price
0,ROKC,,14 Inches,Blue,1000 GB,Intel Core i7,8 GB,Windows 11,,Integrated,Intel,1.2 GHz,,$589.99
1,HP,,15.6 Inches,Silver,1000 GB,Intel Core i5,64 GB,Windows 11 Pro,Backlit Keyboard,Integrated,Intel,,4.5,$999.99
2,MSI,Vector GP66 12UGS-267,15.66 Inches,Core Black,,Intel Core i9,32 GB,Windows 11 Home,,Dedicated,,1.8 GHz,5.0,"$1,599.00"
3,Apple,MacBook Air,13.3 Inches,Silver,256 GB,Unknown,8 GB,Mac OS,Backlit Keyboard,Integrated,,,4.8,$689.99
4,Apple,MacBook Air,15.3 Inches,Midnight,256 GB,Unknown,8 GB,Mac OS,,Integrated,,,4.8,"$1,144.48"


In [4]:
# New dataframe with just apple brands
apple_laptop = df[df['brand'].str.lower() == 'apple']

In [5]:
# Set features
X = ['model', 'screen_size', 'harddisk', 'ram']

In [6]:
X = apple_laptop[X]
y = apple_laptop['price']

In [7]:
# get rows in X that have NAN variables 
drop_idx = X[X.isnull().any(axis=1)].index
print(drop_idx)

Index([264, 350, 373, 383, 397, 511, 995, 1010, 1017, 1727, 2450, 2796, 3140], dtype='int64')


In [8]:
# drop rows in X and y
X.drop(index = drop_idx, inplace = True)
y.drop(index = drop_idx, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(index = drop_idx, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.drop(index = drop_idx, inplace = True)


In [12]:
X.head(16)

Unnamed: 0,model,screen_size,harddisk,ram
3,MacBook Air,13.3 Inches,256 GB,8 GB
4,MacBook Air,15.3 Inches,256 GB,8 GB
6,MacBook Pro,13.3 Inches,256 GB,8 GB
84,MacBook Pro,16.2 Inches,512 GB,32 GB
86,"2022 Apple MacBook Air M2, 16GB RAM, 512GB Sto...",13.6 Inches,512 GB,16 GB
95,MacBook Pro,13 Inches,256 GB,16 GB
111,"2022 Apple MacBook Air M2, 16GB RAM, 256GB Sto...",13.6 Inches,256 GB,16 GB
150,MacBook Pro,16 Inches,1 TB,32 GB
345,MacBook Air,13.3 Inches,256 GB,8 GB
624,MacBook Air M2,13.6 Inches,512 GB,16 GB


In [14]:
# Lets convert the features to numerical values and encode model.
X['screen_size'] = X['screen_size'].str.replace(' Inches', '').astype(float)
X['harddisk'] = X['harddisk'].str.replace(' GB', '').str.replace(' TB', '000').astype(int)
X['ram'] = X['ram'].str.replace(' GB', '').astype(int)

# For some reason two rows have the name fully spelled out with ram and storage so lets normalize this 
condition = X['model'].str.contains('2022 Apple MacBook Air M2')
X.loc[condition, 'model'] = 'MacBook Air M2'


AttributeError: Can only use .str accessor with string values!

In [18]:
# verify changes
X.head(16)

Unnamed: 0,model,screen_size,harddisk,ram
3,MacBook Air,13.3,256,8
4,MacBook Air,15.3,256,8
6,MacBook Pro,13.3,256,8
84,MacBook Pro,16.2,512,32
86,MacBook Air M2,13.6,512,16
95,MacBook Pro,13.0,256,16
111,MacBook Air M2,13.6,256,16
150,MacBook Pro,16.0,1000,32
345,MacBook Air,13.3,256,8
624,MacBook Air M2,13.6,512,16


$ \beta = (X^T*X)^{-1}*X^T*Y $

In [26]:
# Lets encode model to something our regression model can use
X['Model_encoded'] = pd.factorize(X['model'])[0]
# Drop the categorical value
X.drop('model', axis = 1, inplace = True)
# verify changes
X.head(16)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Model_encoded'] = pd.factorize(X['model'])[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop('model', axis = 1, inplace = True)


Unnamed: 0,screen_size,harddisk,ram,Model_encoded
3,13.3,256,8,0
4,15.3,256,8,0
6,13.3,256,8,1
84,16.2,512,32,1
86,13.6,512,16,2
95,13.0,256,16,1
111,13.6,256,16,2
150,16.0,1000,32,1
345,13.3,256,8,0
624,13.6,512,16,2


In [27]:
# Lets convert X to a numpy array, we need to set a column to 1 to estimate the intercept term
X_np = X.values
ones = np.ones(X_np.shape[0]).reshape(-1,1)
X_np = np.hstack([ones, X_np])
print(X_np)

[[   1.    13.3  256.     8.     0. ]
 [   1.    15.3  256.     8.     0. ]
 [   1.    13.3  256.     8.     1. ]
 [   1.    16.2  512.    32.     1. ]
 [   1.    13.6  512.    16.     2. ]
 [   1.    13.   256.    16.     1. ]
 [   1.    13.6  256.    16.     2. ]
 [   1.    16.  1000.    32.     1. ]
 [   1.    13.3  256.     8.     0. ]
 [   1.    13.6  512.    16.     2. ]
 [   1.    16.  1000.    16.     1. ]
 [   1.    13.   512.    16.     1. ]
 [   1.    12.   512.     8.     3. ]
 [   1.    13.3  128.     8.     4. ]
 [   1.    14.  1000.    32.     1. ]
 [   1.    13.   128.     8.     0. ]]


In [37]:
# Before we jump ahead need to remove the $ and , from the y.

y = y.str.replace('$', '').str.replace(',','').astype(float)

AttributeError: Can only use .str accessor with string values!

In [39]:
# Now that we have our numpy array lets do some math !
X_np = np.array(X_np,dtype = np.float64)
Y_np = np.array(y, dtype = np.float64)

X_transpose = X_np.T
beta = np.linalg.inv(X_transpose.dot(X_np)).dot(X_transpose).dot(Y_np)

In [47]:
custom_pred = X_np.dot(beta)

In [48]:
print(custom_pred)
print(y)

[1060.88068147  947.07164647  959.37562653 2308.21948978 1434.70362739
 1436.42825826 1300.78049282 2574.89136856 1060.88068147 1434.70362739
 1654.9288156  1570.35139283  964.26452398  587.89889443 2688.70040356
 1010.99046944]
3        689.99
4       1144.48
6        965.08
84      1899.00
86      1459.94
95      1457.99
111     1255.94
150     3059.00
345     1068.99
624     1588.42
774     1419.00
930     1657.99
1922     779.95
2112     689.95
2153    2611.35
3153    1248.00
Name: price, dtype: float64


In [49]:
# Compare against scikit-learn
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X,y)
sklearn_pred = model.predict(X)
print(sklearn_pred)

[1060.88068147  947.07164647  959.37562653 2308.21948978 1434.70362739
 1436.42825826 1300.78049282 2574.89136856 1060.88068147 1434.70362739
 1654.9288156  1570.35139283  964.26452398  587.89889443 2688.70040356
 1010.99046944]


In [51]:
# The results where identical as expected, lets do some evaluation calculations

y_pred = np.array(custom_pred)
y_true = np.array(y)

mae = np.mean(np.abs(y_true - y_pred))
mse = np.mean((y_true - y_pred) ** 2)
rmse = np.sqrt(mse)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)


MAE: 165.3180509021369
MSE: 48442.31228386455
RMSE: 220.09614327348982
