In [1]:
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Optional, Google Colab only: uncomment the two lines below to mount Google Drive.
# NOTE(review): presumably only needed when 'supershops.csv' is stored on Drive — confirm
# and adjust the read_csv path accordingly.
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
# Read the supershop dataset from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='supershops.csv')
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [4]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [5]:
# Feature matrix: every column except the target ('Profit') and the string column 'Area'.
x = df.drop(columns=['Profit', 'Area'])

In [6]:
# Quick look at the feature columns that remain.
x.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport
0,114523.61,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [7]:
# Target kept as a single-column DataFrame (2-D), so the models below return 2-D predictions.
y = df.loc[:, ['Profit']]

In [8]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Sanity check: dimensions of the training feature matrix after the split.
xtrain.shape

In [None]:
# Pairwise correlations between the numeric columns only.
# `df` still contains the string column 'Area'; pandas >= 2.0 no longer drops
# non-numeric columns silently in corr() and raises instead, so restrict the
# frame to numeric dtypes first (this also works on older pandas).
df.select_dtypes(include='number').corr()

# OLS

In [10]:
# Ordinary least squares baseline (equivalent to a polynomial regression of degree 1).
# fit() returns the estimator itself, so the trailing `model` displays the same repr.
model = LinearRegression().fit(xtrain, ytrain)
model

LinearRegression()

In [11]:
# Predict profit for the held-out rows with the OLS baseline.
y_pred = model.predict(X=xtest)

In [12]:
# Display the raw OLS predictions (one row per held-out sample).
y_pred

array([[ 90707.18524202],
       [166377.24276987],
       [124018.59727829],
       [ 93252.51801345],
       [ 97588.41924098],
       [ 68948.39245538],
       [ 89037.14295216],
       [ 73472.98068535],
       [159657.23912108],
       [129149.84623501],
       [128674.69774365],
       [ 88409.42998689],
       [ 96436.18820079]])

In [13]:
# Mean squared error of the OLS baseline on the held-out rows.
mse = mean_squared_error(y_true=ytest, y_pred=y_pred)
print('MSE:', mse)

MSE: 101360809.28512499


In [14]:
# Coefficient of determination (R^2) of the OLS baseline on the held-out rows.
model.score(X=xtest, y=ytest)

0.8744319145336102

# Polynomial

In [16]:
# Pairwise correlations between the numeric columns only.
# `df` still contains the string column 'Area'; pandas >= 2.0 no longer drops
# non-numeric columns silently in corr() and raises instead, so restrict the
# frame to numeric dtypes first (this also works on older pandas).
df.select_dtypes(include='number').corr()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
Marketing Spend,1.0,0.227141,0.718574,0.937853
Administration,0.227141,1.0,0.009534,0.197201
Transport,0.718574,0.009534,1.0,0.782578
Profit,0.937853,0.197201,0.782578,1.0


In [17]:
from sklearn.preprocessing import PolynomialFeatures

In [18]:
# Reminder of the raw feature columns that will be expanded into polynomial terms.
x.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport
0,114523.61,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


# Degree 2

In [None]:
# Expand the features into all polynomial terms up to degree 2
# (fit and transform done as two explicit steps on the same data).
poly = PolynomialFeatures(degree=2)
poly.fit(x)
X_poly = poly.transform(x)

In [None]:
# Wrap the expanded array in a DataFrame just to preview the first five rows.
pd.DataFrame(data=X_poly).head(n=5)

In [None]:
# Re-split using the degree-2 features; same test fraction and seed as the linear split.
# Note: this rebinds xtrain/xtest/ytrain/ytest used by the earlier cells.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_poly,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Dimensions of the training matrix after the degree-2 expansion.
xtrain.shape

In [None]:
# Linear regression on the degree-2 polynomial features.
# fit() returns the estimator itself, so the trailing `model2` displays the same repr.
model2 = LinearRegression().fit(xtrain, ytrain)
model2

In [None]:
# Predict profit for the held-out rows with the degree-2 model.
y_pred2 = model2.predict(X=xtest)

In [None]:
# Display the raw degree-2 predictions.
y_pred2

In [None]:
# Mean squared error of the degree-2 model on the held-out rows (rebinds `mse`).
mse = mean_squared_error(y_true=ytest, y_pred=y_pred2)
print('MSE:', mse)

In [None]:
# Testing score: R^2 of the degree-2 model on the held-out rows.
model2.score(X=xtest, y=ytest)

In [None]:
# Training score: R^2 of the degree-2 model on the rows it was fitted on.
model2.score(X=xtrain, y=ytrain)

# Degree 3

In [None]:
# Expand the features into all polynomial terms up to degree 3.
# This rebinds `poly`, replacing the degree-2 transformer created earlier.
poly = PolynomialFeatures(degree=3) # polynomial regression with degree 3
X_poly_deg3 = poly.fit_transform(x)

In [None]:
# Dimensions of the degree-3 feature matrix (rows, generated polynomial columns).
X_poly_deg3.shape

In [None]:
# Re-split using the degree-3 features; same test fraction and seed as before.
# Note: this rebinds xtrain/xtest/ytrain/ytest once more.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_poly_deg3,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Linear regression on the degree-3 polynomial features.
# `model2` is rebound here, so the degree-2 estimator is no longer reachable by that name.
model2 = LinearRegression().fit(xtrain, ytrain)
model2

In [4]:
# Testing score: R^2 of the degree-3 model on the held-out rows.
# NOTE(review): the saved NameError output looks like it came from running this cell on a
# fresh kernel before the fitting cell — re-run the notebook top to bottom to refresh it.
model2.score(X=xtest, y=ytest)

NameError: name 'model2' is not defined

In [None]:
# Training score: R^2 of the degree-3 model on the rows it was fitted on.
model2.score(X=xtrain, y=ytrain)

# Let's talk about Regularization

In [5]:
# Rebuild the plain (non-polynomial) features and the target for the regularization section.
# NOTE(review): the saved NameError output looks like it came from running this cell before
# `df` was loaded — re-run the notebook top to bottom to refresh it.
x = df.drop(columns=['Profit', 'Area'])
y = df.loc[:, ['Profit']]

NameError: name 'df' is not defined

In [None]:
# Split the plain features again, since the polynomial sections rebound the split variables.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Confirm the training features are the original columns again.
xtrain.head(n=5)

# L1 Regularization with LR - Lasso

In [None]:
# L1-regularized linear regression; alpha sets the penalty strength and is meant to be tuned.
# fit() returns the estimator itself, so the trailing name displays the same repr.
lasso_model = Lasso(alpha=0.1).fit(xtrain, ytrain)
lasso_model

# L2 Regularization with LR - Ridge

In [None]:
# L2-regularized linear regression; alpha sets the penalty strength and is meant to be tuned.
# fit() returns the estimator itself, so the trailing name displays the same repr.
ridge_model = Ridge(alpha=0.1).fit(xtrain, ytrain)
ridge_model

# Performance

In [None]:
# Predict with both regularized models on the same held-out rows.
lasso_predictions = lasso_model.predict(X=xtest)
ridge_predictions = ridge_model.predict(X=xtest)

# Mean squared error of each model against the true profits.
lasso_mse = mean_squared_error(y_true=ytest, y_pred=lasso_predictions)
ridge_mse = mean_squared_error(y_true=ytest, y_pred=ridge_predictions)

print('Lasso MSE:', lasso_mse)
print('Ridge MSE:', ridge_mse)

In [None]:
# R^2 of the Lasso model on the held-out rows.
lasso_model.score(X=xtest, y=ytest)

In [None]:
# R^2 of the Ridge model on the held-out rows.
ridge_model.score(X=xtest, y=ytest)