In [1]:
import pandas as pd 
import sklearn 
from sklearn.linear_model import LinearRegression, Lasso, Ridge 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error 

In [2]:
#https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
#https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html
#x,y = make_regression(n_samples=10000, n_features=50,noise=0.1)

In [3]:
#x = pd.DataFrame(x)

In [4]:
# Load the supershops dataset (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=r'data/supershops.csv')
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [5]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [6]:
# Fill the missing Transport value(s) with the column mean.
# NOTE(review): the mean is computed over the whole frame, i.e. before the
# train/test split that happens later in the notebook.
df['Transport'] = df['Transport'].fillna(df['Transport'].mean())

In [7]:
# Drop the categorical 'Area' column so only numeric columns remain.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it once 'Area' is already gone no longer
# raises KeyError.
df = df.drop(columns='Area', errors='ignore')

In [8]:
# Independent snapshot of the cleaned frame (Transport imputed, Area removed).
df1 = df.copy(deep=True)

In [9]:
# Feature matrix: every column except the target 'Profit'.
x = df.drop(columns='Profit')

In [10]:
# Preview the first five feature rows.
x.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport
0,114523.61,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [11]:
# Target, kept as a one-column DataFrame (2-D) rather than a Series.
y = df.loc[:, ['Profit']]

In [12]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [13]:
# Sanity check: (rows, columns) of the training feature matrix.
xtrain.shape

(37, 3)

In [14]:
# Pairwise Pearson correlation between the features and Profit.
df.corr(method='pearson')

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
Marketing Spend,1.0,0.230437,0.718001,0.937948
Administration,0.230437,1.0,0.009402,0.200717
Transport,0.718001,0.009402,1.0,0.781996
Profit,0.937948,0.200717,0.781996,1.0


# OLS

In [15]:
# Baseline: ordinary least squares, i.e. a polynomial fit of degree 1.
model = LinearRegression()
model.fit(X=xtrain, y=ytrain)

In [16]:
# Predict Profit for the held-out rows with the OLS baseline.
y_pred = model.predict(X=xtest)

In [17]:
# Display the OLS predictions for the test rows.
y_pred

array([[103811.61401638],
       [128090.65172169],
       [128955.46611288],
       [ 73818.5411586 ],
       [176719.46824888],
       [123418.48179269],
       [ 68943.67844084],
       [ 94839.10021303],
       [119498.55653098],
       [164748.07766423],
       [ 97678.19628921],
       [ 88998.80999378],
       [107056.69891647]])

In [18]:
# Mean squared error of the OLS baseline on the test set.
mse = mean_squared_error(y_true=ytest, y_pred=y_pred)
print('MSE:', mse)

MSE: 109132410.44475438


In [19]:
# R^2 of the OLS baseline on the test set.
model.score(X=xtest, y=ytest)

0.8988383259640066

# Polynomial

In [20]:
# Correlation matrix again, as a reference for the polynomial section.
df.corr(method='pearson')

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
Marketing Spend,1.0,0.230437,0.718001,0.937948
Administration,0.230437,1.0,0.009402,0.200717
Transport,0.718001,0.009402,1.0,0.781996
Profit,0.937948,0.200717,0.781996,1.0


In [21]:
from sklearn.preprocessing import PolynomialFeatures

In [22]:
# Raw features before polynomial expansion.
x.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport
0,114523.61,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


# Degree 2

In [23]:
# Expand the three features into all polynomial terms up to degree 2:
# a bias column, the original columns, their squares and pairwise products.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X=x)

In [24]:
# Wrap the expanded array in a DataFrame only to preview its first rows.
pd.DataFrame(data=X_poly).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,114523.61,136897.8,471784.1,13115660000.0,15678030000.0,54030420000.0,18741010000.0,64586210000.0,222580200000.0
1,1.0,162597.7,151377.59,443898.53,26438010000.0,24613650000.0,72176880000.0,22915170000.0,67196290000.0,197045900000.0
2,1.0,153441.51,101145.55,407934.54,23544300000.0,15519930000.0,62594090000.0,10230420000.0,41260760000.0,166410600000.0
3,1.0,144372.41,118671.85,383199.62,20843390000.0,17132940000.0,55323450000.0,14083010000.0,45475010000.0,146841900000.0
4,1.0,142107.34,91391.77,366168.42,20194500000.0,12987440000.0,52035220000.0,8352456000.0,33464780000.0,134079300000.0


In [25]:
# Same 75/25 split and seed as before, now on the degree-2 features.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_poly,
    y,
    test_size=0.25,
    random_state=0,
)

In [26]:
# Sanity check: (rows, columns) of the degree-2 training matrix.
xtrain.shape

(37, 10)

In [27]:
# Linear regression on the degree-2 polynomial features.
model2 = LinearRegression()
model2.fit(X=xtrain, y=ytrain)

In [28]:
# Predict Profit for the held-out rows with the degree-2 model.
y_pred2 = model2.predict(X=xtest)

In [29]:
# Display the degree-2 predictions for the test rows.
y_pred2

array([[106465.31654667],
       [127249.18985909],
       [128150.5690433 ],
       [ 71917.04792707],
       [174255.93031912],
       [132972.6286615 ],
       [ 67004.30999541],
       [104627.66885109],
       [125314.44298303],
       [159931.03070504],
       [100017.67874908],
       [ 91337.6804562 ],
       [113069.60333231]])

In [30]:
# Mean squared error of the degree-2 model on the test set
# (reuses, and therefore overwrites, the `mse` name from the OLS section).
mse = mean_squared_error(y_true=ytest, y_pred=y_pred2)
print('MSE:', mse)

MSE: 179095765.64467436


In [31]:
# R^2 of the degree-2 model on the test set.
model2.score(X=xtest, y=ytest)

0.8339849052033463

In [32]:
# Preview the snapshot of the cleaned frame.
df1.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


# Degree 3

In [33]:
# Expand the features into all polynomial terms up to degree 3.
poly = PolynomialFeatures(degree=3) # polynomial regression with degree 3
X_poly_deg3 = poly.fit_transform(x)

In [34]:
# Sanity check: (rows, columns) of the degree-3 feature matrix.
X_poly_deg3.shape

(50, 20)

In [35]:
# Same 75/25 split and seed as before, now on the degree-3 features.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_poly_deg3,
    y,
    test_size=0.25,
    random_state=0,
)

In [36]:
# Linear regression on the degree-3 polynomial features.
# The name `model2` is reused, so the degree-2 model is replaced from here on.
model2 = LinearRegression()
model2.fit(X=xtrain, y=ytrain)

In [37]:
# R^2 of the degree-3 model on the held-out (testing) rows.
model2.score(X=xtest, y=ytest)

0.23337017977529306

In [38]:
# R^2 of the degree-3 model on the rows it was trained on.
model2.score(X=xtrain, y=ytrain)

0.9722436401014305

# This is an overfit model: R² is about 0.97 on the training set but only about 0.23 on the test set

# Let's talk about regularization

In [39]:
# Rebuild the plain (non-polynomial) feature matrix and target for the
# regularization experiments.
x = df.drop(columns='Profit')
y = df.loc[:, ['Profit']]

In [40]:
# Same 75/25 split and seed as before, back on the raw features.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [41]:
# Preview the training features (row labels show the shuffled original index).
xtrain.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport
34,46426.07,157693.92,210797.67
18,91749.16,114175.79,294919.57
7,130298.13,145530.06,323876.68
14,119943.24,156547.42,256512.92
45,1000.23,124153.04,1903.93


# L1 Regularization with LR - Lasso

In [42]:
# L1-regularized linear regression; alpha is the penalty strength to tune.
# NOTE(review): the features are fed in unscaled (values up to ~1e5), and the
# test MSE printed below matches plain OLS to about nine significant digits,
# so alpha=0.1 has essentially no effect here. Consider standardizing the
# features and searching over alpha.
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X=xtrain, y=ytrain)

# L2 Regularization with LR - Ridge

In [43]:
# L2-regularized linear regression; alpha is the penalty strength to tune.
# NOTE(review): the features are fed in unscaled (values up to ~1e5), and the
# test MSE printed below matches plain OLS to about eleven significant digits,
# so alpha=0.1 has essentially no effect here. Consider standardizing the
# features and searching over alpha.
ridge_model = Ridge(alpha=0.1)
ridge_model.fit(X=xtrain, y=ytrain)

# Performance

In [44]:
# Score both regularized models on the held-out rows: predict first,
# then compute each mean squared error, then report.
lasso_predictions = lasso_model.predict(X=xtest)
ridge_predictions = ridge_model.predict(X=xtest)

lasso_mse = mean_squared_error(y_true=ytest, y_pred=lasso_predictions)
ridge_mse = mean_squared_error(y_true=ytest, y_pred=ridge_predictions)

print('Lasso MSE:', lasso_mse)
print('Ridge MSE:', ridge_mse)

Lasso MSE: 109132410.3705798
Ridge MSE: 109132410.44509476


In [45]:
# R^2 of the Lasso model on the test set.
lasso_model.score(X=xtest, y=ytest)

0.8988383260327637

In [46]:
# R^2 of the Ridge model on the test set.
ridge_model.score(X=xtest, y=ytest)

0.8988383259636911