### Problem Statement : Build a regression model which will try to predict unemployment within economy.

#### Data Source: http://research.stlouisfed.org/fred2

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

  return f(*args, **kwds)
  return f(*args, **kwds)


### Import Data

In [2]:
df = pd.read_csv('dataset/economics.csv')

In [3]:
df.columns

Index(['date', 'pce', 'pop', 'psavert', 'uempmed', 'unemploy'], dtype='object')

* date : date


* pce - personal consumption expenditure (billions of dollars)


* pop - total population (thousands)


* psavert - personal savings rate


* uempmed - median duration of unemployment in weeks


* unemploy - number of unemployed in thousands <<< Target variable

In [4]:
df.head()

Unnamed: 0,date,pce,pop,psavert,uempmed,unemploy
0,1967-07-01,507.4,198712,12.5,4.5,2944
1,1967-08-01,510.5,198911,12.5,4.7,2945
2,1967-09-01,516.3,199113,11.7,4.6,2958
3,1967-10-01,512.9,199311,12.5,4.9,3143
4,1967-11-01,518.1,199498,12.5,4.7,3066


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 574 entries, 0 to 573
Data columns (total 6 columns):
date        574 non-null object
pce         574 non-null float64
pop         574 non-null int64
psavert     574 non-null float64
uempmed     574 non-null float64
unemploy    574 non-null int64
dtypes: float64(3), int64(2), object(1)
memory usage: 27.0+ KB


In [6]:
df.describe()

Unnamed: 0,pce,pop,psavert,uempmed,unemploy
count,574.0,574.0,574.0,574.0,574.0
mean,4843.510453,257189.381533,7.936585,8.610105,7771.557491
std,3579.287206,36730.801593,3.124394,4.108112,2641.960571
min,507.4,198712.0,1.9,4.0,2685.0
25%,1582.225,224896.0,5.5,6.0,6284.0
50%,3953.55,253060.0,7.7,7.5,7494.0
75%,7667.325,290290.75,10.5,9.1,8691.0
max,12161.5,320887.0,17.0,25.2,15352.0


In [9]:
target_col = ['unemploy']

predictors = ['pce', 'pop', 'psavert', 'uempmed']

df[predictors] = df[predictors]/df[predictors].max()

df.describe()

Unnamed: 0,pce,pop,psavert,uempmed,unemploy
count,574.0,574.0,574.0,574.0,574.0
mean,0.398266,0.801495,0.466858,0.341671,7771.557491
std,0.294313,0.114466,0.183788,0.16302,2641.960571
min,0.041722,0.619258,0.111765,0.15873,2685.0
25%,0.130101,0.700857,0.323529,0.238095,6284.0
50%,0.325087,0.788627,0.452941,0.297619,7494.0
75%,0.630459,0.904651,0.617647,0.361111,8691.0
max,1.0,1.0,1.0,1.0,15352.0


#### Create training and test dataset

In [10]:
X = df[predictors].values

y = df[target_col].values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)


print(f'X_train : {X_train.shape}\n\nX_test : {X_test.shape}\n\ny_train : {y_train.shape}\n\ny_test : {y_test.shape}')

X_train : (401, 4)

X_test : (173, 4)

y_train : (401, 1)

y_test : (173, 1)


In [13]:
type(X)

numpy.ndarray

### Let's build models to compare impact of regularization

#### Linear Regression

In [14]:
lm = LinearRegression()
lm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
pred_train_lr = lm.predict(X_train)

print("Training Accuracy : ")

print(f'RMSE: {np.sqrt(mean_squared_error(pred_train_lr,y_train))}')

print(f'R-Squared : {r2_score(pred_train_lr,y_train)}')

pred_test_lr = lm.predict(X_test)

print("\n\nTesting Accuracy : ")

print(f'RMSE : {np.sqrt(mean_squared_error(pred_test_lr,y_test))}')

print(f'R-Squared : {r2_score(pred_test_lr,y_test)}')

Training Accuracy : 
RMSE: 968.0761455803668
R-Squared : 0.8200876111747164


Testing Accuracy : 
RMSE : 1025.237039194264
R-Squared : 0.8647441266422433


In [23]:
print(f'Linear Model Coefficients : {lm.coef_}')

Linear Model Coefficients : [[-18954.43215899  54899.24860794   4330.46153699  14071.10769273]]


In [18]:
df.describe()

Unnamed: 0,pce,pop,psavert,uempmed,unemploy
count,574.0,574.0,574.0,574.0,574.0
mean,0.398266,0.801495,0.466858,0.341671,7771.557491
std,0.294313,0.114466,0.183788,0.16302,2641.960571
min,0.041722,0.619258,0.111765,0.15873,2685.0
25%,0.130101,0.700857,0.323529,0.238095,6284.0
50%,0.325087,0.788627,0.452941,0.297619,7494.0
75%,0.630459,0.904651,0.617647,0.361111,8691.0
max,1.0,1.0,1.0,1.0,15352.0


In [19]:
df['unemploy'].mean()

7771.557491289199

### Regularization

In Linear regression we try to find the best coefficients which minimize the loss function.

however, if coeff are too large then it leads to overfitting.

**Solution : Penalize large coefficients**

1) Ridge


2) Lasso


3) Elasticnet

### Ridge


In linear regression, Loss function = RMSE, OLS

**Loss Function = OLS + alpha * (squared sum of coefficient values)**

* Alpha - Penalty parameter - you'll have to select it


* Low val of alpha >>> Overfitting


* High val of alpha >>> Underfitting

In [20]:
rr = Ridge(alpha=0.1)

rr.fit(X_train, y_train)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [21]:
pred_train_lr = rr.predict(X_train)

print("Training Accuracy : ")

print(f'RMSE: {np.sqrt(mean_squared_error(pred_train_lr,y_train))}')

print(f'R-Squared : {r2_score(pred_train_lr,y_train)}')

pred_test_lr = rr.predict(X_test)

print("\n\nTesting Accuracy : ")

print(f'RMSE : {np.sqrt(mean_squared_error(pred_test_lr,y_test))}')

print(f'R-Squared : {r2_score(pred_test_lr,y_test)}')

Training Accuracy : 
RMSE: 1070.4626292601777
R-Squared : 0.7483000286050683


Testing Accuracy : 
RMSE : 1135.119045893667
R-Squared : 0.8174472445632814


In [25]:
# Linear Model Coefficients : 
# [[-18954.43215899  54899.24860794   4330.46153699  14071.10769273]]
# Ridge Model Coefficients :
# [[-9011.68158244 24729.64577439  1289.99885498 14256.58207005]]]

print(f'Ridge Model Coefficients : {rr.coef_}')

Ridge Model Coefficients : [[-9011.68158244 24729.64577439  1289.99885498 14256.58207005]]


### Lasso : Least absolute shrinkage and selection operator

**Loss Function = OLS + alpha * (summation of absolute values of magnitude of coefficients)**

In [35]:
lr  = Lasso(alpha=0.01)
lr.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [36]:
pred_train_lr = lr.predict(X_train)

print("Training Accuracy : ")

print(f'RMSE: {np.sqrt(mean_squared_error(pred_train_lr,y_train))}')

print(f'R-Squared : {r2_score(pred_train_lr,y_train)}')

pred_test_lr = lr.predict(X_test)

print("\n\nTesting Accuracy : ")

print(f'RMSE : {np.sqrt(mean_squared_error(pred_test_lr,y_test))}')

print(f'R-Squared : {r2_score(pred_test_lr,y_test)}')

Training Accuracy : 
RMSE: 968.0766125480882
R-Squared : 0.8200232407851629


Testing Accuracy : 
RMSE : 1025.2800928722208
R-Squared : 0.8646988158959943


In [37]:
# Linear Model Coefficients : 
# [[-18954.43215899  54899.24860794   4330.46153699  14071.10769273]]

# Ridge Model Coefficients :
# [[-9011.68158244 24729.64577439  1289.99885498 14256.58207005]]]

# Lasso Model Coefficients : 
# [-18750.56797485  54273.81719396   4261.3743402   14074.18054241]

print(f'Lasso Model Coefficients : {lr.coef_}')

Lasso Model Coefficients : [-18934.0097197   54836.61414778   4323.55004444  14071.41080403]


### Elasticnet - Combination of both Lasso and Ridge

In [38]:
er = ElasticNet(alpha=0.01)
er.fit(X_train,y_train)

ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [39]:
pred_train_lr = er.predict(X_train)

print("Training Accuracy : ")

print(f'RMSE: {np.sqrt(mean_squared_error(pred_train_lr,y_train))}')

print(f'R-Squared : {r2_score(pred_train_lr,y_train)}')

pred_test_lr = er.predict(X_test)

print("\n\nTesting Accuracy : ")

print(f'RMSE : {np.sqrt(mean_squared_error(pred_test_lr,y_test))}')

print(f'R-Squared : {r2_score(pred_test_lr,y_test)}')

Training Accuracy : 
RMSE: 1341.57189779199
R-Squared : 0.4555544973361728


Testing Accuracy : 
RMSE : 1464.1035435648603
R-Squared : 0.5629771795350077


In [40]:
# Linear Model Coefficients : 
# [[-18954.43215899  54899.24860794   4330.46153699  14071.10769273]]

# Ridge Model Coefficients :
# [[-9011.68158244 24729.64577439  1289.99885498 14256.58207005]]]

# Lasso Model Coefficients : 
# [-18750.56797485  54273.81719396   4261.3743402   14074.18054241]


# Elasticnet Model Coefficients : 
# [ 991.1097095  2804.06079157 1174.10663612 9686.27859074]


print(f'Elasticnet Model Coefficients : {er.coef_}')

Elasticnet Model Coefficients : [ 991.1097095  2804.06079157 1174.10663612 9686.27859074]


### Selection of Coefficients using cross validation

In [41]:
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV

In [42]:
rrcv = RidgeCV(alphas=[0.0001,0.001,0.01,0.1,1])
rrcv.fit(X_train,y_train)

RidgeCV(alphas=array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00]), cv=None,
        fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
        store_cv_values=False)

In [43]:
pred_train_lr = rrcv.predict(X_train)

print("Training Accuracy : ")

print(f'RMSE: {np.sqrt(mean_squared_error(pred_train_lr,y_train))}')

print(f'R-Squared : {r2_score(pred_train_lr,y_train)}')

pred_test_lr = rrcv.predict(X_test)

print("\n\nTesting Accuracy : ")

print(f'RMSE : {np.sqrt(mean_squared_error(pred_test_lr,y_test))}')

print(f'R-Squared : {r2_score(pred_test_lr,y_test)}')

Training Accuracy : 
RMSE: 968.0766757718355
R-Squared : 0.8200256582455017


Testing Accuracy : 
RMSE : 1025.2847617742148
R-Squared : 0.8647013666508987


In [44]:
print(f'Ridge Cross Validation Model Coefficients : {rrcv.coef_}')

Ridge Cross Validation Model Coefficients : [[-18932.79922413  54831.93312551   4323.1503764   14072.23225811]]


In [45]:
print(f'Best Value of alpha : {rrcv.alpha_}')

Best Value of alpha : 0.0001


# Great Job !