# Linear Regression

In [1]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
import pandas as pd
from joblib import dump

In [2]:
import sys
from pathlib import Path

# Make the project's `notebooks` package importable without a machine-specific
# absolute path. The root is resolved with the same '../../..' convention the
# data paths in this notebook use.
# NOTE(review): assumes the notebook runs from a directory three levels below
# the project root — confirm against the repository layout.
PROJECT_ROOT = str(Path('../../..').resolve())
if PROJECT_ROOT not in sys.path:  # keep the cell idempotent when re-run
    sys.path.append(PROJECT_ROOT)

In [3]:
from notebooks.utils.regression_metrics import regression

Data loading:

In [4]:
# Both partitions share the same file name and live in sibling folders.
input_dir = '../../../data/model_input'
dataset_file = 'superconductivity.parquet'

train = pd.read_parquet(f'{input_dir}/train_sets/{dataset_file}')
# Note: the frame named `test` is read from the `validation_sets` folder.
test = pd.read_parquet(f'{input_dir}/validation_sets/{dataset_file}')

In [5]:
# Separate the predictors from the target column (`critical_temp`).
X_train = train.drop(columns='critical_temp')
y_train = train['critical_temp']

In [6]:
# Same predictor/target split for the evaluation frame.
X_test = test.drop(columns='critical_temp')
y_test = test['critical_temp']

We are going to fit four types of models: Ordinary Least Squares Linear Regression, Ridge ($l_2$ penalty term), Lasso ($l_1$ penalty term) and ElasticNet (convex combination of the Ridge and Lasso penalties).

Any parameter changed from its default value is set only to avoid warnings.

In [7]:
# Ordinary least squares baseline; the default setting is spelled out for readers.
lr = LinearRegression(fit_intercept=True)

In [8]:
# Ridge regression (l2 penalty) at the default regularisation strength.
lr_l2 = Ridge(alpha=1.0)

In [9]:
# Lasso (l1 penalty) at the default regularisation strength; tolerance and
# iteration cap are changed only to avoid warnings.
lr_l1 = Lasso(
    alpha=1.0,
    max_iter=10_000,
    tol=1e-3,
)

In [15]:
# ElasticNet models that differ only in the l1/l2 mixing ratio.
# `selection='random'` updates a randomly chosen coefficient each iteration,
# so a fixed `random_state` is required for the fits (and the metrics table
# below) to be reproducible across runs.
RANDOM_STATE = 42
en_shared_params = dict(
    max_iter=10000,
    tol=0.01,
    selection='random',
    random_state=RANDOM_STATE,
)

lr_en1 = ElasticNet(l1_ratio=0.25, **en_shared_params)
lr_en2 = ElasticNet(l1_ratio=0.5, **en_shared_params)
lr_en3 = ElasticNet(l1_ratio=0.75, **en_shared_params)

In [11]:
# Keep each display name next to its estimator so the two lists cannot drift
# out of sync; dict insertion order fixes the evaluation order.
named_models = {
    'LR': lr,
    'LR_l2': lr_l2,
    'LR_l1': lr_l1,
    'LR_en_0.25': lr_en1,
    'LR_en_0.5': lr_en2,
    'LR_en_0.75': lr_en3,
}
names_list = list(named_models.keys())
models_list = list(named_models.values())

In [16]:
# Evaluate every model with the project helper and display the result.
# NOTE(review): `regression` is defined in notebooks.utils.regression_metrics;
# it presumably fits each model and writes the metrics to the given parquet
# path — confirm against that module.
metrics_path = '../../../data/metrics/superconductivity/linear_regression.parquet'
metrics = regression(
    models_list,
    names_list,
    metrics_path,
    X_train,
    y_train,
    X_test,
    y_test,
)
metrics

Unnamed: 0,Run_Time,Train_MSE,Test_MSE,delta%
LR,0.38606,266.514128,2776.514944,941.789028
LR_l2,0.081782,266.681377,2753.768157,932.606098
LR_l1,73.983241,313.331631,319.64596,2.015222
LR_en_0.25,98.001067,315.351354,321.826126,2.053193
LR_en_0.5,67.558669,316.634874,323.268556,2.095057
LR_en_0.75,48.429512,316.144793,322.662436,2.061601


The ordinary and Ridge models are severely overfitted: their test MSE is roughly ten times their train MSE (a delta of more than 900%). Among the remaining four models we choose **LR_l1** (Lasso) as the best one because it has the lowest test MSE.

In [17]:
# Persist the chosen model (Lasso) next to the notebook; `dump` returns the
# list of files it wrote, which is shown as the cell output.
best_model_file = 'linear_regression_best.joblib'
dump(lr_l1, best_model_file)

['linear_regression_best.joblib']