## Logistic Regression [LR]

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SequentialFeatureSelector
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score, max_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# getting the dataset
qsar = pd.read_csv('biodeg.csv', sep=';')
qsar = qsar.drop("Unnamed: 42", axis=1)  # removing the unnamed column if you have, no means, comment it out
#USE THIS COMMAND IF THERE IS AN UNAMED COLUMN
qsar.sample(3)

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01[N-N],F04[C-N],NssssC,nCb-,C%,nCp,nO,...,C-026,F02[C-N],nHDon,SpMax_B(m),Psi_i_A,nN,SM6_B(m),nArCOOR,nX,experimental_class
990,3.618,2.8543,2,0,0,0,0,33.3,1,0,...,0,0,0,3.71,2.744,0,8.384,0,2,NRB
71,5.047,2.7377,0,0,0,0,3,50.0,0,5,...,0,0,1,3.906,3.321,0,8.796,2,0,RB
643,4.77,3.3176,0,0,1,0,2,41.2,1,2,...,1,2,0,3.83,2.933,1,8.497,0,0,NRB


In [3]:
# changing the str label into 1s and 0s
qsar['experimental_class'] = qsar['experimental_class'].astype('category')
encode_map = {
    'RB': 1,
    'NRB': 0
}
qsar['experimental_class'].replace(encode_map, inplace=True)
# 1 reprsents RB and O represents NRB [changed it into a binary classification problem now]

In [4]:
qsar.isnull().values.any()  # checking if there is any missing value in data frame
qsar.isnull().sum().sum()  # sums of missing value in the data frame

0

In [5]:
#setting up the target variable and the feature variable
y = qsar['experimental_class']
x = qsar.drop(columns='experimental_class')

In [6]:
scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)
# print(x_scaler)

In [7]:
### Splitting the dataset before conducting SFS [forward and backward]
seed_num = 80
# creating set for SFS usage, the remaining set is for testing the model
x_sfs, x_not_sfs, y_sfs, y_not_sfs = train_test_split(x_scaler, y, test_size=0.3, random_state=seed_num)

print(x_sfs.shape)
print(x_not_sfs.shape)
print(y_sfs.shape)
print(y_not_sfs.shape)

# will be using x_sfs and y_sfs for SFS training and for training the model itself
# the x_not_sfs and y_not_sfs is not used for the SFS training, it will be used for testing the model

(738, 41)
(317, 41)
(738,)
(317,)


In [8]:
model_logr = LogisticRegression()
fs = 5 #The features that is used for the selection

In [9]:
# feature selection is done here.
# the direction must be set manually by the user
direction = "forward"
print("The feature selection direction:", direction)
print("Feature: ", fs)
sfs = SequentialFeatureSelector(model_logr, direction=direction, n_features_to_select=fs, cv=5)
x_after_sfs = sfs.fit_transform(x_sfs, y_sfs)  # x_after_sfs holds the selected features
selected_feature = x.columns[sfs.get_support(indices=True)]
print(selected_feature)

The feature selection direction: forward
Feature:  5
Index(['nCp', 'nCIR', 'SpMax_A', 'nN', 'nX'], dtype='object')


In [10]:
grid_params = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}
gsc = GridSearchCV(model_logr, grid_params, cv=3)
result = gsc.fit(x_after_sfs, y_sfs)

In [11]:
print("Feature: ", fs)
print("The best parameters : " + str(result.best_params_))
# print("The best index : " + str(gsc.best_index_))
# print(gsc.best_estimator_)
print("Training score : " + str(result.best_score_) + "\n")

Feature:  5
The best parameters : {'C': 10.0, 'penalty': 'l2'}
Training score : 0.8224932249322494



In [12]:
# based on sfs
# updating the test set, 'x_test_' according to 'sfs' selection
# assign 'x_not_sfs' to 'x_test_' --> to be used for testing later
x_test_ = x_not_sfs
x_test_ = sfs.transform(x_test_)
y_train = y_sfs
y_test = y_not_sfs

In [13]:
c_val = input("Enter the C value : ")
penalty = input("Enter the penalty : ")
c_val = float(c_val)

Enter the C value : 10.0
Enter the penalty : l2


In [14]:
##LOGISTIC REGRESSION MODEL
m_lr = LogisticRegression(C=1.0, penalty='l2')
m_lr.fit(x_after_sfs, y_train)

LogisticRegression()

In [15]:
y_train_pred = m_lr.predict(x_after_sfs)  # training set ,"x_after_sfs_" is used for prediction
print("Training set ")
print("Mean absolute error : " + str(mean_absolute_error(y_train, y_train_pred)))
print("Mean squared error : " + str(mean_squared_error(y_train, y_train_pred)))
print("r2 score : " + str(r2_score(y_train, y_train_pred)))
print("The max error value : " + str(max_error(y_train, y_train_pred)) + "\n")
print("accuracy score: ", accuracy_score(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

y_pred = m_lr.predict(x_test_)
print("Testing Set ")
print("Mean absolute error : " + str(mean_absolute_error(y_test, y_pred)))
print("Mean squared error : " + str(mean_squared_error(y_test, y_pred)))
print("r2 score : " + str(r2_score(y_test, y_pred)))
print("The max error value : " + str(max_error(y_test, y_pred)) + "\n")
print("accuracy score: ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training set 
Mean absolute error : 0.17344173441734417
Mean squared error : 0.17344173441734417
r2 score : 0.22418508389385783
The max error value : 1

accuracy score:  0.8265582655826558
[[440  49]
 [ 79 170]]
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       489
           1       0.78      0.68      0.73       249

    accuracy                           0.83       738
   macro avg       0.81      0.79      0.80       738
weighted avg       0.82      0.83      0.82       738

Testing Set 
Mean absolute error : 0.19242902208201892
Mean squared error : 0.19242902208201892
r2 score : 0.13943035157988426
The max error value : 1

accuracy score:  0.807570977917981
[[181  29]
 [ 32  75]]
              precision    recall  f1-score   support

           0       0.85      0.86      0.86       210
           1       0.72      0.70      0.71       107

    accuracy                           0.81       317
   macro avg       0.79      0.78