## Perceptron Model

In [1]:
import os
import sys
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, max_error
from sklearn.feature_selection import SequentialFeatureSelector
import warnings
from sklearn.linear_model import SGDClassifier
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"  # Also affect subprocesses
warnings.filterwarnings('ignore')

In [2]:
# getting the dataset
qsar = pd.read_csv('biodeg.csv', sep=';')
qsar = qsar.drop("Unnamed: 42", axis=1)  # #USE THIS COMMAND IF THERE IS AN UNAMED COLUMN
qsar.sample(3)

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01[N-N],F04[C-N],NssssC,nCb-,C%,nCp,nO,...,C-026,F02[C-N],nHDon,SpMax_B(m),Psi_i_A,nN,SM6_B(m),nArCOOR,nX,experimental_class
80,5.814,2.4937,0,0,0,2,0,37.0,3,1,...,0,0,0,3.756,2.091,0,8.196,0,0,RB
823,4.807,2.7734,2,0,0,0,0,26.3,0,0,...,0,0,0,4.094,2.019,2,8.76,0,0,NRB
956,5.026,2.9255,0,0,0,0,3,50.0,0,1,...,1,0,0,3.88,2.042,0,8.525,0,0,NRB


In [3]:
# changing the str label into 1s and 0s
qsar['experimental_class'] = qsar['experimental_class'].astype('category')
encode_map = {
    'RB': 1,
    'NRB': 0
}
qsar['experimental_class'].replace(encode_map, inplace=True)
# 1 reprsents RB and O represents NRB [changed it into a binary classification problem now]

In [4]:
qsar.isnull().values.any()  # checking if there is any missing value in data frame
qsar.isnull().sum().sum()  # sums of missing value in the data frame

0

In [5]:
# setting up the target variable and the feature variable
y = qsar['experimental_class']
x = qsar.drop(columns='experimental_class')

In [6]:
scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)
# print(x_scaler)

In [7]:
### Splitting the dataset before conducting SFS [forward and backward]
seed_num = 80
# creating set for SFS usage, the remaining set is for testing the model
x_sfs, x_not_sfs, y_sfs, y_not_sfs = train_test_split(x_scaler, y, test_size=0.3, random_state=seed_num)

print(x_sfs.shape)
print(x_not_sfs.shape)
print(y_sfs.shape)
print(y_not_sfs.shape)

# will be using x_sfs and y_sfs for SFS training and for training the model itself
# the x_not_sfs and y_not_sfs is not used for the SFS training, it will be used for testing the model

(738, 41)
(317, 41)
(738,)
(317,)


In [8]:
lasso = LassoCV().fit(x_sfs, y_sfs)
importance = np.abs(lasso.coef_)
feature_names = np.array(x)

In [9]:
# Perceptron initialization
model_pcpt = SGDClassifier()
# INITIALISE THE FEATURE NUMBERS
fs = 5
direction = "forward"

In [10]:
# feature selection is done here.
# the direction must be set manually by the user
print("Feature selection mode :", direction)
print("Number of features :", fs)
sfs = SequentialFeatureSelector(lasso, direction=direction, n_features_to_select=fs, cv=5)
x_after_sfs = sfs.fit_transform(x_sfs, y_sfs)  # x_after_sfs holds the selected features
selected_feature = x.columns[sfs.get_support(indices=True)]
print(selected_feature)

Feature selection mode : forward
Number of features : 5
Index(['nO', 'F03[C-O]', 'SpMax_A', 'nN', 'SM6_B(m)'], dtype='object')


In [11]:
grid_params = {
    "loss": ["hinge", "log", "squared_hinge", "modified_huber"],
    "alpha": [0.0001, 0.001, 0.01, 0.1],
    "penalty": ["l2", "l1", "none"],
}
gsc = GridSearchCV(model_pcpt, grid_params, cv=3, n_jobs=-1)
result = gsc.fit(x_after_sfs, y_sfs)

In [12]:
print("feature Selection: ", fs)
print("The best parameters : " + str(result.best_params_))
# print("The best index : " + str(gsc.best_index_))
# print(gsc.best_estimator_)
print("Training score : " + str(result.best_score_) + "\n")

feature Selection:  5
The best parameters : {'alpha': 0.001, 'loss': 'modified_huber', 'penalty': 'none'}
Training score : 0.8021680216802167



In [13]:
# based on sfs
# updating the test set, 'x_test_' according to 'sfs' selection
# assign 'x_not_sfs' to 'x_test_' --> to be used for testing later
x_test_ = x_not_sfs
x_test_ = sfs.transform(x_test_)
y_train = y_sfs
y_test = y_not_sfs

In [15]:
##Perceptron model
alpha = input("Enter the alpha : ")
loss = input("Enter the loss : ")
penalty = input("Enter the penalty : ")
alpha = float(alpha)

Enter the alpha : 0.001
Enter the loss : modified_huber
Enter the penalty : none


In [16]:
m_prcpt = SGDClassifier(alpha=alpha, loss=loss, penalty=penalty)
m_prcpt.fit(x_after_sfs, y_train)
y_train_pred = m_prcpt.predict(x_after_sfs)
print("Training set ")
print("Mean absolute error : " + str(mean_absolute_error(y_train, y_train_pred)))
print("Mean squared error : " + str(mean_squared_error(y_train, y_train_pred)))
print("r2 score : " + str(r2_score(y_train, y_train_pred)))
print("The max error value : " + str(max_error(y_train, y_train_pred)) + "\n")
print("accuracy score: ", accuracy_score(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

y_pred = m_prcpt.predict(x_test_)  # testing set ,"x_test_" is used for prediction
print("Testing set ")
print("Mean absolute error : " + str(mean_absolute_error(y_test, y_pred)))
print("Mean squared error : " + str(mean_squared_error(y_test, y_pred)))
print("r2 score : " + str(r2_score(y_test, y_pred)))
print("The max error value : " + str(max_error(y_test, y_pred)) + "\n")
print("accuracy score: ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training set 
Mean absolute error : 0.23441734417344173
Mean squared error : 0.23441734417344173
r2 score : -0.048562347549707674
The max error value : 1

accuracy score:  0.7655826558265583
[[386 103]
 [ 70 179]]
              precision    recall  f1-score   support

           0       0.85      0.79      0.82       489
           1       0.63      0.72      0.67       249

    accuracy                           0.77       738
   macro avg       0.74      0.75      0.75       738
weighted avg       0.78      0.77      0.77       738

Testing set 
Mean absolute error : 0.2334384858044164
Mean squared error : 0.2334384858044164
r2 score : -0.04396973742768129
The max error value : 1

accuracy score:  0.7665615141955836
[[163  47]
 [ 27  80]]
              precision    recall  f1-score   support

           0       0.86      0.78      0.81       210
           1       0.63      0.75      0.68       107

    accuracy                           0.77       317
   macro avg       0.74      0.