## Import all necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, precision_recall_curve, average_precision_score, recall_score
from sklearn.decomposition import PCA
%matplotlib inline

## Load dataset from csv file

In [None]:
# Read the labelled training data from disk and preview its first rows.
TRAINING_CSV = 'training_set.csv'
dataset = pd.read_csv(TRAINING_CSV)
dataset.head()

In [None]:
# The leading column of the dataset is unnamed and should not be used,
# so keep every column from position 1 onward.
dataset = dataset.iloc[:, slice(1, None)]
dataset.head()

## Perform EDA

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
dataset.describe()

In [None]:
# Print column dtypes and non-null counts to spot missing values.
dataset.info()

In [None]:
# Draw one histogram per column to inspect the value distributions.
dataset.hist()

In [None]:
# Pairwise correlation between all columns, rendered as an annotated heatmap.
# The canvas is very large (200 x 200 inches) -- presumably so that the
# per-cell annotations stay legible; rendering can be slow.
corr = dataset.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(200, 200))
sns.heatmap(corr, annot=True, ax=heatmap_ax)

## Separate features and classes

In [None]:
# The last column holds the class label; every column before it is a feature.
label_position = dataset.shape[1] - 1
features = dataset.iloc[:, :label_position]
classes = dataset.iloc[:, label_position]
features.shape

## Feature selection

In [None]:
# Fit an L1-penalised logistic regression on all features; the L1 penalty can
# drive coefficients to exactly zero, and those features are marked unwanted.
# NOTE(review): the selector is fit on unscaled features and on the full
# dataset (before any train/validation split) -- confirm this is intended.
l1_model = LogisticRegression(C=0.5, penalty='l1', solver='liblinear')
sel_ = SelectFromModel(l1_model)
sel_.fit(features.values, classes.values)

# Boolean mask over the feature columns: True where the coefficient is zero.
zero_coef_mask = (sel_.estimator_.coef_ == 0).ravel()
unwanted_features = features.columns[zero_coef_mask]
unwanted_features

## Remove unwanted features from dataset

In [None]:
# Build a new frame without the zero-coefficient columns; `features` itself
# is left untouched.
refined_features = features.drop(columns=unwanted_features)
refined_features.shape

## Standard scaler

In [None]:
# Shared scaler instance: it is re-fit on the training part of every split in
# the validation loop and later reused to transform the test-set features.
sc = StandardScaler()

## Repeated hold-out validation of model (20 random 80/20 train/validation splits)

In [None]:
# Repeated hold-out evaluation: 20 independent random 80/20 train/validation
# splits, one per random_state seed. (This is not k-fold cross validation:
# the validation sets of different seeds may overlap.)
#
# After the loop, `sc`, `classifier`, `X_train`, `X_validate`, `y_train` and
# `y_validate` hold the state of the LAST split (seed 19); the metric cells
# and the test-set prediction further down rely on that.
train_score = []
validation_score = []
k_vals = list(range(20))  # seeds, also used as x-axis values of the score plot
for seed in k_vals:
    X_train, X_validate, y_train, y_validate = train_test_split(
        refined_features, classes.values, test_size=0.20, random_state=seed
    )

    # Fit the scaler on the training part only, then apply the same
    # transformation to the validation part to avoid leaking its statistics.
    X_train = sc.fit_transform(X_train)
    X_validate = sc.transform(X_validate)

    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)

    # `score` returns mean accuracy on the given data.
    train_score.append(classifier.score(X_train, y_train))
    validation_score.append(classifier.score(X_validate, y_validate))

In [None]:

# Training vs. validation accuracy for every split evaluated above.
# The x-axis values in `k_vals` are the random_state seeds of the splits.
score_fig, score_ax = plt.subplots(figsize=(10, 5))
score_ax.set_xlabel('Different Values of K')
score_ax.set_ylabel('Model score')
score_ax.plot(k_vals, train_score, color='r', label="training score")
score_ax.plot(k_vals, validation_score, color='b', label='validation score')
# Anchor the legend to the figure's top-right corner (figure coordinates).
score_ax.legend(bbox_to_anchor=(1, 1), bbox_transform=score_fig.transFigure)

## Classification performance metrics
1. Accuracy

In [None]:
# Hard class predictions of the most recently trained classifier on the
# validation part of the most recent split; reused by the metric cells below.
y_pred = classifier.predict(X_validate)
validation_accuracy = accuracy_score(y_true=y_validate, y_pred=y_pred)
validation_accuracy

2. Confusion matrix

In [None]:
# Rows are true classes, columns are predicted classes.
# Store the result under its own name: assigning it to `confusion_matrix`
# would overwrite the imported sklearn function, so re-running this cell
# (or calling the function anywhere later) would raise a TypeError.
conf_mat = confusion_matrix(y_validate, y_pred)
conf_mat


3. Precision Recall curve

In [None]:
# Average precision and the precision-recall curve are threshold-sweeping
# metrics, so they need continuous scores rather than hard 0/1 predictions
# (hard labels collapse the curve to a single operating point).
# Column 1 of predict_proba is the probability of the second entry of
# classifier.classes_, i.e. the positive class in a binary problem.
pr_scores = classifier.predict_proba(X_validate)[:, 1]

print(f"Average precision score is = {average_precision_score(y_validate, pr_scores)}")
# Recall is a fixed-threshold metric, so it is computed from the hard
# predictions in `y_pred`.
print(f"Average Recall score is = {recall_score(y_validate, y_pred)}")

precision, recall, thresholds = precision_recall_curve(y_validate, pr_scores)
plt.plot(recall, precision, label='Logistic Regression')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()  # without this the curve's label is never displayed

4. Receiver Operating Characteristic curve (ROC)

In [None]:
# The ROC curve sweeps the decision threshold, so it needs continuous scores;
# feeding hard 0/1 predictions yields only one interior point.
# Column 1 of predict_proba is the probability of the second entry of
# classifier.classes_, i.e. the positive class in a binary problem.
roc_scores = classifier.predict_proba(X_validate)[:, 1]

fpr, tpr, threshold = roc_curve(y_validate, roc_scores)
plt.plot([0, 1], [0, 1], 'k--')  # diagonal = chance-level reference
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()  # without this the curve's label is never displayed

## Prediction on test dataset

In [None]:
# Read the unlabelled test data and skip its first column, as was done for
# the training set.
TEST_CSV = 'test_set.csv'
test_dataset = pd.read_csv(TEST_CSV)
test_features = test_dataset.iloc[:, slice(1, None)]
test_features.shape

In [None]:
# Apply the training-time column selection to the test features.
refined_test_features = test_features.drop(columns=unwanted_features)

# Scale with the already-fitted scaler and predict with the already-trained
# classifier (both carry the state of the last validation split).
test_features_sc = sc.transform(refined_test_features.values)
y_predicted = classifier.predict(test_features_sc)

# Attach the predictions to the raw test rows, save them, and show a preview.
test_dataset["Y"] = y_predicted
test_dataset.to_csv("output.csv")
print(test_dataset.head())