In [80]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path found under the Kaggle input directory, then print
# them one per line.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Import necessary libraries**

In [295]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split

**Load the datasets**

In [296]:
# Read the train and test CSV files (in that order) into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/train-dataset/train.csv',
        '../input/test-dataset/test.csv',
    )
)

In [297]:
# Preview the first rows of the train data.
train.head()

In [298]:
# Preview the first rows of the test data.
test.head()

In [299]:
# Print the column summary pandas reports for the train data.
train.info()

In [300]:
# (rows, columns) of the train data.
train.shape

In [302]:
# Descriptive statistics of the train data.
train.describe()

**Cleaning and preprocessing**

In [303]:
# Does the train data contain at least one missing value anywhere?
train_null_mask = train.isna().to_numpy()
train_null_mask.any()

In [304]:
# Does the test data contain at least one missing value anywhere?
test_null_mask = test.isna().to_numpy()
test_null_mask.any()

In [305]:
# Total number of missing cells across the whole train data.
train_null_cells = train.isna().to_numpy()
train_null_cells.sum()

In [306]:
# Missing-value count per train column, restricted to columns that have any.
train_has_null = train.isna().any()
null_columns = train.columns[train_has_null]
train.loc[:, null_columns].isna().sum()

In [307]:
# Missing-value count per test column, restricted to columns that have any.
test_has_null = test.isna().any()
null_columns = test.columns[test_has_null]
test.loc[:, null_columns].isna().sum()

In [308]:
# Show the first train rows that contain at least one missing value, limited
# to the train columns that actually have nulls. The column list is recomputed
# from `train` here because `null_columns` was last assigned from the test
# frame, so reusing it would index train with test's null columns.
train_null_columns = train.columns[train.isnull().any()]
print(train[train.isnull().any(axis=1)][train_null_columns].head())

In [309]:
# Remove the train columns that contain null values (mutates `train` in place).
columns_with_nulls = ['s53', 's54', 's55', 's56', 's57', 's59', 's69']
train.drop(columns=columns_with_nulls, inplace=True)

In [310]:
# Remove the same null-containing columns from test (mutates `test` in place).
columns_with_nulls = ['s53', 's54', 's55', 's56', 's57', 's59', 's69']
test.drop(columns=columns_with_nulls, inplace=True)

**Handling Unwanted Values**

In [311]:
# Train: replace the letters 'l' and 'o' in s52 with the strings '1' and '0'.
s52_replacements = {'l': '1', 'o': '0'}
train['s52'] = train['s52'].replace(s52_replacements)

In [312]:
# Test: replace the letters 'l' and 'o' in s52 with the strings '1' and '0'.
s52_replacements = {'l': '1', 'o': '0'}
test['s52'] = test['s52'].replace(s52_replacements)

**Convert the s52 column from object to int**

In [313]:
# Train: cast the cleaned s52 column to integers.
train_s52_as_int = train['s52'].astype(int)
train['s52'] = train_s52_as_int

In [314]:
# Test: cast the cleaned s52 column to integers.
test_s52_as_int = test['s52'].astype(int)
test['s52'] = test_s52_as_int

#### Separate categorical and numerical variables

In [315]:
# Sort the train columns into two lists by their number of distinct values
# (missing values count as a value): at most 28 -> categorical, otherwise
# numerical.
cat_val, num_val = [], []

for column in train.columns:
    distinct_count = train[column].nunique(dropna=False)
    target_list = cat_val if distinct_count <= 28 else num_val
    target_list.append(column)

In [316]:
# Train columns classified as categorical (at most 28 distinct values).
cat_val

In [317]:
# Train columns classified as numerical (more than 28 distinct values).
num_val

In [349]:
# Same split for the test columns: at most 28 distinct values (missing values
# count as a value) -> categorical, otherwise numerical.
test_cat_val, test_num_val = [], []

for column in test.columns:
    distinct_count = test[column].nunique(dropna=False)
    target_list = test_cat_val if distinct_count <= 28 else test_num_val
    target_list.append(column)

#### Mapping the gender, s11, s12, and s58 columns. The gender column is mapped M → 1 and F → 0, the s11 and s12 columns are mapped Y → 1 and N → 0, and the s58 column is mapped B → 1 and A → 0.

In [318]:
# Train: turn the two-valued string columns into 0/1 codes. Values that are
# not listed in a column's mapping become NaN (pandas `Series.map` semantics).
binary_codes = {
    'gender': {'M': 1, 'F': 0},
    's11': {'Y': 1, 'N': 0},
    's12': {'Y': 1, 'N': 0},
    's58': {'B': 1, 'A': 0},
}
for binary_col, codes in binary_codes.items():
    train[binary_col] = train[binary_col].map(codes)

In [319]:
# Test: turn the two-valued string columns into 0/1 codes. Values that are
# not listed in a column's mapping become NaN (pandas `Series.map` semantics).
binary_codes = {
    'gender': {'M': 1, 'F': 0},
    's11': {'Y': 1, 'N': 0},
    's12': {'Y': 1, 'N': 0},
    's58': {'B': 1, 'A': 0},
}
for binary_col, codes in binary_codes.items():
    test[binary_col] = test[binary_col].map(codes)

#### Label Encoding

In [320]:
#for train dataset
from sklearn.preprocessing import LabelEncoder
cols=['s16','s17','s18','s70','s71']
le=LabelEncoder()
for col in cols:
    train[col]=le.fit_transform(train[col])

In [336]:
#for test dataset
cols=['s16','s17','s18','s70','s71']
le=LabelEncoder()
for col in cols:
    test[col]=le.fit_transform(test[col])

#### Check the correlation

In [322]:
# Pairwise correlation between the columns of the train data.
train.corr()

#### Heatmap

In [324]:
# Annotated heatmap of the train correlation matrix on a 16x8 figure.
correlation = train.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 8))
sns.heatmap(
    correlation,
    annot=True,
    linewidths=0,
    vmin=-1,
    cmap="RdBu_r",
    ax=heatmap_ax,
)
plt.show()

#### Box Plot of Full Dataset

In [325]:
# Horizontal box plots of the train columns on a 16x6 figure.
boxplot_fig, boxplot_ax = plt.subplots(figsize=(16, 6))
ax = sns.boxplot(data=train, orient="h", palette="Set2", ax=boxplot_ax)

In [326]:
# Box plot of the single column n14.
n14_values = train["n14"]
ax = sns.boxplot(x=n14_values)

#### Histogram

In [175]:
# Plot histograms of the columns in the train data.
train.hist()

#### pairplot

In [176]:
# Pairwise relationship grid over the train data.
sns.pairplot(train)

### Model Fit Using Logistic Regression and Support Vector Machine Classifier
#### Set feature and target Variables

In [352]:
# Explicit list of the model's input columns, followed by the target column.
feature_columns = [
    'gender', 's11', 's12', 's13', 's16', 's17', 's18', 's48', 's52', 's58',
    's70', 's71',
    'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10',
    'n11', 'n12', 'n13', 'n14', 'n15',
]
X = train[feature_columns]
# Alternative kept from the original cell:
#X=train.drop(columns=['id','label'],axis=1)
y = train['label']

In [353]:
# Hold out 25% of the labelled rows for evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=5,
)

### Create function for printing prediction scores for both train and test dataset

In [354]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for one split.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : features and labels of the training split.
    X_test, y_test : features and labels of the held-out split.
    train : bool, default True
        When truthy the training split is scored, otherwise the test split.
        (Previously a falsy value other than ``False``/``0``, e.g. ``None``,
        matched neither branch and the function silently printed nothing.)

    Returns
    -------
    None
        The report is written to stdout.
    """
    # Pick the split once so the reporting code is not duplicated per branch.
    if train:
        split_name, features, labels = "Train", X_train, y_train
    else:
        split_name, features, labels = "Test", X_test, y_test

    pred = clf.predict(features)
    clf_report = pd.DataFrame(classification_report(labels, pred, output_dict=True))
    print(f"{split_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(labels, pred) * 100:.2f}%")
    print("_____________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_____________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(labels, pred)}\n")

### Model fit and Predict with Logistic Regression

In [355]:
from sklearn.linear_model import LogisticRegression

# Fit a liblinear logistic regression on the training split, report its scores
# on the train split and then the test split, and keep the predictions for
# both splits.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

for report_on_train in (True, False):
    print_score(lr_clf, X_train, y_train, X_test, y_test, train=report_on_train)

y_train_pred, y_test_pred = lr_clf.predict(X_train), lr_clf.predict(X_test)

### Model fit and predict with SVM

In [367]:
from sklearn.svm import SVC


# Fit an RBF-kernel SVM with probability estimates enabled, report its scores
# on the train split and then the test split, and keep the predictions for
# both splits.
svm_params = {'kernel': 'rbf', 'gamma': 0.1, 'C': 1.0, 'probability': True}
svm_clf = SVC(**svm_params)
svm_clf.fit(X_train, y_train)

for report_on_train in (True, False):
    print_score(svm_clf, X_train, y_train, X_test, y_test, train=report_on_train)

y_train_pred, y_test_pred = svm_clf.predict(X_train), svm_clf.predict(X_test)

## AUC ROC Curve and Calculate AUC

In [373]:
# `metrics` was used below without ever being imported, so this cell raised a
# NameError on a fresh kernel; import it here next to its only use.
from sklearn import metrics

# Probability of the second class listed in `svm_clf.classes_` for every
# held-out row, then the ROC curve points and the area under that curve.
y_pred_proba = svm_clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

#create ROC curve
plt.plot(fpr, tpr, label="AUC="+str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

### Predicting the label column for test dataset

In [357]:
# Work on an independent (deep) copy so the original test frame is untouched.
df = test.copy(deep=True)

In [358]:
# The id column is an identifier, not a model input; remove it from the copy.
df = df.drop(columns=['id'])

In [360]:
# Predict on exactly the feature columns the classifier was fitted on, in the
# same order. Passing `df` as-is relied on the test frame happening to have
# the same columns in the same order as the training features.
test_svm = svm_clf.predict(df[X_train.columns])
test_svm

#### id and predicted label for the submission.

In [361]:
# Start the submission frame from the id column of the test data.
submission = test[['id']].copy()

In [362]:
# Attach the SVM predictions as the label column.
submission = submission.assign(label=test_svm)

In [363]:
# Spot-check ten randomly sampled rows of the submission frame.
submission.sample(10)

In [374]:
# Write only the id and label columns. Without index=False pandas also writes
# the row index as an unnamed first column, which does not belong in an
# id/label submission file.
submission.to_csv('submission_Anonymous Five_2u27pl.csv', index=False)