# **Predict Client Subscriptions to Bank Marketing Campaigns**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

### Import packages

In [None]:
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import copy
import random
import ppscore as pps
import matplotlib.pyplot as plt
from array import array
from scipy.stats import chisquare
from scipy.stats import chi2_contingency 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tabulate import tabulate
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.preprocessing import OrdinalEncoder
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score,confusion_matrix
from imblearn.under_sampling import InstanceHardnessThreshold
from itertools import compress
import pickle

In [None]:
import warnings
from sklearn.exceptions import DataConversionWarning
np.seterr(divide = 'ignore')
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings('ignore', category=PendingDeprecationWarning)  

In [None]:
sys.path.append('/content/drive/MyDrive/Colab Notebooks')
from SL11 import Trivial_Model,Baseline_Model,Logisitic_Regression,Random_Forest,Support_Vector_Classifier,Multi_Layer_Perceptron,Decision_Tree


In [None]:
from SSL import Prop_1NN,Expectation_Maximization,SSL_Log_Reg,Label_Propagation
from qns3vm import QN_S3VM

### Read data

In [None]:
data=pd.read_csv("/content/drive/MyDrive/EE_660_Project/bank-additional/bank-additional-full.csv", sep = ';')

In [None]:
df=data.copy()

In [None]:
df.columns

In [None]:
df.drop(columns=['duration'],inplace=True)

In [None]:
df.describe()

In [None]:
df.info()

# **Exploratory Data Analysis**

In [None]:
def generate_groupwise_probability(data,feature,label):
    '''
    Function to look at the group wise distribution for each feature with respect to the target label 
    and also look at the percentage wise data distribution to the whole data
    
    Parameters:
    data- dataset (pandas DataFrame) containing both the feature and the label column
    feature- feature for which we want groupwise distribution
    label- target label; its values are expected to be 'no' and 'yes'

    Return:
    d_final- dataset showing group wise distribution, with the columns
             Count_no, Prob_no, Count_yes, Prob_yes, Count_total, Prob_total
    '''
    # Absolute counts per group; margins=True adds the 'All' total column.
    # The extra 'All' row is discarded by the inner merge with the shares below.
    counts=pd.crosstab(data[feature], data[label], dropna=False,margins=True)
    # Share of the whole dataset falling into each group. This is computed from
    # the `data` argument (not from a notebook-level frame) and named up front,
    # so the function works for any label column name.
    group_share = (data.groupby(feature,sort=False)[label].count()/len(data)).round(2).rename('Prob_total')
    # Within-group probability of each label value (rows sum to 1).
    group_probs=pd.crosstab(data[feature], data[label], dropna=False, normalize='index').round(2)
    d=pd.merge(counts, group_share,left_index=True, right_index=True)
    d.rename(columns={'All': 'Count_total', 'no': 'Count_no', 'yes': 'Count_yes'}, inplace=True)
    d_final=pd.merge(d, group_probs, on=feature)
    d_final.rename(columns={'no': 'Prob_no', 'yes': 'Prob_yes'}, inplace=True)
    d_final=d_final[['Count_no', 'Prob_no', 'Count_yes', 'Prob_yes','Count_total', 'Prob_total']]
    return d_final

In [None]:

def chi_func(df,feat,label):
    '''
    Function to find chi-square test value of independence of variables in a contingency table
    
    Parameters:
    df- dataframe
    feat- feature
    label- target label 

    Return:
    chi2, p-value (both are also printed)
    '''
    # The observed table must contain only the cell counts. Row/column totals
    # (margins) would be treated as extra categories and distort the statistic,
    # the degrees of freedom and therefore the p-value.
    cross_tab_table=pd.crosstab(df[feat], df[label])
    chi2_stat, p, dof, expected =chi2_contingency(cross_tab_table)
    print(f"chi2 statistic:     {chi2_stat:.3g}")
    print(f"p-value:            {p:.3g}")
    return chi2_stat, p

### Age

In [None]:
grouped_df=df.groupby(['y'])
plt.figure(figsize=(12,8))
plt.suptitle('Count vs Age', fontsize=20)
#plot 1:
plt.subplot(2, 1, 1)
plt.hist(grouped_df.get_group('yes')['age'],linewidth=1.2,bins=np.arange(10,100,5),color='plum')
plt.axvline(30, color='k', linestyle='dashed', linewidth=1.5)
plt.axvline(60, color='k', linestyle='dashed', linewidth=1.5)

plt.title('y="yes"', fontsize=15,fontweight="bold")
plt.xlabel('Age',fontweight="bold")
plt.ylabel('Frequency',fontweight="bold")
plt.tight_layout(pad=5.0)
#plot 2:
plt.subplot(2, 1, 2)
plt.hist(grouped_df.get_group('no')['age'], linewidth=1.2,bins=np.arange(10,100,5),color='plum')
plt.axvline(30, color='k', linestyle='dashed', linewidth=1.5)
plt.axvline(60, color='k', linestyle='dashed', linewidth=1.5)

plt.title('y="no"', fontsize=15,fontweight="bold")
plt.xlabel('Age',fontweight="bold")
plt.ylabel('Frequency',fontweight="bold")
plt.show()

The plot above shows that there are 3 broad age categories that influence the way clients subscribe to a term deposit. The categories are as follows:

i) Below 30 years old (Young): Between the age of 20 and 30 clients begin to take subscriptions.


ii) 30 years old to 60 years old (Middle aged): More unwilling to take deposits than other categories. A lot of clients take subscriptions around the age of 30.

iii) Above 60 years old (Retired): Most of the retired clients end up taking subscriptions.




### *Jobs*

In [None]:
generate_groupwise_probability(df,'job','y')

A majority of clients with the job of admin end up subscribing to a term deposit. In the case of jobs like blue-collar, entrepreneur and services very few people end up subscribing. A lot of students are observed to end up subscribing.

### *Marital*

In [None]:
generate_groupwise_probability(df,'marital','y')

Single people are observed to subscribe to a term deposit more often than married or divorced.



### *Education*

In [None]:
df_education_yes=df[df['y']=='yes'].groupby('education')['y'].count()
df_education_no=df[df['y']=='no'].groupby('education')['y'].count()

In [None]:
# Align both count series on the same set of education levels. A level that is
# missing from one class (or has been dropped from df) would otherwise make the
# two series differ in length and misalign the bars and tick labels.
education_levels = df_education_yes.index.union(df_education_no.index)
education_yes_counts = df_education_yes.reindex(education_levels, fill_value=0)
education_no_counts = df_education_no.reindex(education_levels, fill_value=0)

plt.figure(figsize=(12,8))
r = np.arange(len(education_levels))
width = 0.25
plt.bar(r,education_yes_counts, width, color='green', align='center',label='y=yes')
plt.bar(r+width,education_no_counts, width, color='orange', align='center',label='y=no')
plt.xlabel("Education",fontweight="bold")
plt.ylabel("Count",fontweight="bold")
plt.title("Number of people who subscribed vs. who didnot as per Education", fontsize=15,fontweight="bold")
# Tick labels come from the data instead of a hand-written list.
plt.xticks(r + width/2,education_levels,rotation=60)
plt.legend()
plt.show()

The 'illiterate' group contains only a few data points, so we drop those rows.

In [None]:
# Drop every row whose 'education' value is 'illiterate' (in place on df)
index_education= df[ (df['education'] =='illiterate') ].index
df.drop(index_education , inplace=True)

### *Default*

In [None]:
generate_groupwise_probability(df,'default','y')

We observe that there are only 3 for default='yes' so we can drop this feature as the remaining group is just 'no' and there is not much information about the minority group 'yes'.

In [None]:
df.drop(columns=['default'],inplace=True)

### *Loan*

In [None]:
generate_groupwise_probability(df,'loan','y')

In [None]:
chi_func(df,'loan','y')

### *Housing*

In [None]:
generate_groupwise_probability(df,'housing','y')

In [None]:
chi_func(df,'housing','y')

In [None]:
df.drop(columns=['housing'],inplace=True)

### *Contact*

In [None]:
generate_groupwise_probability(df,'contact','y')

In [None]:
contact_counts=(df[df['y']=='yes'].contact.value_counts())

In [None]:
plt.figure(figsize=(8,6))
plt.bar(contact_counts.keys(),contact_counts.values,color='palevioletred')
plt.title('Number of contacts that subscribed', fontsize=15,fontweight="bold")
plt.xlabel('Contact',fontweight="bold")
plt.ylabel('Frequency',fontweight="bold")
plt.show()

From the plot we can see that people with contact as cellular subscribed more as compared to the people who had telephone as their contact source.

### *Month*

In [None]:
generate_groupwise_probability(df,'month','y')

In [None]:
# Share of subscribers ('y' == 'yes') contributed by each month.
plt.figure(figsize=(12,8))
month_wise_data=df[df['y']=='yes'].month.value_counts()
plt.pie(month_wise_data, labels = month_wise_data.index.tolist(),colors =sns.color_palette('pastel'), autopct='%.2f%%')
plt.title('Pie chart showing the percentage of subscribers for every month', fontsize=15,fontweight="bold")
plt.show()


The above pie chart depicts that in the month of May most clients had subscribed. This was closely followed by the month of July and August. It shows that contacting clients in these months would be profitable.

### *Day of week*


In [None]:
generate_groupwise_probability(df,'day_of_week','y')

In [None]:
chi_func(df,'day_of_week','y')

The p-value seems to be statistically insignificant, so we drop this feature.

In [None]:
df.drop(columns=['day_of_week'],inplace=True)

### *pdays* 

For pdays, a value of 999 means the client was not previously contacted, so we convert pdays into a binary flag: values below 999 become 1 (previously contacted) and 999 becomes 0 (never contacted).

In [None]:
# Binarise pdays: 1 = contacted before (pdays < 999), 0 = never contacted (999).
# The flag is recomputed from the untouched raw frame `data` so that re-running
# this cell is idempotent; applying the thresholds to an already binarised
# column would turn every row into 1.
df["pdays"] = (data.loc[df.index, "pdays"] < 999).astype(int)

In [None]:
generate_groupwise_probability(df,'pdays','y')

### *Previous*

In [None]:
df.loc[df["previous"] >= 2, "previous"] =2

In [None]:
generate_groupwise_probability(df,'previous','y')

Since previous is the number of contacts performed with this client before the current campaign, we cap it at 2 (every value of 2 or more becomes 2). Clients who were contacted two or more times before this campaign are more likely to subscribe to a term deposit.

### *Poutcome*

In [None]:
generate_groupwise_probability(df,'poutcome','y')

As poutcome indicates success or failure in the previous campaign, it tells us that success in the previous campaign means these clients are likely to subscribe again.



## euribor3m

In [None]:
plt.figure(figsize=(10,10))
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this working on pandas versions where DataFrame.corr() no longer drops
# string columns silently.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()

Since euribor3m is strongly correlated with emp.var.rate and nr.employed we will remove this feature from future analysis.



In [None]:
df.drop(columns=['euribor3m'],inplace=True)

In [None]:
# df[(df['marital']=='unknown')&(df['job']=='unknown')(df['education']=='unknown')&(df['loan']=='unknown')].dropna()

# Pairplots for numeric features


In [None]:
sns.pairplot(df.loc[:,['age', 'campaign', 'emp.var.rate','cons.price.idx', 'cons.conf.idx', 'nr.employed','y']],hue="y", palette='husl')#, diag_kind='hist'
plt.show()

# **Split for Train and Test**

In [None]:
def split_train_test_data(X,y,train_data_size):
    '''
    Stratified, shuffled train/test split with a fixed random seed (42).
    
    Parameters:
    X- features
    y- target label (also used as the stratification variable)
    train_data_size- train size, forwarded to train_test_split as train_size

    Return:
    X_train, X_test, y_train, y_test
    '''
    split_options = dict(train_size=train_data_size, stratify=y, random_state=42, shuffle=True)
    train_features, test_features, train_labels, test_labels = train_test_split(X, y, **split_options)
    return train_features, test_features, train_labels, test_labels

In [None]:
X=df.loc[:, df.columns != 'y']
y=df.loc[:,'y']
X_train, X_test, y_train, y_test=split_train_test_data(X,y,train_data_size=0.80)

In [None]:
X_train.head()

## Predictive Power Score for features

In [None]:
def generate_ppscore(df):
    '''
    Print a table with the predictive power score of every feature with respect to 'y'
    
    Parameters:
    df- dataframe that contains the features and the target column 'y'

    Return:
    None; the scores (rounded to 2 decimals) are printed as a grid table

    '''
    feature_names = df.columns.to_list()
    feature_names.remove('y')

    # Collect the raw (feature, score) pairs first, then round them for display.
    raw_scores = []
    for feature_name in feature_names:
        result = pps.score(df, feature_name, "y")
        raw_scores.append((result['x'], result['ppscore']))

    table_rows = [[scored_feature, round(score, 2)] for scored_feature, score in raw_scores]
    print(tabulate(table_rows, headers=["Feature","Predictive Power Score"], tablefmt='grid'))

In [None]:
generate_ppscore(df)

# **Data Imputation**

Data imputation replaces missing values in the dataset (here, the 'unknown' category of some categorical features) with estimated values. We use a machine learning algorithm, K-Nearest Neighbours (KNN), to perform the imputation.

In [None]:
numeric_features=['age', 'campaign', 'emp.var.rate','cons.price.idx', 'cons.conf.idx', 'nr.employed']
categoric_features=['job', 'marital','education','loan','contact',
                    'month', 'pdays', 'previous', 'poutcome']

### KNN

In [None]:
def knn_data_imputation_train(df,feat_list,target):
    '''
    Function to find the unknowns in the dataset

    Rows with a known target are used to choose k (1..19) on a 10% stratified
    validation split. A KNN classifier with the best k is then refit on all
    known rows and used to replace the 'unknown' entries of the target column.
    
    Parameters:
    df- dataframe; its target column is updated IN PLACE
    feat_list- list of features to use for finding the 'unknowns' in the target feature
               (the list itself is not modified)
    target- target feature

    Return:
    best_knn- trained knn model
    df- the same dataframe object after predicting the unknowns for the target variable

    '''
    feature_columns=list(feat_list)
    is_unknown=df[target] == 'unknown'
    X=df.loc[~is_unknown, feature_columns]
    # 1-D label array: a column-shaped (n, 1) array would broadcast against the
    # 1-D predictions below and produce a meaningless n x n comparison.
    y=df.loc[~is_unknown, target].to_numpy()
    X_unknown=df.loc[is_unknown, feature_columns]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, stratify=y,random_state=42, shuffle=True)
    candidate_ks=list(range(1,20))
    error_rate = []
    for k in candidate_ks:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train,y_train)
        pred_y =knn.predict(X_val)
        error_rate.append(np.mean(pred_y != y_val))
    min_err=min(error_rate)
    # Translate the position in error_rate back to the k it was measured for
    # (positions start at 0, k starts at 1).
    optimal_k=candidate_ks[error_rate.index(min_err)]
    print("Minimum error:",min_err.round(2),"at K =",optimal_k)
    best_knn = KNeighborsClassifier(n_neighbors=optimal_k)
    best_knn.fit(X,y)
    # Nothing to impute when the column has no 'unknown' entries; predicting on
    # an empty frame would raise.
    if len(X_unknown) > 0:
        df.loc[X_unknown.index,target] = best_knn.predict(X_unknown)
    return best_knn,df

In [None]:
def predict_data_impute(model,X_test,target,feat_list):
    '''
    Function to find the unknowns in the test set by using the trained model
    
    Parameters:
    model- trained model exposing predict()
    X_test- dataframe; its target column is updated IN PLACE
    target- target feature
    feat_list- list of features to use for finding the 'unknowns' in the target feature
               (the list itself is not modified)

    Return:
    X_test- the same dataframe object after predicting the unknowns for the target variable
    '''
    is_unknown=X_test[target] == 'unknown'
    # Nothing to impute: return early instead of calling predict() on an empty
    # frame, which estimators typically reject.
    if not is_unknown.any():
        return X_test
    testing_set=X_test.loc[is_unknown, list(feat_list)]
    y_hat = model.predict(testing_set)
    X_test.loc[testing_set.index,target] = y_hat
    return X_test

In [None]:
# predict unknown category for marital
impute_knn_model,X_train_final=knn_data_imputation_train(X_train,numeric_features,'marital')
X_test_final=predict_data_impute(impute_knn_model,X_test,'marital',numeric_features)

In [None]:
# predict unknown category for job
impute_knn_model,X_train_final=knn_data_imputation_train(X_train,numeric_features,'job')
X_test_final=predict_data_impute(impute_knn_model,X_test,'job',numeric_features)

In [None]:
# predict unknown category for education
impute_knn_model,X_train_final=knn_data_imputation_train(X_train,numeric_features,'education')
X_test_final=predict_data_impute(impute_knn_model,X_test,'education',numeric_features)

In [None]:
# predict unknown category for loan
impute_knn_model,X_train_final=knn_data_imputation_train(X_train,numeric_features,'loan')
X_test_final=predict_data_impute(impute_knn_model,X_test,'loan',numeric_features)

In [None]:
X_train_final.marital.unique()

In [None]:
X_test_final.head()

In [None]:
y_train.head()

# **Encoding categorical variables**

###### Month

In [None]:
def month_encoding(df):
    '''
    Replace the abbreviated month names in the 'month' column by their month number
    
    Parameters:
    df- dataframe with a 'month' column holding abbreviations such as 'may' (modified in place)

    Return:
    df- the same dataframe, with 'month' encoded as integers 1-12
    '''
    # '%b' expects a capitalised abbreviation ('May'), so normalise the case first.
    capitalised_months = df['month'].str.capitalize()
    df['month'] = capitalised_months
    parsed_months = pd.to_datetime(df['month'], format='%b')
    df['month'] = parsed_months.dt.month.astype(int)
    return df

In [None]:
X_train_final=month_encoding(X_train_final)
X_test_final=month_encoding(X_test_final)

###### Marital, Job, Education and Poutcome

In [None]:
def frequency_encoding(dataf,feature,test_df):
    '''
    Function to frequency encode a feature

    Every category is replaced by its relative frequency in the training
    dataframe. The same training frequencies are applied to the test dataframe.
    
    Parameters:
    dataf- train dataframe (not modified)
    feature- feature to frequency encode
    test_df- test dataframe (not modified)

    Return:
    updated_df- updated train dataframe after frequency encoding the feature column
    testing_df- updated test dataframe after frequency encoding the feature column;
                categories that never occur in the train dataframe are encoded as 0.0
    '''
    encoded_series = ((dataf.groupby(feature).size()) / len(dataf)).round(4)
    updated_df=dataf.copy()
    updated_df[feature] = updated_df[feature].map(encoded_series)
    testing_df=test_df.copy()
    # A category unseen during training has an observed training frequency of 0;
    # map() yields NaN for it, which is filled instead of raising a KeyError.
    testing_df[feature] = testing_df[feature].map(encoded_series).fillna(0.0)
    return updated_df,testing_df

In [None]:
X_train_final,X_test_final=frequency_encoding(X_train_final,'marital',X_test_final)
X_train_final,X_test_final=frequency_encoding(X_train_final,'job',X_test_final)
X_train_final,X_test_final=frequency_encoding(X_train_final,'education',X_test_final)
X_train_final,X_test_final=frequency_encoding(X_train_final,'poutcome',X_test_final)

In [None]:
X_train_final.head()

In [None]:
def mapping(dataf,feature,map_dict):
    '''
    Replace the values of one column according to a lookup dictionary
    
    Parameters:
    dataf- dataframe (modified in place)
    feature- name of the column whose values are replaced
    map_dict- dictionary {old value: new value}, passed to Series.map

    Return:
    dataf- the same dataframe after mapping the column
    '''
    mapped_column = dataf[feature].map(map_dict)
    dataf[feature] = mapped_column
    return dataf

###### Mapping categorical features like loan and contact

In [None]:
X_train_final=mapping(X_train_final,'loan',{'yes': 1, 'no': 0})
X_test_final=mapping(X_test_final,'loan',{'yes': 1, 'no': 0})

In [None]:
X_train_final=mapping(X_train_final,'contact',{'telephone': 1, 'cellular': 0})
X_test_final=mapping(X_test_final,'contact',{'telephone': 1, 'cellular': 0})

In [None]:
y_train=y_train.map({'yes': 1, 'no': 0})
y_test=y_test.map({'yes': 1, 'no': 0})

In [None]:
X_train_final.reset_index(drop=True, inplace=True)
X_test_final.reset_index(drop=True, inplace=True)

In [None]:
X_train_final.head()

# **Normalization**

In [None]:
def normalization(X_train_num,X_test_num):
    '''
    Min-max scale the numerical features; the scaler is fitted on the train set only
    
    Parameters:
    X_train_num- train dataframe holding the numerical features
    X_test_num- test dataframe holding the same numerical features

    Return:
    X_train_num_norm- train dataframe after scaling (values rounded to 4 decimals)
    X_test_num_norm- test dataframe scaled with the train-fitted scaler (rounded to 4 decimals)
    '''
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(X_train_num).round(4)
    test_scaled = scaler.transform(X_test_num).round(4)

    # Re-attach the original row index and column names to the scaled arrays.
    X_train_num_norm = pd.DataFrame(train_scaled, index=X_train_num.index, columns=X_train_num.columns)
    X_test_num_norm = pd.DataFrame(test_scaled, index=X_test_num.index, columns=X_test_num.columns)
    return X_train_num_norm,X_test_num_norm

In [None]:
X_train_num_norm,X_test_num_norm=normalization(X_train_final[numeric_features],X_test_final[numeric_features])

In [None]:
X_train_final=X_train_num_norm.join(X_train_final[categoric_features])
X_test_final=X_test_num_norm.join(X_test_final[categoric_features])

In [None]:
X_train_final.head()

## **New Features**

In [None]:
# def generate_avg_features(train,test,feature,avg_over_feature,new_feature):
#     avg_feature=train.groupby(feature,as_index=False)[avg_over_feature].mean().round(2)
#     mapping_dict=dict(avg_feature.values)
#     new_train_column= train[feature].map(mapping_dict)
#     train.insert(loc =train.columns.get_loc(avg_over_feature),column = new_feature,value = new_train_column)
#     train.drop(columns=[avg_over_feature],inplace=True)
#     new_test_column=test[feature].map(mapping_dict)
#     test.insert(1,column = new_feature,value = new_test_column)
#     # test.drop(columns=[avg_over_feature],inplace=True)
#     return train,test#loc =test.columns.get_loc(avg_over_feature)

### Avg age per job

In [None]:
# X_train_final,X_test_final=generate_avg_features(X_train_final,X_test_final,'job','age','avg_age_per_job')

In [None]:
# X_train_final.head()

### Avg campaign per month

In [None]:
# X_train_final,X_test_final=generate_avg_features(X_train_final,X_test_final,'month','campaign','avg_campaign_per_month')

In [None]:
# X_train_final.head()

In [None]:
# type(y_train)

# **Undersampling**

In [None]:
renn = RepeatedEditedNearestNeighbours()
X_train_final, y_train=renn.fit_resample(X_train_final, y_train)
counter=Counter(y_train)
print('Distribution of classes for training set')
print(counter)

In [None]:
col_names=numeric_features+categoric_features
X_train_final=X_train_final[col_names]
X_test_final=X_test_final[col_names]

# **Feature Reduction**

## PCA

In [None]:
def pca_analysis(X,y,n_components=15):
    '''
    Function to perform feature analysis using PCA
    
    Parameters:
    X- feature matrix (dataframe or 2-D array)
    y- target label; forwarded to PCA.fit
    n_components- maximum number of principal components to compute (default 15);
                  it is capped by the number of rows and columns of X

    Return:
    None; plots the cumulative explained variance ratio against the number of components
    '''
    # PCA cannot extract more components than min(n_samples, n_features).
    n_components = min(n_components, *np.shape(X))
    pca = PCA(n_components=n_components)
    pca.fit(X,y)
    component_numbers = np.arange(pca.n_components_) + 1
    plt.figure(figsize=(8,8))
    plt.plot(component_numbers, pca.explained_variance_ratio_.cumsum(), 'o-', linewidth=2, color='teal')
    plt.xlabel('Features (principal components)')
    plt.title(' PCA Plot', fontsize=15,fontweight="bold")
    plt.ylabel('Total Variance')
In [None]:
pca_analysis(X_train_final,y_train)

## SelectKBest

In [None]:
select_k_best=SelectKBest(chi2, k=13)
X_train_final = select_k_best.fit_transform(X_train_final, y_train)
X_test_final=select_k_best.transform(X_test_final)

In [None]:
# Boolean mask over col_names marking the columns kept by SelectKBest.
# (Named support_mask so the builtin filter() is not shadowed for later cells.)
support_mask = select_k_best.get_support().tolist()
print("All features:")
print(col_names)
selected_features=list(compress(col_names, support_mask))
print("Selected 13 best features:")
print(selected_features)

# Generate Tables for dataset performance on Train and Test 

In [None]:
def generate_train_table(row_data):
    '''
    Print a grid table of training results, including the optimal hyperparameters

    Parameters:
    row_data- iterable of rows; each row is printed under the headers
              Algorithm, Optimal Hyperparameters, Specificity, Sensitivity, Accuracy
    '''
    column_titles = ["Algorithm","Optimal Hyperparameters","Specificity","Sensitivity","Accuracy"]
    table_rows = [row for row in row_data]
    print(tabulate(table_rows, headers=column_titles, tablefmt='grid'))

In [None]:
def generate_test_table(row_data):
    '''
    Print a grid table of results without the hyperparameter column

    Parameters:
    row_data- iterable of rows; each row is printed under the headers
              Algorithm, Specificity, Sensitivity, Accuracy
    '''
    column_titles = ["Algorithm","Specificity","Sensitivity","Accuracy"]
    table_rows = [row for row in row_data]
    print(tabulate(table_rows, headers=column_titles, tablefmt='grid'))

# **SUPERVISED LEARNING ALGORITHMS**

### *Trivial Model*

In [None]:
trivial_object = Trivial_Model(X_train_final, y_train)

In [None]:
res_train_trivial,probability_c1=trivial_object.train_trivial('Trivial Model')

In [None]:
res_test_trivial=trivial_object.test_trivial('Trivial Model',probability_c1,X_test_final,y_test)

### *Baseline Model*

In [None]:
baseline_object = Baseline_Model(X_train_final, y_train)

In [None]:
best_baseline_model,res_train_baseline=baseline_object.train_baseline('Baseline Model')

In [None]:
res_test_baseline=baseline_object.test_baseline('Baseline Model',best_baseline_model,X_test_final,y_test)

In [None]:
# The context manager guarantees the file is flushed and closed after the dump.
with open('/content/drive/MyDrive/EE_660_Project/SL models/best_baseline_model.pkl', 'wb') as model_file:
    pickle.dump(best_baseline_model, model_file)

### *Logistic Regression*

In [None]:
log_reg_clf = LogisticRegression()
log_reg_param_grid = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'C': [ 0.01,0.1, 1.0,10],
}
log_reg_object = Logisitic_Regression(log_reg_clf,log_reg_param_grid,X_train_final,y_train)

In [None]:
best_log_reg_model,res_train_log_reg=log_reg_object.train_log_reg('Logistic Regression',0.2)

In [None]:
res_test_log_reg=log_reg_object.test_log_reg('Logistic Regression',best_log_reg_model,X_test_final,y_test,0.2)

In [None]:
# The context manager guarantees the file is flushed and closed after the dump.
with open('/content/drive/MyDrive/EE_660_Project/SL models/best_log_reg_model.pkl', 'wb') as model_file:
    pickle.dump(best_log_reg_model, model_file)

### *Random Forest*

In [None]:
rf_clf = RandomForestClassifier()
rf_param_grid = {
        'max_depth': [50,60,70],
        'min_samples_split': [20,30,40]
}
rf_object = Random_Forest(rf_clf,rf_param_grid,X_train_final,y_train)

In [None]:
best_rf_model,res_train_rf=rf_object.train_rf('Random Forest Classifier',0.2)

In [None]:
# TEST
res_test_rf=rf_object.test_rf('Random Forest Classifier',best_rf_model,X_test_final,y_test,0.2)

In [None]:
# The context manager guarantees the file is flushed and closed after the dump.
with open('/content/drive/MyDrive/EE_660_Project/SL models/best_rf_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf_model, model_file)

### *Support Vector Classifier*

In [None]:
svc_clf = SVC(probability=True)
svc_param_grid = {'C':[0.1,1]  }
svc_object = Support_Vector_Classifier(svc_clf,svc_param_grid,X_train_final,y_train)                  

In [None]:
best_svc_model,res_train_svc=svc_object.train_svc('Support Vector Classifier',0.2)

In [None]:
res_test_svc=svc_object.test_svc('Support Vector Classifier',best_svc_model,X_test_final,y_test,0.2)

In [None]:
# The context manager guarantees the file is flushed and closed after the dump.
with open('/content/drive/MyDrive/EE_660_Project/SL models/best_svc_model.pkl', 'wb') as model_file:
    pickle.dump(best_svc_model, model_file)

### *Multi Layer Perceptron*

In [None]:
mlp_clf = MLPClassifier(max_iter=100)
mlp_param_grid = {
     'activation': ['tanh', 'relu'],
      'alpha': [0.0001, 0.001],
}

mlp_object = Multi_Layer_Perceptron(mlp_clf,mlp_param_grid,X_train_final,y_train)  

In [None]:
best_mlp_model,res_train_mlp=mlp_object.train_mlp('Multi Layer Perceptron',0.2)

In [None]:
res_test_mlp=mlp_object.test_mlp('Multi Layer Perceptron',best_mlp_model,X_test_final,y_test,0.2)

In [None]:
# The context manager guarantees the file is flushed and closed after the dump.
with open('/content/drive/MyDrive/EE_660_Project/SL models/best_mlp_model.pkl', 'wb') as model_file:
    pickle.dump(best_mlp_model, model_file)

### *Decision Tree Classifier*

In [None]:
dec_tree_clf = DecisionTreeClassifier()

dec_tree_param_grid = [{'max_depth': [50,60,70],
                        'min_samples_leaf': [3, 4, 5]
            }]
dec_tree_object = Decision_Tree(dec_tree_clf,dec_tree_param_grid,X_train_final,y_train)  

In [None]:
best_dec_tree_model,res_train_dec_tree=dec_tree_object.train_dec_tree('Decision Tree Classifier',0.2)

In [None]:
res_test_dec_tree=dec_tree_object.test_dec_tree('Decision Tree Classifier',best_dec_tree_model,X_test_final,y_test,0.2)

In [None]:
# The context manager guarantees the file is flushed and closed after the dump.
with open('/content/drive/MyDrive/EE_660_Project/SL models/best_dec_tree_model.pkl', 'wb') as model_file:
    pickle.dump(best_dec_tree_model, model_file)

## **Supervised Results**

In [None]:
print('Performance of SL Algorithms on Training Data:')

sl_train_data=[res_train_trivial,res_train_baseline,res_train_log_reg,res_train_rf,res_train_svc,res_train_mlp,res_train_dec_tree]
generate_train_table(sl_train_data)

In [None]:
print('Performance of SL Algorithms on Testing Data:')
# Same algorithms, in the same order, as the training table; this includes the
# Support Vector Classifier result computed in its own section.
sl_test_data=[res_test_trivial,res_test_baseline,res_test_log_reg,res_test_rf,res_test_svc,res_test_mlp,res_test_dec_tree]
generate_test_table(sl_test_data)

# **SEMI-SUPERVISED LEARNING ALGORITHMS**

In [None]:
def generate_ssl_dataset(X_train_final,y_train,train_size):
    '''
    Split the training data into a labeled part and an unlabeled part
    
    Parameters:
    X_train_final- features
    y_train- labels
    train_size- size of the labeled set, forwarded to split_train_test_data

    Return:
    labeled X, unlabeled X, labeled y, unlabeled y (in that order)
    '''
    labeled_X, unlabeled_X, labeled_y, unlabeled_y = split_train_test_data(X_train_final, y_train, train_size)
    return labeled_X, unlabeled_X, labeled_y, unlabeled_y

In [None]:
def test_s3vm(name,best_model,X,y):
    '''
    Function to evaluate a trained S3VM model on a dataset
    
    Parameters:
    name- name of the algorithm, copied into the result row
    best_model- trained model exposing getPredictions()
    X- features as a dataframe, a 2-D numpy array or a list of rows
    y- true labels encoded as 0/1

    Return:
    List [name, Specificity, Sensitivity, Accuracy]
    '''
    # np.asarray accepts dataframes, arrays and nested lists alike, so the
    # caller does not have to provide a dataframe.
    samples = np.asarray(X).tolist()
    y_pred = best_model.getPredictions(samples)
    # Predictions equal to -1 are mapped to class 0 to match the 0/1 encoding of y.
    y_pred_update = [0 if int(pred) == -1 else int(pred) for pred in y_pred]
    conf=confusion_matrix(y, y_pred_update, labels = [0, 1])
    tn, fp, fn, tp = conf[0][0],conf[0][1],conf[1][0],conf[1][1]
    specificity = tn / (tn+fp)
    sensitivity=tp/(tp+fn)
    acc=accuracy_score(y,y_pred_update)
    return [name,specificity,sensitivity,acc]

In [None]:
def S3VM_function(name,X_train_full, y_train_full, labeled_size=0.40, random_state=42):
    '''
    Function to train a semi-supervised SVM (QN_S3VM) and evaluate it on the training data

    The training data is first undersampled with InstanceHardnessThreshold and
    then split into a labeled and an unlabeled part.

    Parameters:
    name- name of the algorithm, copied into the result row
    X_train_full- training features (dataframe or 2-D array)
    y_train_full- training labels encoded as 0/1
    labeled_size- train size of the labeled part of the resampled data (default 0.40)
    random_state- seed of the random generator handed to QN_S3VM (default 42)

    Return:
    s3vm_model- trained QN_S3VM model
    result_train- list [name, Specificity, Sensitivity, Accuracy] measured on
                  X_train_full / y_train_full
    '''
    iht = InstanceHardnessThreshold(random_state=0,
                                estimator=LogisticRegression(solver='newton-cg'))
    
    X_train_resampled, y_train_resampled = iht.fit_resample(X_train_full, 
                                                            y_train_full)
    
    X_lab_resampled, X_unlab_resampled, y_lab_resampled, _ = split_train_test_data(X_train_resampled,
                                                                                   y_train_resampled,
                                                                                   train_data_size=labeled_size)
    # The labels handed to QN_S3VM use -1 instead of 0 for the negative class.
    y_lab_values = np.asarray(y_lab_resampled)
    y_lab_ssl = np.where(y_lab_values == 0, -1, y_lab_values)
    # np.asarray(...).tolist() works for dataframes and arrays alike.
    s3vm_model = QN_S3VM(np.asarray(X_lab_resampled).tolist(),
                         y_lab_ssl.tolist(),
                         np.asarray(X_unlab_resampled).tolist(),
                         random.Random(random_state))
    s3vm_model.train()
    # Evaluate on the data passed to this function rather than on notebook-level
    # variables. Wrapping the features in a dataframe keeps the call valid
    # whichever array type was passed in. test_s3vm already returns the row
    # starting with the name, so it is used as the result directly.
    result_train = test_s3vm(name,s3vm_model,pd.DataFrame(np.asarray(X_train_full)),y_train_full)
    
    return s3vm_model,result_train


## *Prop 1 Nearest Neighbour, Expectation Maximization, Label Propagation and Semi-Supervised Support Vector Machine (S3VM)*

In [None]:
def run_ssl_algorithms(X_train_final,y_train,X_test_final,y_test,labeled_size,sl_flag=False):
    '''
    Function to train and test the semi-supervised algorithms and print their results

    Parameters:
    X_train_final, y_train- training features and labels
    X_test_final, y_test- test features and labels
    labeled_size- size of the labeled part of the training data, forwarded to generate_ssl_dataset
    sl_flag- when True, SSL logistic regression and a supervised logistic regression
             trained on the labeled part only are run as well and added to the tables

    Return:
    None; two tables (training and testing performance) are printed
    '''
    X_lab, X_unlab, y_lab, y_unlab=generate_ssl_dataset(X_train_final,y_train,train_size=labeled_size)
    X_lab_copy, X_unlab_copy, y_lab_copy=copy.deepcopy(X_lab), copy.deepcopy(X_unlab),copy.deepcopy(y_lab)
    X_train_numpy = X_train_final.copy()
    y_train_numpy=y_train

    #1-NN
    print('One NN')
    one_nn_object = Prop_1NN(X_lab_copy, y_lab_copy, X_unlab_copy)
    one_nn_model,res_train_1nn=one_nn_object.train_prop_1nn('Prop 1 Nearest Neighbour',0.95,0.2)
    res_test_1nn=one_nn_object.test_prop_1nn('Prop 1 Nearest Neighbour',one_nn_model,X_test_final,y_test,0.2)

    #EM
    print('EM')
    em_object = Expectation_Maximization(X_lab_copy, y_lab_copy, X_unlab_copy,y_unlab)
    em_model,res_train_em= em_object.train_em('Expectation Maximization',X_train_numpy,y_train_numpy,0.5)
    res_test_em=em_object.test_em('Expectation Maximization',em_model,X_test_final,y_test,0.5)
   
    #Label Propagation
    print('Label Propagation')
    label_prop_object = Label_Propagation(X_lab_copy, y_lab_copy, X_unlab_copy,y_unlab)
    best_label_prop_model,res_train_label_prop=label_prop_object.train_label_prop('Label Propagation',0.2)
    res_test_label_prop=label_prop_object.test_label_prop('Label Propagation',best_label_prop_model,X_test_final,y_test,0.2)

    #S3VM
    # NOTE(review): the S3VM results are computed but, as before, not included
    # in the printed tables - confirm whether they should be reported.
    best_s3vm_model,res_train_s3vm=S3VM_function('Semi Supervised SVM',X_train_final, y_train)
    # The test features are wrapped in a dataframe so the call is valid whether
    # X_test_final is a dataframe or a numpy array.
    res_test_s3vm=test_s3vm('Semi Supervised SVM',best_s3vm_model,
                            pd.DataFrame(np.asarray(X_test_final)),np.asarray(y_test).tolist())

    ssl_train_data=[res_train_1nn,res_train_em,res_train_label_prop]
    ssl_test_data=[res_test_1nn,res_test_em,res_test_label_prop]

    if sl_flag:
        print('SSL Log Reg')
        ssl_log_reg_object = SSL_Log_Reg(X_lab_copy, y_lab_copy, X_unlab_copy,y_unlab)
        ssl_log_reg_model,res_train_ssl_log_reg=ssl_log_reg_object.train_ssl_log_reg('SSL Log Reg',0.95,0.2)
        res_test_ssl_log_reg=ssl_log_reg_object.test_ssl_log_reg('SSL Log Reg',ssl_log_reg_model,X_test_final,y_test,0.2)
       
        print('SL Logistic Regression')
        log_reg_param_grid ={'C': [10], 'solver': ['lbfgs']} 
        log_reg_clf = LogisticRegression()
        log_reg_object = Logisitic_Regression(log_reg_clf,log_reg_param_grid,X_lab_copy,y_lab_copy)
        best_log_reg_model,res_train_log_reg=log_reg_object.train_log_reg('Logistic Regression',0.2)
        res_test_log_reg=log_reg_object.test_log_reg('Logistic Regression',best_log_reg_model,X_test_final,y_test,0.2)

        ssl_train_data+=[res_train_ssl_log_reg,res_train_log_reg]
        ssl_test_data+=[res_test_ssl_log_reg,res_test_log_reg]

    # Both modes print the same two tables; only the rows differ.
    print('Performance of SSL Algorithms on '+str(round(labeled_size*100))+'% Labeled Data during Training:')
    generate_test_table(ssl_train_data)
    print()
    print()
    print('Performance of SSL Algorithms on '+str(round(labeled_size*100))+'% Labeled Data during Testing:')
    generate_test_table(ssl_test_data)

#### **SSL Results for 40% Labeled Data**

In [None]:
run_ssl_algorithms(X_train_final,y_train,X_test_final,y_test,labeled_size=0.40,sl_flag=False)

#### **SSL Results for 30% Labeled Data**

In [None]:
run_ssl_algorithms(X_train_final,y_train,X_test_final,y_test,labeled_size=0.30,sl_flag=False)

#### **SSL Results for 20% Labeled Data**

In [None]:
run_ssl_algorithms(X_train_final,y_train,X_test_final,y_test,labeled_size=0.20,sl_flag=False)

#### **SSL Results for 10% Labeled Data**

In [None]:
run_ssl_algorithms(X_train_final,y_train,X_test_final,y_test,labeled_size=0.10,sl_flag=False)

# **EXPERIMENTS**

##  **(A)** *Sensitivity/ Perturbation Analysis*

It is done by adding noise to the numerical features

In [None]:
def add_gaussian_noise(x,mu,std,rng=None):
    '''
    Add gaussian noise to dataset

    Parameters:
    x- numpy array to perturb (not modified)
    mu- mean of the noise
    std- standard deviation of the noise
    rng- optional random generator exposing normal() (e.g. np.random.default_rng(seed))
         for reproducible noise; when omitted the global numpy random state is used

    Return:
    x_noise- array of the same shape as x with the noise added
    '''
    sampler = np.random if rng is None else rng
    noise = sampler.normal(mu, std, size = x.shape)
    x_noise = x + noise
    return x_noise 

In [None]:
# Positions of the numeric features among the columns kept by SelectKBest.
# Feature selection may have dropped a numeric column, so the first six columns
# are not guaranteed to be the numeric ones.
numeric_column_idx=[i for i, feat in enumerate(selected_features) if feat in numeric_features]
X_train_noisy_final=X_train_final.copy()
X_noisy_numerical=add_gaussian_noise(X_train_final[:,numeric_column_idx],0,0.15)
X_train_noisy_final[:,numeric_column_idx]=X_noisy_numerical

#### **SSL Results for Noisy Labeled Data**

In [None]:
print('Performance after adding noise to the labeled data:')
run_ssl_algorithms(X_train_noisy_final,y_train,X_test_final,y_test,labeled_size=0.40,sl_flag=False)

##  **(B)** *Convert SL dataset to a SSL dataset*

In [None]:
run_ssl_algorithms(X_train_final,y_train,X_test_final,y_test,labeled_size=0.30,sl_flag=True)