### 1. Library and Data Setup

In [None]:
!pip install yellowbrick
!pip install xgboost==1.4
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,cross_validate,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,precision_recall_curve,roc_auc_score
from sklearn.metrics import average_precision_score

import xgboost as xgb
import pickle

from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import (
    RandomUnderSampler,
    AllKNN,
    TomekLinks,
    NearMiss
)
from imblearn.over_sampling import RandomOverSampler, SMOTE
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ROCAUC,confusion_matrix

In [None]:
# copying data
!cp  /content/drive/MyDrive/PG_Diploma_AI_ML_2021_UOHYD/PGDAIML_Project_Spam_Clustering/datos/my_final_messages_dt.csv .

In [None]:
# Importing data
messages_df = pd.read_csv('my_final_messages_dt.csv')

In [None]:
# glance at data
messages_df.head()

In [None]:
# totals per spam class
messages_df.value_counts('message_flag')

In [None]:
# Features X and target Y.
# The target is extracted here, before the NaN clean-up cell rebinds the
# 'message_flag' column, so missing flags are filled with 0.0 at extraction
# time; otherwise the Series handed to train_test_split would keep its NaNs.
X_message_preview = messages_df['message_preview']
Y_message_flag = messages_df['message_flag'].fillna(0.0)

In [None]:
# Replace missing (NaN) message flags with 0.0
messages_df['message_flag'] = messages_df['message_flag'].replace(np.nan,0.0)

### 2. Text Features and Data Splitting

In [None]:
# Data splitting 
X_train_message_prev,X_test_message_prev,Y_train_message_flag,Y_test_message_flag = train_test_split(X_message_preview,
                                                                                                     Y_message_flag,random_state=2021,
                                                                                                     stratify=Y_message_flag,test_size=0.25)

In [None]:
print(f'1.X_Training set shape is {X_train_message_prev.shape}\n2.X_Testing set shape is {X_test_message_prev.shape}\n3.Y_Training message_flag set shape {Y_train_message_flag.shape}\n4.Y_Testing message_flag set shape {Y_test_message_flag.shape}')


In [None]:
# Define text tfidf vectorizer
my_text_vectorizer = TfidfVectorizer(ngram_range=(1,2),max_features=None,min_df=0.01)

In [None]:
# fitting the text vectorizer on training set
my_text_vec_fit = my_text_vectorizer.fit(X_train_message_prev)

In [None]:
# creating training and testing vectors
X_train_msg_vec = my_text_vec_fit.transform(X_train_message_prev)
X_test_msg_vec= my_text_vec_fit.transform(X_test_message_prev)

In [None]:
# Standardization initializer
my_txt_scaler = StandardScaler()

In [None]:
# train and test standardized vectors
# The scaler is fitted on the training split only; the test split is then
# transformed with those same statistics so both splits share one feature scale.
X_train_msg_vec_std = my_txt_scaler.fit_transform(X_train_msg_vec.toarray())
X_test_msg_vec_std = my_txt_scaler.transform(X_test_msg_vec.toarray())

### 3. Custom Functions to build models on classification algorithms

In [None]:
def my_cf_naive_bayes(X_train,y_train,X_test,y_test,report=False):
    """Gaussian Naive Bayes: print ROC-AUC scores or draw the visual report.

    When ``report`` is exactly ``False`` the model is fitted and the ROC-AUC on
    the train and test splits is printed. For any other value a ROC-AUC plot is
    shown, followed by a confusion matrix for the same model.
    """
    classifier = GaussianNB()

    if report is not False:
        # Visual branch: the visualizer fits the wrapped classifier itself.
        roc_plot = ROCAUC(classifier, classes=["Legitimate", "Spam"])
        roc_plot.fit(X_train, y_train)
        roc_plot.score(X_test, y_test)
        roc_plot.show()
        print(f'--Conf Matrix')
        confusion_matrix(
            classifier,
            X_train, y_train, X_test, y_test,
            classes=['Legitimate', 'Spam'])
        return

    # Text branch: column 1 of predict_proba is used as the positive-class score.
    classifier.fit(X_train,y_train)
    print(f'----- Model on Train data -----')
    y_pred = classifier.predict_proba(X_train)
    print(f'Train ROC_AUC Score : {roc_auc_score(y_train,y_pred[:,1])}')
    print(f'----- Model on Test data -----')
    yt_pred = classifier.predict_proba(X_test)
    print(f'Test ROC_AUC Score : {roc_auc_score(y_test,yt_pred[:,1])}')
In [None]:
def my_cf_logisticregression(X_train,y_train,X_test,y_test,report=False):
    """Logistic regression: print ROC-AUC scores or draw the visual report.

    ``report is False`` fits the model and prints the train and test ROC-AUC;
    any other value shows a ROC-AUC plot followed by a confusion matrix.
    """
    estimator = LogisticRegression()
    wants_scores_only = report is False

    if wants_scores_only:
        estimator.fit(X_train,y_train)
        print(f'----- Model on Train data -----')
        y_pred = estimator.predict_proba(X_train)
        print(f'Train ROC_AUC Score : {roc_auc_score(y_train,y_pred[:,1])}')
        print(f'----- Model on Test data -----')
        yt_pred = estimator.predict_proba(X_test)
        print(f'Test ROC_AUC Score : {roc_auc_score(y_test,yt_pred[:,1])}')
        return

    # Visual branch: the visualizer fits the wrapped estimator itself.
    roc_view = ROCAUC(estimator, classes=["Legitimate", "Spam"])
    roc_view.fit(X_train, y_train)
    roc_view.score(X_test, y_test)
    roc_view.show()
    print(f'--Conf Matrix')
    confusion_matrix(
        estimator,
        X_train, y_train, X_test, y_test,
        classes=['Legitimate', 'Spam'])

In [None]:
def my_cf_rf(X_train,y_train,X_test,y_test,report=False):
    """Random forest: print ROC-AUC scores or draw the visual report.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    report : when exactly ``False`` (the default, matching the Naive Bayes and
        logistic-regression helpers) the model is fitted and the train/test
        ROC-AUC is printed; any other value shows a ROC-AUC plot followed by a
        confusion matrix.

    Returns
    -------
    None
    """
    model_rf = RandomForestClassifier(random_state=2022)

    if report is False:
        model_rf.fit(X_train,y_train)
        print(f'----- Model on Train data -----')
        # Column 1 of predict_proba is used as the positive-class score.
        y_pred = model_rf.predict_proba(X_train)
        print(f'Train ROC_AUC Score : {roc_auc_score(y_train,y_pred[:,1])}')
        print(f'----- Model on Test data -----')
        yt_pred = model_rf.predict_proba(X_test)
        print(f'Test ROC_AUC Score : {roc_auc_score(y_test,yt_pred[:,1])}')
    else:
        # The visualizer fits the wrapped model itself.
        visualizer = ROCAUC(model_rf, classes=["Legitimate", "Spam"])
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.show()
        print(f'--Conf Matrix')
        confusion_matrix(
                        model_rf,
                        X_train, y_train, X_test, y_test,
                        classes=['Legitimate', 'Spam'])

In [None]:
def my_cf_xgb(X_train,y_train,X_test,y_test,report=False):
    """XGBoost classifier: print ROC-AUC scores or draw the visual report.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    report : when exactly ``False`` (the default, matching the Naive Bayes and
        logistic-regression helpers) the model is fitted with early stopping
        and the train/test ROC-AUC is printed; any other value shows a ROC-AUC
        plot followed by a confusion matrix.

    Returns
    -------
    None
    """
    model_xgb = xgb.XGBClassifier(random_state=2022)

    if report is False:
        # NOTE(review): the test split is also the early-stopping eval_set, so
        # the reported test score is measured on data that steered training.
        # Confirm whether a separate validation split should be used instead.
        model_xgb.fit(X_train,y_train,early_stopping_rounds=10,eval_set=[(X_test,y_test)])
        print(f'----- Model on Train data -----')
        # Column 1 of predict_proba is used as the positive-class score.
        y_pred = model_xgb.predict_proba(X_train)
        print(f'Train ROC_AUC Score : {roc_auc_score(y_train,y_pred[:,1])}')
        print(f'----- Model on Test data -----')
        yt_pred = model_xgb.predict_proba(X_test)
        print(f'Test ROC_AUC Score : {roc_auc_score(y_test,yt_pred[:,1])}')
    else:
        # The visualizer fits the wrapped model itself (no early stopping here).
        visualizer = ROCAUC(model_xgb, classes=["Legitimate", "Spam"])
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.show()
        print(f'--Conf Matrix')
        confusion_matrix(
                        model_xgb,
                        X_train, y_train, X_test, y_test,
                        classes=['Legitimate', 'Spam'])

### Classification Technique: SGD Classifier from Scratch

In [None]:
# Initialize weights
def initialize_weights(dim):
    '''Return a zero weight vector and a zero bias.

    dim : one sample (array-like with ``.shape``); its first dimension sets the
          number of weights.

    Returns ``(w, b)`` where ``w`` is a float array of zeros of length
    ``dim.shape[0]`` and ``b`` is ``0.0``.
    '''
    n_features = dim.shape[0]
    # Float dtype on purpose: an integer vector cannot take in-place
    # fractional gradient steps.
    w = np.zeros(n_features, dtype=float)
    b = 0.0
    return w,b

In [None]:
# Compute sigmoids
def sigmoid(z):
    '''Logistic function 1 / (1 + exp(-z)), applied element-wise to z.'''
    denominator = 1 + np.exp(-z)
    return 1.0/denominator

In [None]:
# log loss 
def logloss(y_true,y_pred):
    '''Mean binary cross-entropy (natural logarithm) between labels and probabilities.

    y_true : scalar or array of 0/1 labels.
    y_pred : scalar or array of predicted probabilities for the positive class.

    Returns the mean of ``-(y*ln(p) + (1-y)*ln(1-p))`` as a float.
    '''
    eps = 1e-15
    labels = np.asarray(y_true, dtype=float)
    # Clip away exact 0 and 1 so the logarithm never yields -inf or nan.
    probs = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    log_loss = -1 * np.mean(labels*np.log(probs) + (1-labels)*np.log(1-probs))
    return log_loss

In [None]:
# Compute gradients
def gradient_dw(x,y,w,b,alpha,N):
    '''Gradient w.r.t. the weights for one sample.

    Returns ``(y - sigmoid(w.x + b)) * x - (alpha / N) * w``: the residual
    times the sample, minus a penalty term proportional to the weights.
    '''
    residual = y - sigmoid(np.dot(w,x.T)+b)
    data_term = np.dot(residual,x)
    penalty_term = (alpha/N)*w
    return data_term - penalty_term

In [None]:
# Compute gradients
def gradient_db(x,y,w,b):
    '''Gradient w.r.t. the bias for one sample: ``y - sigmoid(w.x + b)``.'''
    activation = sigmoid(np.dot(w,x.T) + b)
    return y - activation
    

In [None]:
def train(X_train,y_train,X_test,y_test,epochs,alpha,eta0):
    '''Fit logistic regression with one gradient step per training sample.

    X_train, y_train : training feature matrix (rows are samples) and labels.
    X_test, y_test   : test feature matrix and labels, used only for the
                       per-epoch test loss.
    epochs           : number of passes over the training data.
    alpha            : penalty strength forwarded to gradient_dw.
    eta0             : step size applied to both gradients.

    Returns ``(w, b, train_loss_avg, test_loss_avg)``; each list holds one
    value per epoch.
    '''
    train_labels = np.asarray(y_train)
    test_labels = np.asarray(y_test)
    n_train = X_train.shape[0]
    train_loss_avg =[]
    test_loss_avg = []
    w,b = initialize_weights(X_train[0])
    num_epochs=0

    while num_epochs <epochs:
        num_epochs+=1
        epoch_losses = []
        # Iterate over the training split alone. Zipping train and test
        # together stops at the shorter split and drops the remaining
        # training rows.
        for x,y in zip(X_train,train_labels):
            dw = gradient_dw(x,y,w,b,alpha,n_train)
            db = gradient_db(x,y,w,b)
            # The loss is recorded with the weights in effect before this step.
            y_pred = sigmoid(np.dot(w,x.T)+b)
            epoch_losses.append(logloss(y, y_pred))
            w = w + eta0 * dw
            b = np.mean(b + eta0 * db)
        epoch_train_loss = np.mean(epoch_losses)
        # Score the whole test split once, with the end-of-epoch weights.
        test_pred = sigmoid(np.dot(X_test,w)+b)
        epoch_test_loss = logloss(test_labels, test_pred)
        print(f'num_epochs {num_epochs} and avg train loss {epoch_train_loss} and avg test loss {epoch_test_loss}')
        train_loss_avg.append(epoch_train_loss)
        test_loss_avg.append(epoch_test_loss)

    return w,b,train_loss_avg,test_loss_avg

In [None]:
# Parameters for the from-scratch SGD training run below
alpha=0.0001  # penalty strength; gradient_dw scales the weights by alpha/N
eta0=0.0001  # step size applied to both the weight and bias gradients
# NOTE(review): N is not passed to train(), which derives the sample count
# from X_train.shape[0] itself.
N=len(X_train_msg_vec_std)
epochs= 20  # number of passes over the data

In [None]:
w,b,train_loss,test_loss = train(X_train_msg_vec_std, Y_train_message_flag,X_test_msg_vec_std,Y_test_message_flag,epochs,alpha,eta0)

In [None]:
SGD_train_test_loss_df = pd.DataFrame({'train_loss':train_loss,'test_loss':test_loss}
                                      )

In [None]:
def pred(w,b, X, threshold=0.78):
    '''Turn sigmoid scores into hard 0/1 labels.

    w, b      : weight vector and bias.
    X         : feature matrix, one sample per row.
    threshold : a row is labelled 1 when ``sigmoid(w.x + b) >= threshold``;
                defaults to the 0.78 cut-off this notebook used.

    Returns an integer array with one label per row of X.
    '''
    features = np.asarray(X)
    if features.shape[0] == 0:
        return np.array([], dtype=int)
    # One matrix-vector product replaces the per-row Python loop.
    scores = sigmoid(np.dot(features, w) + b)
    return (scores >= threshold).astype(int)
# Accuracy is the share of exact label matches. Summing signed differences
# (labels minus predictions) lets false positives and false negatives cancel.
print(np.round(np.mean(np.asarray(Y_train_message_flag) == pred(w,b,X_train_msg_vec_std)),4))
print(np.round(np.mean(np.asarray(Y_test_message_flag) == pred(w,b,X_test_msg_vec_std)),4))

### 4. Base Approaches - Modeling Techniques

In [None]:
my_cf_naive_bayes(X_train_msg_vec_std, Y_train_message_flag,X_test_msg_vec_std,Y_test_message_flag,report=True)


In [None]:
my_cf_logisticregression(X_train_msg_vec_std, Y_train_message_flag,X_test_msg_vec_std,Y_test_message_flag,report=True)

In [None]:
my_cf_xgb(X_train_msg_vec_std, Y_train_message_flag,X_test_msg_vec_std,Y_test_message_flag,report=True)



### 5. Class Imbalance - Random Under and Over Sampling Techniques

#### Sampling technique definitions

In [None]:
# Samplers iterated over in the sampling comparison loop further down: one
# random under-sampler and one random over-sampler, both with random_state=0.
my_samplers_under_over_dict = {

    'random_under': RandomUnderSampler(
        sampling_strategy='auto',
        random_state=0,
        replacement=False),
    'random_over': RandomOverSampler(
        sampling_strategy='auto',
        random_state=0)
}

In [None]:
# Candidate under-sampling strategies keyed by a short name.
# NOTE(review): this dictionary is not referenced by any cell visible in this
# notebook; n_jobs=20 is hard-coded for the neighbour-based samplers.
my_under_sampler_dict = {

    'random': RandomUnderSampler(
        sampling_strategy='auto',
        random_state=0,
        replacement=False),

    'tomek': TomekLinks(
        sampling_strategy='auto',
        n_jobs=20),

    'allknn': AllKNN(
        sampling_strategy='auto',
        n_neighbors=8,
        kind_sel='all',
        n_jobs=20),
    
    'nm1': NearMiss(
        sampling_strategy='auto',
        version=1,
        n_neighbors=8,
        n_jobs=20)
}


In [None]:
# Candidate over-sampling strategies keyed by a short name.
# NOTE(review): this dictionary is not referenced by any cell visible in this
# notebook.
my_over_sampler_dict = {

    'rand_over' : RandomOverSampler(
        sampling_strategy='auto',
        random_state=0),

    'smote' : SMOTE(
        sampling_strategy='auto',  # samples only the minority class
        random_state=0,  # for reproducibility
        k_neighbors=8,
        n_jobs=20)

}

#### Custom functions that build a pipeline to compare how each algorithm performs under different sampling techniques

In [None]:
def my_cf_make_model(x_train,y_train,sampler):
    """Cross-validate a ``sampler`` + logistic-regression pipeline.

    Runs 2-fold cross-validation scored with ROC-AUC and returns the
    ``(mean, std)`` of the per-fold test scores.
    """
    pipeline = make_pipeline(sampler, LogisticRegression())
    fold_scores = cross_validate(
        pipeline,
        x_train,
        y_train,
        scoring="roc_auc",
        cv=2
    )['test_score']
    return fold_scores.mean(), fold_scores.std()


In [None]:
def my_cf_imbalance_sample_selections(x_train,y_train,sampler):
    """Print the mean 3-fold ROC-AUC of three classifiers behind ``sampler``.

    Logistic regression, Gaussian Naive Bayes and XGBoost are each placed in a
    pipeline after the sampler and cross-validated; one line is printed per
    model. Nothing is returned.
    """
    candidates = [
        ('logistic', LogisticRegression()),
        ('naivebayes', GaussianNB()),
        ('xgb', xgb.XGBClassifier(random_state=2022)),
    ]

    for mname, estimator in candidates:
        cv_output = cross_validate(
            make_pipeline(sampler, estimator),
            x_train,
            y_train,
            scoring="roc_auc",
            cv=3
        )
        mean_score = cv_output['test_score'].mean()
        print(f'model : {mname} : performaces:{ mean_score} ')

In [None]:
for key,sampler in my_samplers_under_over_dict.items():
    print(f'Sampling Technique: {key}')
    my_cf_imbalance_sample_selections(X_train_msg_vec_std, Y_train_message_flag,sampler)

### 6. Model Selection and Hyperparameter Tuning

In [None]:
# Oversample the standardized training split with a seeded RandomOverSampler.
# NOTE(review): the variable name says "RUS" but the object is a
# RandomOverSampler — confirm which sampler was intended.
my_RUS_spec = RandomOverSampler(random_state=2020)
X_msgprev_res,Y_msg_flag_res =my_RUS_spec.fit_resample(X_train_msg_vec_std,Y_train_message_flag)

In [None]:
modelo_gbm = xgb.XGBClassifier(random_state=2021)

In [None]:
# Exhaustive search space for the XGBoost classifier.
# The value lists multiply out to 4*3*4*2*3*2*2*2*2*3 = 13,824 combinations.
param_grid = [
    {'n_estimators':[900,1000,1500,2000],
    'max_depth':[5,7,10],
    'learning_rate':[0.1,0.3,0.5,0.8],
     'booster':['dart','gbtree'],
     'gamma':[0.1,0.3,0.5],
     'subsample':[0.5,0.9],
     'colsample_bytree':[0.5,0.9],
     'colsample_bylevel':[0.5,0.9],
     'colsample_bynode':[0.5,0.9],
     'reg_lambda':[1,10,20]}
]

In [None]:
# set up the search: ROC-AUC scoring, 3-fold CV, refit on the best combination
busqueda = GridSearchCV(modelo_gbm, param_grid,scoring='roc_auc', cv=3, refit=True )

# find best hyperparameters
# NOTE(review): this fits on the original standardized training split, not on
# the oversampled X_msgprev_res / Y_msg_flag_res — confirm that is intended.
busqueda.fit(X_train_msg_vec_std, Y_train_message_flag)

In [None]:
def my_cf_hyp_xgb(X_train,y_train,X_test,y_test):
    """Fit XGBoost with fixed hyper-parameters and print the train ROC-AUC.

    The test split is used only as the early-stopping evaluation set
    (40 rounds); only the training score is printed.
    """
    fixed_params = dict(
        n_estimators=2500,
        max_depth=7,
        learning_rate=0.898568,
        booster='dart',
        gamma=0.010000,
        subsample=0.614947,
        colsample_bytree=0.5,
        colsample_bylevel=0.5,
        colsample_bynode=0.5,
        reg_lambda=10,
    )
    tuned_model = xgb.XGBClassifier(**fixed_params)
    tuned_model.fit(X_train,y_train,early_stopping_rounds=40,eval_set=[(X_test,y_test)])

    print(f'----- Model on Train data -----')
    # Column 1 of predict_proba is used as the positive-class score.
    y_pred = tuned_model.predict_proba(X_train)
    print(f'Train ROC_AUC Score : {roc_auc_score(y_train,y_pred[:,1])}')

In [None]:
my_cf_hyp_xgb(X_msgprev_res,Y_msg_flag_res,X_test_msg_vec_std,Y_test_message_flag)

In [None]:
model_xgb = xgb.XGBClassifier(objective='binary:logistic',
                              n_estimators=1000,
                              eval_metric='auc',
                              use_label_encoder=False,
                              max_depth=7,
                              learning_rate=0.898568,
                              booster='dart',
                              gamma=0.010000,
                              subsample=0.614947,
                              colsample_bytree=0.5,
                              colsample_bylevel=0.5,
                              colsample_bynode=0.5,
                              reg_lambda=10)

In [None]:
# Fit the final classifier on the standardized training split.
# NOTE(review): the test split is also the early-stopping eval_set, so the
# stopping round is chosen on the held-out data — confirm whether a separate
# validation split is needed.
model_xgb.fit(X_train_msg_vec_std,
              np.array(Y_train_message_flag),
              early_stopping_rounds=2,
              eval_set=[(X_test_msg_vec_std,np.array(Y_test_message_flag))])

### 7. LSH from Scratch

In [None]:
def generate_hyperplanes(n,tot):
    """Draw ``n`` random hyperplane normals of length ``tot``.

    NumPy's global random state is reset to seed 0 on every call, so the same
    planes are produced each time. Each plane is sampled from a standard
    normal distribution; the planes are stacked into one array.
    """
    np.random.seed(0)
    planes = [np.random.normal(0,1,tot) for _ in range(n)]
    return np.array(planes)

In [None]:
# Creating five hyperplanes
hypers = generate_hyperplanes(5,X_train_msg_vec_std.shape[1])

In [None]:
def wt_trans_x(sparse_mat,hyper_array):
    """Project every row of ``sparse_mat`` onto the hyperplanes.

    Returns a list with one entry per row, each being
    ``np.dot(row, hyper_array.T)``.
    """
    planes_t = hyper_array.T
    return [np.dot(row, planes_t) for row in sparse_mat]

In [None]:
def wt_trans_x_np(sparse_mat,hyper_array):
    """Project all rows at once: ``sparse_mat.dot(hyper_array.T)``."""
    planes_t = hyper_array.T
    return sparse_mat.dot(planes_t)

In [None]:
def hash_key(vector):
    """Map a vector to a tuple of bits: 1 where the entry is > 0, else 0."""
    return tuple(1 if component > 0 else 0 for component in vector)

In [None]:
def create_hash_key(vec):
    """Project one feature vector onto the hyperplanes and return its hash key.

    vec : a single dense feature vector, either 1-D of length d or 2-D of
          shape (1, d), where d matches the columns of the module-level
          ``hypers``.

    Returns the tuple of 0/1 bits produced by ``hash_key`` for the projections.
    """
    # Project the vector as a whole. Routing it through wt_trans_x would
    # iterate its elements and hand hash_key arrays rather than scalars.
    projections = np.asarray(np.dot(vec, hypers.T)).ravel()
    hk = hash_key(projections)
    return hk

In [None]:
def create_hash_table(arr):
    """Group row positions of ``arr`` by their hash key.

    Returns a dict mapping each key from ``hash_key`` to the list of indices
    (in ascending order) of the rows that produced it.
    """
    buckets = {}
    for position, projected in enumerate(arr):
        buckets.setdefault(hash_key(projected), []).append(position)
    return buckets

In [None]:
# Calculating W_trans_X on the training features and building a hash table from it
x_train = wt_trans_x(X_train_msg_vec_std,hypers)
x_train_hast_table = create_hash_table(x_train)

In [None]:
def pred_nearest_neighbor_lsh_labels(train_data,train_features,test_features,x_hash_table,num_of_nbrs):
    """Predict one label per key of ``test_features`` by neighbour majority vote.

    For every key of ``test_features`` the training rows stored under the same
    key in ``x_hash_table`` are ranked by cosine similarity. The labels of the
    ``num_of_nbrs`` most similar rows are read from the first column of
    ``train_data`` and the most frequent label is kept.

    Parameters
    ----------
    train_data : object indexed with ``.iloc[rows, 0]`` to fetch labels.
    train_features : training feature rows, indexed by row position.
    test_features : dict whose keys are looked up in ``x_hash_table``.
    x_hash_table : dict mapping a key to a list of training row positions.
    num_of_nbrs : number of nearest neighbours that vote.

    Returns
    -------
    list with one predicted label per key of ``test_features``.

    Raises
    ------
    KeyError if a key of ``test_features`` is missing from ``x_hash_table``.
    """
    from collections import Counter
    from numpy.linalg import norm

    #list to store indices of the required NNB's
    label_idx=list()
    # a dict to store the counted predicted labels using the indices
    label_pred_dict=dict()
    # a list to store the finalized predicted label
    pred_labels=list()

    for fet in test_features.keys():
        # Use the table passed in by the caller instead of a module-level global.
        neighbours_x = x_hash_table[fet]
        neighbours_x_arr = np.array(neighbours_x)
        cosine_similarities=[]
        for nbr in neighbours_x_arr:
            # NOTE(review): `fet` is the dictionary key itself, yet `.T` and
            # `.toarray()` are called on it, and `.todense()` / `.toarray()` on
            # the rows of train_features. The tables built by create_hash_table
            # use tuple keys — confirm which objects callers should pass here.
            cos_sim=np.dot(train_features[nbr],fet.T).todense().item()/(norm(train_features[nbr].toarray())*norm(fet.T.toarray()))
            cosine_similarities.append(cos_sim)
        # Keep the positions of the num_of_nbrs most similar rows in the bucket.
        n_11_neighbors=neighbours_x_arr[np.argsort(cosine_similarities)[::-1][:num_of_nbrs]]
        label_idx.append(n_11_neighbors)

    for idx,item in enumerate(label_idx):
        # NOTE(review): `.iloc[item, 0]` needs a two-dimensional object whose
        # first column holds the labels — verify against the caller.
        label_pred_dict[idx]=Counter(list(train_data.iloc[item,0]))

    for labels in label_pred_dict.values():
        pred_labels.append(max(labels,key=lambda x:labels[x]))

    return pred_labels

In [None]:
x_test = wt_trans_x(X_test_msg_vec_std,hypers)
x_test_hast_table = create_hash_table(x_test)

In [None]:
# predicting labels of test data by providing training data text features
my_pred_labels = pred_nearest_neighbor_lsh_labels(X_message_preview,X_train_msg_vec_std,x_test_hast_table,x_train_hast_table,11)

### 8. File Collection for Model Deployment

In [None]:
# Persist the fitted classifier; the context manager closes the file even if
# pickling raises.
with open('sms_email_classifier.pkl','wb') as modelo_gbm_pickle_file:
    pickle.dump(model_xgb,modelo_gbm_pickle_file)

In [None]:
# Persist the fitted TF-IDF vectorizer; the context manager closes the file
# even if pickling raises.
# NOTE(review): the classifier was fitted on StandardScaler output, but the
# scaler itself is not saved here — confirm how deployment should scale inputs.
with open('sms_email_tfidf_vect.pkl','wb') as model_tfidf_vect_file:
    pickle.dump(my_text_vectorizer,model_tfidf_vect_file)

In [None]:
my_df = pd.DataFrame({'message_preview':['Your account is locked due to inactivity reactivate it Bank of america']})

In [None]:
my_msg = my_df.message_preview

In [None]:
my_msg

In [None]:
# Reload the saved vectorizer, closing the file handle once it has been read.
# Only unpickle files produced by this notebook: pickle can execute arbitrary code.
with open('/content/sms_email_tfidf_vect.pkl','rb') as tfidf_pickle_file:
    tf_dep = pickle.load(tfidf_pickle_file)

In [None]:
txt_msg = tf_dep.transform(['cannalert grand opening  crestmore smoke house time patients  digitextracted  gram  digitextracted ths bogo carts edibles wax  digitextracted  valley blvd bloomington'])

In [None]:
# Reload the saved classifier, closing the file handle once it has been read.
# Only unpickle files produced by this notebook: pickle can execute arbitrary code.
with open('/content/sms_email_classifier.pkl','rb') as classifier_pickle_file:
    model_dep = pickle.load(classifier_pickle_file)

In [None]:
def my_dep_text_transformer(msg):
    """Vectorize ``msg`` with the reloaded TF-IDF vectorizer ``tf_dep``."""
    # NOTE(review): no standardization is applied here, while the pickled
    # classifier was fitted on standardized features — confirm intended.
    vectorized = tf_dep.transform(msg)
    return vectorized

In [None]:
def my_dep_predictor(msg_trans):
    """Return the reloaded classifier's prediction for the first row of ``msg_trans``."""
    predictions = model_dep.predict(msg_trans)
    return predictions[0]

In [None]:
model_dep.predict_proba(txt_msg)[0]

In [None]:
a = my_dep_text_transformer(['wesley create income streams online Great pay Why not begin today! http://onlineinformations.net To unsub text STOP reply HELP for help'])

In [None]:
model_dep.predict(a)[0]