### IMPORTS



In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer


from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
#from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from typing import Union, Tuple, List, Any
import operator
%pip install sweetviz
%pip install imblearn
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from collections import OrderedDict
import sweetviz as sv

In [None]:
# Seed both global random number generators (NumPy's and the standard
# library's) with a fixed value so repeated runs of the notebook start
# from the same random state.
np.random.seed(0)
random.seed(0)

### Load Data

In [None]:
def load_dataset(path: Union[str, Path], return_Xy: bool = True) -> Union[Tuple[pd.DataFrame, pd.Series], pd.DataFrame]:
    """Read the churn CSV and apply the light cleaning shared by EDA and modelling.

    Cleaning steps:
      * drop the ``customerID`` column;
      * turn single-space ``TotalCharges`` entries into NaN and cast the
        column to float;
      * encode the two-valued text columns (gender, Partner, Dependents,
        PhoneService, PaperlessBilling) as 0/1.

    Args:
        path: Location of the CSV file (anything ``pd.read_csv`` accepts).
        return_Xy: When True return ``(X, y)`` where ``y`` is the ``Churn``
            column and ``X`` is every other column; when False return the
            whole cleaned frame, ``Churn`` included.

    Returns:
        ``(X, y)`` as a DataFrame and a Series, or the cleaned DataFrame.
    """
    df = pd.read_csv(path)

    # TODO: split into separate "load" and "clean" functions.
    df = df.drop(columns=['customerID'])

    # A single space marks a missing value in TotalCharges; convert it to NaN
    # so the column can be cast to float. The NaNs are left for later imputation.
    df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan).astype(float)

    # Two-valued text columns -> 0/1. Values missing from a mapping become NaN
    # (Series.map semantics).
    yes_no = {'No': 0, 'Yes': 1}
    binary_encodings = {
        'gender': {'Male': 0, 'Female': 1},
        'Partner': yes_no,
        'Dependents': yes_no,
        'PhoneService': yes_no,
        'PaperlessBilling': yes_no,
    }
    for column, encoding in binary_encodings.items():
        df[column] = df[column].map(encoding)

    if return_Xy:
        return df.drop(columns=['Churn']), df['Churn']
    return df


### EDA

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive/; the dataset
# path used further down lives under this mount point.
from google.colab import drive
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Location of the dataset on the mounted Google Drive.
_path = "/content/drive/MyDrive/AprendizadoDeMaquina/T2/ibm.csv"
# Feature frame and target used for modelling.
X, y = load_dataset(Path(_path))
# Whole cleaned frame (target column included) used by the EDA cells.
df = load_dataset(Path(_path), return_Xy=False)


In [None]:
# Preview the first rows of the cleaned frame to sanity-check the load.
print(df.head())

In [None]:
# Check column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df.info()

In [None]:
# Count missing values per column.
print(df.isnull().sum())

In [None]:
# Summary statistics as produced by DataFrame.describe().
print(df.describe())

In [None]:
# Class balance of the target: number of rows per distinct Churn value.
print(df.Churn.value_counts())

In [None]:
# Draw seaborn's pairplot grid for the frame and display it.
sns.pairplot(df)
plt.show()

In [None]:
# Correlation matrix.
# The frame still holds text columns (the one-hot candidates and the Yes/No
# target), which cannot be correlated; newer pandas versions raise on them
# instead of skipping them, so restrict to numeric/bool columns explicitly.
corr = df.select_dtypes(include=["number", "bool"]).corr()
plt.figure(figsize=(16, 6))
sns.heatmap(corr,
            annot=True,
            cmap="Blues"
            )
plt.show()

In [None]:
# Build the automated sweetviz report for the full frame.
report = sv.analyze(df)
# Write the report to an HTML file.
# NOTE(review): the file name 'Advertising.html' does not look related to this
# churn dataset — confirm it is intended.
report.show_html('Advertising.html')

### PIPELINE

In [None]:
def evaluate_data(X_train, y_train, X_test, y_test, run: int, rus: bool = True):
    """Tune and score SVC, XGBClassifier and GaussianNB on one train/test split.

    For each estimator a pipeline (optional random undersampling ->
    preprocessing -> classifier) is tuned with ``RandomizedSearchCV`` on the
    training data, refitted, and used to predict the test data. The
    predictions are scored with this notebook's hand-written ``accuracy``,
    ``precision``, ``recall`` and ``f1Measure`` functions.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        run: Identifier of this split; used as the top-level key of the result.
        rus: When True, a ``RandomUnderSampler`` step is placed at the start
            of the pipeline.

    Returns:
        ``{run: {estimator_class_name: {'accuracy': ..., 'precision': ...,
        'recall': ..., 'f1measure': ...}}}``.
    """
    # Work on copies so nothing done here can leak back into the caller's folds.
    X_train_ = X_train.copy()
    y_train_ = y_train.copy()
    X_test_ = X_test.copy()
    y_test_ = np.array(y_test.copy())

    estimators = (
        SVC(random_state=0),
        XGBClassifier(random_state=0, scale_pos_weight=3, eval_metric='logloss'),
        GaussianNB(),
    )

    # One search space per estimator, in the same order as `estimators`.
    # Each is deliberately kept as a one-element list of dicts, exactly as
    # before, so the randomized search receives the same input structure.
    param_grid = [
        [{
            # SVM
            'clf__kernel': ['rbf', 'linear'],
            'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'clf__gamma': [0.001, 0.01, 0.1, 1, 10, 100]
        }],
        [{
            # XGBOOST
            "clf__learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
            "clf__max_depth": [3, 4, 5, 7, 9, 11, 13, 15],
            "clf__min_child_weight": [1, 3, 5, 7],
            "clf__gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
            "clf__colsample_bytree": [0.3, 0.4, 0.5, 0.7],
            "clf__n_estimators": np.arange(100, 300, 25),
        }],
        [{
            # NAIVE BAYES
            'clf__var_smoothing': [1e-9]
        }]
    ]

    results = {}
    for clf, search_space in zip(estimators, param_grid):
        steps = [("pp", _build_preprocessor()), ("clf", clf)]
        if rus:
            # Undersampling goes first so it sees the raw training rows.
            steps.insert(0, ("rus", RandomUnderSampler(random_state=0)))
        pipe = Pipeline(steps=steps)

        # Hyperparameter tuning; refit=True makes `search` predict with the
        # best configuration refitted on the whole training split.
        search = RandomizedSearchCV(pipe, search_space, verbose=True, n_iter=20, refit=True, random_state=0)
        search.fit(X_train_, y_train_)
        y_pred = search.predict(X_test_)

        results[type(clf).__name__] = {
            'accuracy': accuracy(y_test_, y_pred),
            'precision': precision(y_test_, y_pred),
            'recall': recall(y_test_, y_pred),
            'f1measure': f1Measure(y_test_, y_pred),
        }
    return {run: results}


def _build_preprocessor():
    """Return a fresh, unfitted ColumnTransformer for the churn features."""
    # Columns treated as numeric: KNN imputation (fills the NaNs left in
    # TotalCharges) followed by min-max scaling.
    numeric_features = ["MonthlyCharges", "TotalCharges", "tenure", "gender", "Partner", "Dependents",
                        "PhoneService", "PaperlessBilling"]
    numeric_transformer = Pipeline(
        steps=[("knnImp", KNNImputer(n_neighbors=3)), ("scaler", MinMaxScaler())]
    )

    # Columns treated as categorical: one-hot encoded, ignoring categories
    # that were not seen during fit.
    categorical_features = ["InternetService", "PaymentMethod", 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                            'DeviceProtection', 'TechSupport', 'StreamingMovies', 'Contract']
    categorical_transformer = Pipeline(
        steps=[("OneHot", OneHotEncoder(handle_unknown="ignore"))]
    )

    # NOTE(review): only the columns listed above are routed to a transformer;
    # confirm that leaving any other dataset column out is intended.
    return ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features)
        ]
    )



# K-fold cross validation

In [None]:
class Instance:
    """A payload paired with its class label, as consumed by the fold builders."""

    def __init__(self, instance_data: Any, label: str):
        self._instance_data, self._label = instance_data, label

    def data(self):
        """Return the payload given at construction."""
        return self._instance_data

    def label(self):
        """Return the label given at construction."""
        return self._label

    def __repr__(self):
        # Rendered as "<payload>-<label>".
        return "{}-{}".format(self._instance_data, self._label)

In [None]:
def remainders_merge_policy(k_folds, remainders):
    """Distribute leftover items over the folds round-robin.

    The i-th remainder is appended to fold ``i % len(k_folds)``.

    Args:
        k_folds: Sequence of folds (each a sequence of items).
        remainders: Items that did not fit evenly into the folds.

    Returns:
        A new list of new fold lists; neither ``k_folds`` nor its inner folds
        are modified.

    Raises:
        ZeroDivisionError: If ``k_folds`` is empty while ``remainders`` is not.
    """
    # Copy every fold, not just the outer list: a shallow copy would share the
    # inner lists and the appends below would alter the caller's folds.
    merged = [list(fold) for fold in k_folds]
    for i, r in enumerate(remainders):
        merged[i % len(merged)].append(r)
    return merged

# non stratified
def random_k_folds(instance_list: List["Instance"], k: int, merge_remainders=True):
    """Split instances into ``k`` equally sized random folds (not stratified).

    The instances are shuffled with a generator seeded with 0, so the split
    is the same on every call for the same input.

    Args:
        instance_list: Items to split; the input itself is not modified.
        k: Number of folds; must be at least 1.
        merge_remainders: When True, the ``len(instance_list) % k`` leftover
            items are spread over the folds with ``remainders_merge_policy``
            and only the folds are returned. When False, ``(folds,
            remainders)`` is returned with the leftovers kept separate.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    instances = list(instance_list)
    random.Random(0).shuffle(instances)

    fold_size = len(instances) // k
    k_folds = [instances[i * fold_size:(i + 1) * fold_size] for i in range(k)]
    remainders = instances[k * fold_size:]
    if merge_remainders:
        return remainders_merge_policy(k_folds, remainders)
    return k_folds, remainders


def stratified_k_folds(instance_list: List[Instance], k: int, merge_remainders=True):
    """Split instances into ``k`` folds, giving each fold an equal share of every label.

    Each label's instances are split on their own with ``random_k_folds``;
    fold ``i`` of the result is the union of every label's fold ``i``, sorted
    by instance payload. Per-label leftovers are gathered (in sorted-label
    order) and either merged round-robin into the folds or returned
    separately as ``(folds, remainders)`` when ``merge_remainders`` is False.
    """
    instances = instance_list.copy()

    # Bucket the instances by label, keeping input order inside each bucket.
    by_label = {}
    for inst in instances:
        by_label.setdefault(inst.label(), []).append(inst)

    # One (folds, leftovers) pair per label, visited in sorted label order.
    per_label = [random_k_folds(by_label[lbl], k, merge_remainders=False) for lbl in sorted(by_label)]

    k_folds = []
    for fold_no in range(k):
        members = [inst for label_folds, _ in per_label for inst in label_folds[fold_no]]
        members.sort(key=operator.attrgetter('_instance_data'))
        k_folds.append(members)

    remainders = [inst for _, leftovers in per_label for inst in leftovers]
    if merge_remainders:
        return remainders_merge_policy(k_folds, remainders)
    return k_folds, remainders

# Smoke test on mock data: 11 instances whose payload is 0..10 and whose
# label cycles through n_classes consecutive letters starting at 'A',
# split into k folds by both strategies so the outputs can be compared by eye.
k=3
n_classes=3
l = [Instance(i, f"{chr(ord('A')+i%n_classes)}") for i in range(11)]
print(f"mock instances: {l}")
print(f"random_k_folds: {random_k_folds(l, k)}")
print(f"stratified_k_folds: {stratified_k_folds(l, k)}")

In [None]:
def return_dfs(X, y, k):
    """Split ``X`` and ``y`` into ``k`` stratified folds.

    Args:
        X: Feature DataFrame.
        y: Target Series, row-aligned with ``X``.
        k: Number of folds.

    Returns:
        ``(data, labels)``: two lists with one entry per fold, where
        ``data[i]`` holds the rows of ``X`` in fold ``i`` and ``labels[i]``
        the matching entries of ``y``.
    """
    # One Instance per row: the payload is the row's *position* (not its index
    # label) because the folds are materialised with .iloc below; the label is
    # the row's target value, which drives the stratification.
    instances = [Instance(position, target) for position, target in enumerate(y)]
    s_folds = stratified_k_folds(instances, k)
    print(s_folds)

    data = []
    labels = []
    for fold in s_folds:
        # Collect the row positions assigned to this fold and slice both the
        # features and the labels with them.
        positions = [inst.data() for inst in fold]
        data.append(X.iloc[positions])
        labels.append(y.iloc[positions])

    return data, labels


In [None]:
def k_fold_CV(X, y, k, rus: bool = True):
    """Run k-fold cross-validation and collect the metrics of every fold.

    Each fold is used once as the test split while the remaining folds are
    concatenated into the training split; ``evaluate_data`` is called with
    the fold number as run id. The merged results are printed and returned.
    """
    data, labels = return_dfs(X, y, k)  # per-fold feature frames and label series
    all_metadata = {}
    for held_out, (X_test, y_test) in enumerate(zip(data, labels)):
        # Train on every fold except the one currently held out.
        X_train = pd.concat([part for j, part in enumerate(data) if j != held_out])
        y_train = pd.concat([part for j, part in enumerate(labels) if j != held_out])

        fold_metadata = evaluate_data(X_train, y_train, X_test, y_test, held_out, rus)
        all_metadata.update(fold_metadata)
    print(all_metadata)
    return all_metadata

### TESTS

In [None]:
# Run 10-fold cross-validation with rus=True (undersampling step enabled).
data = k_fold_CV(X, y, 10, True)

In [None]:
# Run 10-fold cross-validation with rus=False (no undersampling step).
data_w = k_fold_CV(X, y, 10, False)

### PLOTS

In [None]:
def get_values(metadata, metric):
    """Collect one metric across all runs, separately for each estimator.

    ``metadata`` maps run -> estimator name -> metric name -> value.
    Returns three lists ``(svc, xgb, nb)`` holding the values of ``metric``
    for 'SVC', 'XGBClassifier' and 'GaussianNB', in run order.
    """
    collected = {'SVC': [], 'XGBClassifier': [], 'GaussianNB': []}
    for per_run in metadata.values():
        for estimator_name, scores in per_run.items():
            # Skip estimators that are not plotted and runs lacking the metric.
            if estimator_name in collected and metric in scores:
                collected[estimator_name].append(scores[metric])
    return collected['SVC'], collected['XGBClassifier'], collected['GaussianNB']


def plot_box_metric(metadata, metric):
    """Show one box plot per estimator for ``metric`` across all runs.

    Args:
        metadata: Nested results dict as accepted by ``get_values``.
        metric: Name of the metric to plot (e.g. 'accuracy').
    """
    svc, xgb, nb = get_values(metadata, metric)
    # Long-format frame: one row per (run, estimator) value, as seaborn expects.
    dfp = pd.DataFrame(svc + xgb + nb, columns=['Value'])
    dfp['Estimator'] = ['SVC'] * len(svc) + ['XGBClassifier'] * len(xgb) + ['GaussianNB'] * len(nb)
    sns.boxplot(x='Estimator', y='Value', data=dfp)
    plt.show()

In [None]:
# Box plots for the undersampled run; each pair is (printed title, metadata key).
for _title, _metric in (
    ("accuracy", 'accuracy'),
    ("recall", 'recall'),
    ("precision", 'precision'),
    ("f1-measure", 'f1measure'),
):
    print(_title)
    plot_box_metric(data, _metric)

In [None]:
# Box plots for the run without undersampling; each pair is (printed title, metadata key).
for _title, _metric in (
    ("accuracy_no_undersample", 'accuracy'),
    ("recall_no_undersample", 'recall'),
    ("precision_no_undersample", 'precision'),
    ("f1-measure_no_undersample", 'f1measure'),
):
    print(_title)
    plot_box_metric(data_w, _metric)

### Metrics


In [None]:
def confusion_matrix_hand(y_test: np.ndarray, y_pred: np.ndarray, positive='Yes', negative='No'):
    """Count the four confusion-matrix cells for a binary problem.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, paired element by element with ``y_test``.
        positive: Label treated as the positive class.
        negative: Label treated as the negative class.

    Returns:
        ``(VP, VN, FP, FN)``: true positives, true negatives, false positives
        and false negatives.

    Raises:
        ValueError: If a label other than ``positive`` / ``negative`` is met.
            Such pairs were previously counted as false positives, which
            silently distorted every metric built on these counts.
    """
    VP, VN, FP, FN = 0, 0, 0, 0
    for y_true, y_p in zip(y_test, y_pred):
        if y_true == positive and y_p == positive:
            VP += 1
        elif y_true == positive and y_p == negative:
            FN += 1
        elif y_true == negative and y_p == negative:
            VN += 1
        elif y_true == negative and y_p == positive:
            FP += 1
        else:
            raise ValueError(
                f"unexpected label pair (true={y_true!r}, predicted={y_p!r}); "
                f"expected only {positive!r} or {negative!r}"
            )
    return VP, VN, FP, FN


def accuracy(y_test: np.array, y_pred: np.array):
    """Accuracy as defined in the course slides: correct predictions over the number of true labels."""
    tp, tn, _fp, _fn = confusion_matrix_hand(y_test, y_pred)
    n_samples = y_test.shape[0]
    correct = tp + tn
    return correct / n_samples


def precision(y_test: np.array, y_pred: np.array):
    """Precision as defined in the course slides: VP / (VP + FP); 0 when nothing was predicted positive."""
    tp, _tn, fp, _fn = confusion_matrix_hand(y_test, y_pred)
    predicted_positive = tp + fp
    return tp / predicted_positive if predicted_positive else 0


def recall(y_test: np.array, y_pred: np.array):
    """Recall as defined in the course slides: VP / (VP + FN).

    Returns 0 when there are no actual positives (VP + FN == 0), the same
    convention ``precision`` uses for an empty denominator; previously this
    case raised ZeroDivisionError.
    """
    VP, VN, FP, FN = confusion_matrix_hand(y_test, y_pred)
    if (VP + FN) == 0:
        return 0
    return VP / (VP + FN)


def f1Measure(y_test: np.array, y_pred: np.array):
    """F1 score as defined in the course slides: harmonic mean of precision and recall.

    Returns 0 when precision + recall is 0.
    """
    # Compute each component once; both walk the whole prediction array.
    p = precision(y_test, y_pred)
    r = recall(y_test, y_pred)
    if (p + r) == 0:
        return 0
    return 2 * ((p * r) / (p + r))

