## Packages

In [None]:
# Numerical Analysis
import pandas as pd
import numpy as np
import os
from collections import Counter


# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler

# Data Setup

In [None]:
# Read data
# Both CSV files are looked up in the current working directory, so the
# notebook has to be started from the folder that contains them.
trainFileName = 'train_ScotiaDSD.csv'
testFileName = 'test_ScotiaDSD.csv'
# `data` is the frame analysed and modelled in the cells below.
data = pd.read_csv(os.path.join(os.getcwd(), trainFileName))
# `evalData` is loaded from the test file here.
evalData = pd.read_csv(os.path.join(os.getcwd(), testFileName))

In [None]:
# Column groups: BINARY_COLUMNS (below), STRING_COLUMNS, NUMERICAL_COLUMNS.
# BINARY_COLUMNS lists the flag features; the target FRAUD_FLAG is
# deliberately not included.
BINARY_COLUMNS = [
 'CARD_NOT_PRESENT',
 'FLAG_LX',
 'FLAG_ATM',
 'FLAG_AUTO',
 'FLAG_CASH',
 'FLAG_LS',
 'FLAG_DISCOUNT',
 'FLAG_RECREA',
 'FLAG_ELCTRNCS',
 'FLAG_REG_AMT',
 'FLAG_FASTFOOD',
 'FLAG_GAS',
 'FLAG_HIGH_AMT',
 'FLAG_HIGH_RECREA',
 'FLAG_INTERNET',
 'FLAG_INTERNATIONAL',
 'FLAG_JEWELRY',
 'FLAG_LOW_AMT',
 'FLAG_MANUAL_ENTRY',
 'FLAG_PHONE_ORDER',
 'FLAG_PURCHASE_EXCLUDING_GAS',
 'FLAG_PLANNED',
 'FLAG_RISKY',
 'FLAG_SWIPE',
 'FLAG_TRAVEL_ONLY',
 'FLAG_TRAVEL_AND_ENTERTAINMENT',
 'FLAG_WEEKEND']

# Columns that data_preprocessing() drops from the frame. EVENT_TIME is
# converted to EVENT_TIME_IN_SECONDS there before being dropped.
STRING_COLUMNS = ['TRANSACTION_ID',
'USER_AGENT',
'CITY',
'EVENT_TIME']

# Columns treated as numerical. The data-description cell uses this list to
# compare value magnitudes, and data_preprocessing() appends
# 'EVENT_TIME_IN_SECONDS' to it at run time.
NUMERICAL_COLUMNS = ['EVENT_MONTH',
 'EVENT_DAY_OF_WEEK',
 'AVAIL_CRDT',
 'AMOUNT',
 'CREDIT_LIMIT',
 'MEAN_AUTO_PAST_7DAY',
 'MEAN_LS_PAST_7DAY',
 'MEAN_RECREA_PAST_7DAY',
 'MEAN_REG_AMT_PAST_7DAY',
 'MEAN_FASTFOOD_PAST_7DAY',
 'MEAN_HIGH_AMT_PAST_7DAY',
 'MEAN_HIGH_RECREA_PAST_7DAY',
 'MEAN_INTERNET_PAST_7DAY',
 'MEAN_INTERNATIONAL_PAST_7DAY',
 'MEAN_JEWELRY_PAST_7DAY',
 'MEAN_LOW_AMT_PAST_7DAY',
 'MEAN_MANUAL_ENTRY_PAST_7DAY',
 'MEAN_PHONE_ORDER_PAST_7DAY',
 'MEAN_PLANNED_PAST_7DAY',
 'MEAN_SWIPE_PAST_7DAY',
 'MEAN_TRAVEL_AND_ENTERTAINMENT_PAST_7DAY',
 'MEAN_WEEKEND_PAST_7DAY',
 'MAX_CASH_PAST_7DAY',
 'MAX_LS_PAST_7DAY',
 'MAX_RECREA_PAST_7DAY',
 'MAX_HIGH_AMT_PAST_7DAY',
 'MAX_HIGH_RECREA_PAST_7DAY',
 'MAX_INTERNET_PAST_7DAY',
 'MAX_PHONE_ORDER_PAST_7DAY',
 'MAX_PURCHASE_EXCLUDING_GAS_PAST_7DAY',
 'MAX_SWIPE_PAST_7DAY',
 'MAX_WEEKEND_PAST_7DAY',
 'STD_LX_PAST_7DAY',
 'STD_FASTFOOD_PAST_7DAY',
 'STD_HIGH_AMT_PAST_7DAY',
 'STD_INTERNET_PAST_7DAY',
 'STD_LOW_AMT_PAST_7DAY',
 'STD_PURCHASE_EXCLUDING_GAS_PAST_7DAY',
 'STD_SWIPE_PAST_7DAY',
 'STD_TRAVEL_AND_ENTERTAINMENT_PAST_7DAY',
 'SUM_LX_PAST_7DAY',
 'SUM_AUTO_PAST_7DAY',
 'SUM_LS_PAST_7DAY',
 'SUM_RECREA_PAST_7DAY',
 'SUM_GAS_PAST_7DAY',
 'SUM_HIGH_AMT_PAST_7DAY',
 'SUM_INTERNET_PAST_7DAY',
 'SUM_INTERNATIONAL_PAST_7DAY',
 'SUM_LOW_AMT_PAST_7DAY',
 'SUM_MANUAL_ENTRY_PAST_7DAY',
 'SUM_PHONE_ORDER_PAST_7DAY',
 'SUM_PURCHASE_EXCLUDING_GAS_PAST_7DAY',
 'SUM_PARTIAL_PAST_7DAY',
 'SUM_PLANNED_PAST_7DAY',
 'SUM_SWIPE_PAST_7DAY',
 'SUM_WEEKEND_PAST_7DAY',
 'COUNT_AUTO_PAST_7DAY',
 'COUNT_ELCTRNCS_PAST_7DAY',
 'COUNT_GAS_PAST_7DAY',
 'COUNT_HIGH_AMT_PAST_7DAY',
 'COUNT_INTERNET_PAST_7DAY',
 'COUNT_LOW_AMT_PAST_7DAY',
 'COUNT_MANUAL_ENTRY_PAST_7DAY',
 'COUNT_PHONE_ORDER_PAST_7DAY',
 'COUNT_PURCHASE_EXCLUDING_GAS_PAST_7DAY',
 'COUNT_SWIPE_PAST_7DAY',
 'COUNT_TRAVEL_AND_ENTERTAINMENT_PAST_7DAY',
 'COUNT_WEEKEND_PAST_7DAY',
 'MEAN_AUTO_PAST_30DAY',
 'MEAN_DISCOUNT_PAST_30DAY',
 'MEAN_RECREA_PAST_30DAY',
 'MEAN_ELCTRNCS_PAST_30DAY',
 'MEAN_REG_AMT_PAST_30DAY',
 'MEAN_HIGH_AMT_PAST_30DAY',
 'MEAN_INTERNET_PAST_30DAY',
 'MEAN_LOW_AMT_PAST_30DAY',
 'MEAN_MANUAL_ENTRY_PAST_30DAY',
 'MEAN_PHONE_ORDER_PAST_30DAY',
 'MEAN_PURCHASE_EXCLUDING_GAS_PAST_30DAY',
 'MEAN_PLANNED_PAST_30DAY',
 'MEAN_SWIPE_PAST_30DAY',
 'MEAN_TRAVEL_AND_ENTERTAINMENT_PAST_30DAY',
 'MEAN_WEEKEND_PAST_30DAY',
 'MAX_AUTO_PAST_30DAY',
 'MAX_LS_PAST_30DAY',
 'MAX_ELCTRNCS_PAST_30DAY',
 'MAX_FASTFOOD_PAST_30DAY',
 'MAX_HIGH_RECREA_PAST_30DAY',
 'MAX_MANUAL_ENTRY_PAST_30DAY',
 'MAX_PHONE_ORDER_PAST_30DAY',
 'MAX_PARTIAL_PAST_30DAY',
 'MAX_RISKY_PAST_30DAY',
 'MAX_WEEKEND_PAST_30DAY',
 'STD_AUTO_PAST_30DAY',
 'STD_LS_PAST_30DAY',
 'STD_RECREA_PAST_30DAY',
 'STD_ELCTRNCS_PAST_30DAY',
 'STD_REG_AMT_PAST_30DAY',
 'STD_HIGH_RECREA_PAST_30DAY',
 'STD_INTERNET_PAST_30DAY',
 'STD_LOW_AMT_PAST_30DAY',
 'STD_MANUAL_ENTRY_PAST_30DAY',
 'STD_PHONE_ORDER_PAST_30DAY',
 'STD_PARTIAL_PAST_30DAY',
 'STD_SWIPE_PAST_30DAY',
 'STD_TRAVEL_ONLY_PAST_30DAY',
 'STD_TRAVEL_AND_ENTERTAINMENT_PAST_30DAY',
 'SUM_AUTO_PAST_30DAY',
 'SUM_LS_PAST_30DAY',
 'SUM_DISCOUNT_PAST_30DAY',
 'SUM_RECREA_PAST_30DAY',
 'SUM_ELCTRNCS_PAST_30DAY',
 'SUM_REG_AMT_PAST_30DAY',
 'SUM_FASTFOOD_PAST_30DAY',
 'SUM_GAS_PAST_30DAY',
 'SUM_HIGH_AMT_PAST_30DAY',
 'SUM_HIGH_RECREA_PAST_30DAY',
 'SUM_INTERNET_PAST_30DAY',
 'SUM_INTERNATIONAL_PAST_30DAY',
 'SUM_LOW_AMT_PAST_30DAY',
 'SUM_MANUAL_ENTRY_PAST_30DAY',
 'SUM_PHONE_ORDER_PAST_30DAY',
 'SUM_PURCHASE_EXCLUDING_GAS_PAST_30DAY',
 'SUM_SWIPE_PAST_30DAY',
 'SUM_TRAVEL_ONLY_PAST_30DAY',
 'SUM_WEEKEND_PAST_30DAY',
 'COUNT_AUTO_PAST_30DAY',
 'COUNT_RECREA_PAST_30DAY',
 'COUNT_REG_AMT_PAST_30DAY',
 'COUNT_FASTFOOD_PAST_30DAY',
 'COUNT_GAS_PAST_30DAY',
 'COUNT_HIGH_AMT_PAST_30DAY',
 'COUNT_INTERNET_PAST_30DAY',
 'COUNT_LOW_AMT_PAST_30DAY',
 'COUNT_MANUAL_ENTRY_PAST_30DAY',
 'COUNT_PHONE_ORDER_PAST_30DAY',
 'COUNT_PURCHASE_EXCLUDING_GAS_PAST_30DAY',
 'COUNT_PLANNED_PAST_30DAY',
 'COUNT_SWIPE_PAST_30DAY',
 'COUNT_TRAVEL_AND_ENTERTAINMENT_PAST_30DAY',
 'COUNT_WEEKEND_PAST_30DAY',
 'PREV_M_INFLATION',
 'PREV_M_UNEMP_RATE']

# Data Analysis
Determine any distinct qualities of the data and decide if any changes are required.

### Results
After having a deep-dive into the dataset, the following conclusions can be made:
- there are three distinct data types:
    - float64 (numerical)
    - int64 (binary)
    - object (string)
The string columns need to be converted into useful numerical features or dropped. By convention, all numerical columns are converted to floats and all binary columns to ints (some counts are stored as ints in the data). The raw string columns may be too granular for the model and were therefore removed.
- the magnitudes of the variables differ greatly, so normalization is required.
- imbalance of target data (97.5% to 2.5%), therefore oversampling methods need to be used.
- The columns with NAs are unnecessary:
    - CITY and USER_AGENT are NA only when FLAG_INTERNET == 0
    - keep only the FLAG_INTERNET

In [None]:
# General Description of Data
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" to the output.
data.info()
display(data.head())
display(data.describe())
print(f'Number of Duplicated Rows: {data.duplicated().sum()}')
# Count repeated column names; the previous expression printed a boolean that
# was True when there were NO duplicates, contradicting the label.
print(f'Number of Duplicated Columns: {data.columns.duplicated().sum()}')
print(f"Proportion of Fraud vs Non-Fraud Transactions: \n {data['FRAUD_FLAG'].value_counts(normalize=True)}")
# Per-column NA counts, reduced to the columns that actually contain NAs.
dataMissing = data.isna().sum(axis=0)
print(f"Number of Rows with NA: \n {dataMissing[dataMissing != 0]}")
# Largest and smallest value across all numerical columns, to show the spread in scale.
print(f"Disparity between magnitude of Numerical Values: {data[NUMERICAL_COLUMNS].to_numpy().max()} and {data[NUMERICAL_COLUMNS].to_numpy().min()}")

# Data Pre-processing
Apply the findings from the Data Analysis step to the data

In [None]:
def data_preprocessing(dataframe):
    """Pre-process a raw transactions frame.

    Steps:
        - cast numeric columns with exactly two distinct values (and no NA)
          to int64, and the remaining int64/float64 columns to float64
        - convert EVENT_TIME ("hour:minutes", optionally followed by a stray
          trailing ':') into seconds, stored as EVENT_TIME_IN_SECONDS
        - drop every column listed in STRING_COLUMNS

    The caller's frame is not modified; all work happens on a copy.

    Side effect: 'EVENT_TIME_IN_SECONDS' is added to the module-level
    NUMERICAL_COLUMNS list the first time the function runs.

    Args:
        dataframe (DataFrame): pandas DataFrame containing the STRING_COLUMNS.

    Returns:
        DataFrame: preprocessed copy of the input
    """
    # Work on a copy so re-running the cell never sees a half-converted input.
    dataframe = dataframe.copy()

    for columnName in dataframe.columns:
        column = dataframe[columnName]
        isNumeric = pd.api.types.is_numeric_dtype(column)
        # Binary variables only have 2 unique values. Non-numeric columns and
        # columns holding NA are skipped because they cannot be cast to int64.
        if isNumeric and column.nunique(dropna=False) == 2 and not column.isna().any():
            dataframe[columnName] = column.astype('int64')
        elif (column.dtype == 'int64') or (column.dtype == 'float64'):
            dataframe[columnName] = column.astype('float64')

    # Convert EVENT_TIME string into seconds to be useful.
    # Strip a single trailing ':' so every value splits into two parts.
    eventTime = dataframe['EVENT_TIME'].str.replace(r':$', '', regex=True)
    # Re-use the input index: a fresh RangeIndex would misalign the assignment
    # below whenever the frame was filtered or shuffled beforehand.
    dateTransactionTime = pd.DataFrame(
        eventTime.str.split(r':').to_list(),
        columns=['hour', 'minutes'],
        index=dataframe.index,
    ).astype('float64')
    dataTransactionTimeSeconds = dateTransactionTime['hour'] * 60 * 60 + dateTransactionTime['minutes'] * 60

    # Drop the string columns now that EVENT_TIME has been converted.
    dataframe = dataframe.drop(STRING_COLUMNS, axis=1)
    dataframe['EVENT_TIME_IN_SECONDS'] = dataTransactionTimeSeconds

    # Register the derived column once; appending unconditionally grew the
    # list with duplicates every time the function was called.
    if 'EVENT_TIME_IN_SECONDS' not in NUMERICAL_COLUMNS:
        NUMERICAL_COLUMNS.append('EVENT_TIME_IN_SECONDS')

    return dataframe

In [None]:
# Apply the pre-processing steps to the training data loaded above.
dataPreprocessed = data_preprocessing(data)

# Data Processing

In [None]:
def countNumberOfIQRTestOutliers(dataframe, threshold):
    """Count, for every row, how many columns hold an outlying value.

    NOTE: despite the function name this is not an inter-quartile-range
    test. A value is flagged when it lies more than `threshold` standard
    deviations away from the mean of its column.

    Args:
        dataframe (DataFrame): numeric pandas DataFrame
        threshold (float): number of standard deviations around the column
            mean that is still considered normal

    Returns:
        One count per row: the number of columns flagged in that row.
    """
    columnMean = np.mean(dataframe, axis=0)
    columnStd = np.std(dataframe, axis=0)

    # Named bounds instead of `min`/`max`, which shadowed the builtins.
    lowerBound = columnMean - threshold * columnStd
    upperBound = columnMean + threshold * columnStd

    # True wherever a cell falls outside [lowerBound, upperBound] of its column.
    outlierMask = (dataframe < lowerBound) | (dataframe > upperBound)

    return np.sum(outlierMask, axis=1)


def majorityVoteOutlierAlgorithm(dataframe, thresholdIQR, thresholdProportion):
    """Drop rows in which too many features are flagged as outliers.

    Args:
        dataframe (DataFrame): pandas DataFrame to filter
        thresholdIQR (float): threshold forwarded to
            countNumberOfIQRTestOutliers
        thresholdProportion (float): a row is dropped when its number of
            flagged features is at least this fraction of the column count

    Returns:
        DataFrame: the rows of `dataframe` that were not dropped
    """
    # Number of flagged features per row.
    flaggedPerRow = countNumberOfIQRTestOutliers(dataframe, thresholdIQR)

    # Smallest flagged-feature count that gets a row removed.
    dropCutoff = len(dataframe.columns) * thresholdProportion
    dropMask = flaggedPerRow >= dropCutoff

    return dataframe.loc[~dropMask, :]

In [None]:
# Drop rows in which at least 10% of the columns lie more than 2.5 standard
# deviations from their column mean.
dataProcessed = majorityVoteOutlierAlgorithm(dataPreprocessed, thresholdIQR=2.5, thresholdProportion=0.1)

# Feature Selection

In [None]:
# Separate the target from the predictors.
# NOTE(review): this reads dataPreprocessed, not the outlier-filtered
# dataProcessed built in the previous section, so the outlier removal has no
# effect on anything below. Confirm whether that is intentional.
y = dataPreprocessed['FRAUD_FLAG']
X = dataPreprocessed.drop(['FRAUD_FLAG'], axis = 1)

In [None]:
def correlationMatrixPredictorVsTarget(X, y, absoluteThreshold):
    """Plot each predictor's correlation with the target and keep the strong ones.

    The correlation coefficient (np.corrcoef) between every column of X and
    y is drawn as a horizontal bar chart sorted by value. Columns whose
    absolute correlation is at least `absoluteThreshold` are kept.

    Args:
        X (DataFrame): predictor columns
        y (Series or array): target values, one per row of X
        absoluteThreshold (float): minimum |correlation| for a column to be kept

    Returns:
        DataFrame: X restricted to the kept columns (original column order)
        ndarray: names of the kept columns
    """
    # Keep every coefficient attached to its column name. The earlier version
    # sorted a bare list, so both the bar labels and the selected columns no
    # longer matched the values they were paired with.
    correlations = pd.Series(
        [np.corrcoef(X[col], y)[0, 1] for col in X.columns],
        index=X.columns,
        dtype='float64',
    )

    # Sort for display only; labels travel with their values.
    sortedCorrelations = correlations.sort_values()

    fig, ax = plt.subplots(figsize=(32, 20))
    ax.barh(sortedCorrelations.index, sortedCorrelations.to_numpy())
    ax.tick_params(labelsize=10)
    plt.grid(True)

    # NaN correlations (e.g. constant columns) compare False and are dropped.
    corrMask = correlations.abs().to_numpy() >= absoluteThreshold
    sufficientCorrColumns = np.array(X.columns)[corrMask]

    return X[sufficientCorrColumns], sufficientCorrColumns

In [None]:
def featureSelection(X, y, featureImportanceThreshold, method):
    """Select features by model-based importance.

    Args:
        X (DataFrame): predictor columns
        y (Series or array): target values
        featureImportanceThreshold (float): minimum importance for a column
            to be kept
        method (str): selection method; only 'rf' (random forest
            feature_importances_) is implemented

    Returns:
        Index: names of the kept columns, most important first

    Raises:
        ValueError: if `method` is not supported. Previously this fell
            through to an unbound `keepColumns`.
    """
    if method != 'rf':
        raise ValueError(f"Unsupported feature selection method: {method!r} (expected 'rf')")

    # Seeded so the selected columns are the same on every run.
    rf = RandomForestClassifier(random_state=0)
    rf.fit(X, y)

    # Greatest to least
    order = np.argsort(-rf.feature_importances_)
    featureImportanceColumns = X.columns[order]
    featureImportanceValues = rf.feature_importances_[order]

    # Plot the importance (sorted)
    print(featureImportanceColumns)
    plt.barh(featureImportanceColumns, featureImportanceValues)
    plt.xlabel("Feature Importance")

    # Keep only the columns that reach the importance threshold
    keepColumns = featureImportanceColumns[featureImportanceValues >= featureImportanceThreshold]

    print(f'Original Column Count: {X.shape[1]}')
    print(f'New Column Count: {len(keepColumns)}')

    return keepColumns

In [None]:
# featureImportanceThreshold = 0.002

# # For DEBUG purposes:
# xRF, _, yRF, _ = train_test_split(X, y, train_size=0.1, stratify=y, random_state=0)
# # -------------------
# importFeatures = featureSelection(xRF, yRF, featureImportanceThreshold, method='rf')
# xImportant = X[importFeatures]

In [None]:
# from sklearn.decomposition import PCA
# n_components=50
# pca = PCA(n_components)
# xPCA = pca.fit_transform(X)

# xImportant = pd.DataFrame(xPCA, columns = [f'V{i}'for i in range(n_components)])

In [None]:
# Select features by their correlation with the target (threshold of 0.035 on
# the absolute value) and show how many columns were kept.
xImportant, sufficientCorrColumns = correlationMatrixPredictorVsTarget(X, y, absoluteThreshold=0.035)
len(sufficientCorrColumns)

# Data Sampling

### Synthetic Minority Oversampling Technique (SMOTE)

#### Description:
- randomly select a point in the minority class
- calculate the Euclidean distance from that point to every other point in the minority class and find the $k$ nearest points
- randomly select a point among the $k$ points
- generate a new data point at a random position on the line segment between the two points (the interpolation weight is drawn at random)

### SMOTE-NC (Nominal Continuous)
Extension of SMOTE. In order to include categorical features into the synthetic data generation:
- randomly select a point in the minority class
- calculate the Euclidean distance from that point to every other point in the minority class and find the $k$ nearest points
    - for each nominal feature whose value differs between the two points, the Euclidean distance calculation uses the median of the standard deviations of the continuous features
- randomly select a point among the $k$ points
- generate new data point
    - for continuous values, same as SMOTE
    - for nominal values, take the randomly selected point's nominal value

### SMOTE-ENC (Encoded Nominal and Continuous)

In [None]:
def dataSampling(X, y, testSize, methods):
    """Resample (X, y) by applying the given sampling methods in order.

    Supported methods:
        - 'SMOTE': SMOTENC when X has int64 columns (treated as categorical),
          plain SMOTE otherwise
        - 'SMOTENC': SMOTENC with the int64 columns as categorical features
        - 'OVER_SAMPLE': RandomOverSampler
        - 'UNDER_SAMPLE': RandomUnderSampler

    Args:
        X (DataFrame): features
        y (Series): target labels
        testSize: unused; kept so existing calls keep working
        methods (iterable of str): sampling methods, applied one after another

    Returns:
        (X, y) after every method has been applied; the inputs themselves
        when `methods` is empty.

    Raises:
        ValueError: if `methods` contains an unsupported name. Previously
            this hit an unbound `sampler`, or silently reused the sampler of
            the previous iteration.
    """
    supportedMethods = ('SMOTE', 'SMOTENC', 'OVER_SAMPLE', 'UNDER_SAMPLE')
    methods = list(methods)
    unknownMethods = [method for method in methods if method not in supportedMethods]
    if unknownMethods:
        raise ValueError(
            f"Unsupported sampling method(s): {unknownMethods}; expected one of {supportedMethods}"
        )

    xNew, yNew = X, y
    for method in methods:
        print(method)
        print(f"Original: \n{yNew.value_counts()}")

        # Positions of the int64 (categorical) columns. Test the count of
        # positions, not their truthiness: np.any([0]) is False even though
        # column 0 is categorical.
        categoricalIdx = np.where(xNew.dtypes == 'int64')[0]

        if method == 'SMOTENC' or (method == 'SMOTE' and categoricalIdx.size > 0):
            sampler = SMOTENC(categorical_features=categoricalIdx, sampling_strategy=0.5, random_state=0)
        elif method == 'SMOTE':
            # No categorical columns: plain SMOTE applies.
            sampler = SMOTE(sampling_strategy=0.5, random_state=0)
        elif method == 'OVER_SAMPLE':
            sampler = RandomOverSampler(sampling_strategy=0.5, random_state=0)
        else:  # 'UNDER_SAMPLE'
            # Seeded like the other samplers so runs are reproducible.
            sampler = RandomUnderSampler(sampling_strategy=0.8, random_state=0)

        xNew, yNew = sampler.fit_resample(xNew, yNew)
        print(f"New: \n{yNew.value_counts()}")
        print('---')
    return xNew, yNew

In [None]:
# Hold out 20% of the rows as a stratified test set. The split is seeded so
# the train/test membership (and every metric below) is reproducible.
xTrainDS, xTestDS, yTrainDS, yTestDS = train_test_split(xImportant, y, test_size=0.2, stratify=y, random_state=0)

# print(xTrainDS.shape, xTestDS.shape, "\n" , yTrainDS.value_counts(), "\n", yTestDS.value_counts())

# Resample the training split only; the test split keeps its original class balance.
methods = ['SMOTENC', 'UNDER_SAMPLE']
xTrainSampled, yTrainSampled = dataSampling(xTrainDS, yTrainDS, testSize=0.2, methods=methods)

# Modeling Step

In [None]:
def evaluate(model, y_true, y_pred):
    """Print F1, precision and recall and draw the confusion matrix.

    Args:
        model: the fitted model; returned unchanged
        y_true: ground-truth labels
        y_pred: predicted labels

    Returns:
        tuple: (model, confusion matrix, f1, precision, recall)
    """
    # Entry (i, j) counts samples of group i predicted to be in group j.
    confusion = confusion_matrix(y_true, y_pred)

    f1Value = f1_score(y_true, y_pred)
    precisionValue = precision_score(y_true, y_pred)
    recallValue = recall_score(y_true, y_pred)

    print(f"F1 Score: {f1Value:.4f}")
    print(f"Percision: {precisionValue:.4f}")
    print(f"Recall: {recallValue:.4f}")

    # Small annotated heatmap of the confusion matrix.
    plt.figure(figsize=(4,4))
    plt.title('Confusion Matrix')
    sns.heatmap(confusion, annot=True, fmt='.5g')

    return model, confusion, f1Value, precisionValue, recallValue

In [None]:
def modeling(xTrain, xTest, yTrain, yTest, baseModel):
    """Fit `baseModel` on the training data and evaluate it on the test data.

    Args:
        xTrain, yTrain: training features and labels
        xTest, yTest: test features and labels
        baseModel: estimator class; instantiated with random_state=0

    Returns:
        tuple: (model, cm, f1, precision, recall, xTrain, xTest, yTrain, yTest)
    """
    estimator = baseModel(random_state=0)
    estimator.fit(xTrain, yTrain)

    # Score the predictions on the held-out data.
    predictions = estimator.predict(xTest)
    evaluation = evaluate(estimator, yTest, predictions)

    # evaluate() yields (model, cm, f1, precision, recall); append the inputs.
    model, cm, f1, percision, recall = evaluation
    return model, cm, f1, percision, recall, xTrain, xTest, yTrain, yTest

In [None]:
# Put features on a common scale to improve accuracy and stabilize the models
def normalize(data, normalizerObject, train=True):
    """Scale `data` with the given normalizer.

    Args:
        data (DataFrame): pandas DataFrame
        normalizerObject: when train is True, a normalizer class
            (ex: StandardScaler) that is instantiated and fitted here;
            when train is False, an already fitted normalizer instance
        train (bool): fit a new normalizer (True) or reuse a fitted one (False)

    Returns:
        DataFrame: normalized data with the columns and index of `data`
        Normalizer Object: the fitted normalizer
    """
    if train:
        scaler = normalizerObject()
        scaledValues = scaler.fit_transform(data)
    else:
        scaler = normalizerObject
        scaledValues = scaler.transform(data)

    normalizedFrame = pd.DataFrame(scaledValues, columns=data.columns, index=data.index)
    return normalizedFrame, scaler

In [None]:
# float64 columns are scaled; every other column is passed through untouched.
numericalFeatures = xTrainSampled.columns[(xTrainSampled.dtypes == 'float64').to_numpy()]
binaryFeatures = xTrainSampled.columns[(xTrainSampled.dtypes != 'float64').to_numpy()]

# Fit the scaler on the resampled training data, then reuse it for the test data.
xNumericalNormalized, sc = normalize(xTrainSampled[numericalFeatures], StandardScaler, train=True)
xTestNumericalNormalized, _ = normalize(xTestDS[numericalFeatures], sc, train=False)

# Re-attach the unscaled columns to the right of the scaled ones.
xNormalized = pd.concat(
    [xNumericalNormalized, xTrainSampled[binaryFeatures]],
    axis=1,
)
xTestNormalized = pd.concat(
    [xTestNumericalNormalized, xTestDS[binaryFeatures]],
    axis=1,
)

In [None]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=30)
# xPCA = pca.fit_transform(xNormalized)
# xTestPCA = pca.transform(xTestNormalized)

# pca.explained_variance_ratio_

In [None]:
# Final train/test sets handed to the models: normalized features plus the
# resampled training labels and the untouched test labels.
xFinal, yFinal = xNormalized, yTrainSampled
xTestFinal, yTestFinal = xTestNormalized, yTestDS

# Sanity check of the resulting shapes.
print(xFinal.shape, xTestFinal.shape, yFinal.shape, yTestFinal.shape)

# Logistic Regression

In [None]:
# Fit and evaluate a LogisticRegression via modeling().
modelLR, cm, f1, percision, recall, xTrain, xTest, yTrain, yTest = modeling(xFinal, xTestFinal, yFinal, yTestFinal, LogisticRegression)

# XGBoost

In [None]:
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Fit and evaluate an XGBClassifier via modeling(), on the same data as the
# logistic regression. cm, f1, percision, recall and the returned splits
# overwrite the values from the previous model.
modelXGB, cm, f1, percision, recall, xTrain, xTest, yTrain, yTest = modeling(xFinal, xTestFinal, yFinal, yTestFinal, xgb.XGBClassifier)

In [None]:
yPred = modelXGB.predict(xTestFinal)

# False positives: predicted fraud (1) while the true label is 0.
# False negatives: predicted non-fraud (0) while the true label is 1.
# Both conditions must hold together, hence `&`. Comparing the two boolean
# arrays with `==` made both masks select every misclassified row.
fpMask = (yPred == 1) & (yTestFinal == 0)
fnMask = (yPred == 0) & (yTestFinal == 1)

In [None]:
# Test rows selected by each mask, for manual inspection.
fpXTest = xTestFinal[fpMask]
fnXTest = xTestFinal[fnMask]

In [None]:
# Display the rows selected by fpMask.
fpXTest

In [None]:
from xgboost import plot_importance

# Plot the fitted XGBoost model's feature importances, using the "cover" importance type.
plot_importance(modelXGB, importance_type="cover")

# Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

# modelXGB, cm, f1, percision, recall, xTrain, xTest, yTrain, yTest = modeling(xFinal, xTestFinal, yFinal, yTestFinal, IsolationForest)


# Fit an IsolationForest on the training features only (no labels are passed).
# Seeded so repeated runs build the same forest.
isoF = IsolationForest(random_state=0)
isoF.fit(xFinal)


# Post Analysis

In [None]:
# yPred = modelLR.predict(xTest)

# yFN = np.where((yPred == 0) & (yTest == 1))
# yFP = np.where((yPred == 1) & (yTest == 0))

# xTestFN = xTest.iloc[yFN[0], :]
# xTestFP = xTest.iloc[yFP[0], :]

In [None]:
# Rebuild the normalized test features from the raw test split with the
# scaler fitted on the training data. The frame keeps xTestDS's index so its
# rows stay label-aligned with yTestDS. Resetting only the features to a
# RangeIndex left xTestFinal and yTestFinal on different indexes.
xTestDSNormalized = pd.DataFrame(
    sc.transform(xTestDS[numericalFeatures]),
    columns=numericalFeatures,
    index=xTestDS.index,
)
xTestDSNormalized = pd.concat([xTestDSNormalized, xTestDS[binaryFeatures]], axis=1)

xTestFinal, yTestFinal = xTestDSNormalized, yTestDS