# Set up

### Import Data & Packages

In [142]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn import datasets
import sklearn.metrics as skm

# data = np.loadtxt("C:/Users/aceme/OneDrive/Documents/GitHub/BP24/AceMejiaSanchez/Data/gaussian_small_d_1.tex")
data = np.loadtxt("C:/Users/aceme/OneDrive/Documents/GitHub/BP24/Kate/Data/uniform_small_d_1.tex")
# data = np.loadtxt("C:/Users/aceme/OneDrive/Documents/GitHub/BP24/Fabiana/Demos Fabi/uniform_large_d_1.tex")
# data = np.loadtxt("C:/Users/aceme/OneDrive/Documents/GitHub/BP24/Ellee/Data/gaussian_large_d_1.tex")

# Creating NumPy array
array = np.array(data)

# Converting to Pandas DataFrame
df = pd.DataFrame(array)

### Prepping data & full training set

In [145]:
# Converting 25 columns from numerical floats -> categorical integers
for i in range(25):
    
    df.iloc[:,i] = df.iloc[:,i].round() # Rounding
    df.iloc[:,i] = df.iloc[:,i].astype(int) # Integer

# Split dataset into X_train and y_train
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:150], df.iloc[:,-1], test_size=0.2, random_state=52)

# Forming complete training set by attaching X_train and y_train
train_combined = pd.concat([X_train, y_train], axis = 1)
train_combined.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
159,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.048794,0.116295,0.750023,0.504721,0.482695,0.061228,0.800553,0.815441,0.997321,0.0
198,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.63819,0.252662,0.584352,0.731232,0.420754,0.293242,0.294986,0.651735,0.50497,1.0
259,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.905198,0.530844,0.446072,0.157876,0.661326,0.562504,0.47481,0.18905,0.60292,0.0
301,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.692931,0.695386,0.780015,0.914984,0.757668,0.329954,0.86552,0.779557,0.979039,0.0
220,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.458531,0.257464,0.371455,0.015987,0.082521,0.108772,0.681779,0.217713,0.070986,0.0


### Applying SMOTE & XGBoost 100 times

In [115]:
# Initialize an array of zeros with size 100
f1_scores_array = np.zeros(100)

# Loop 100 times to populate the array
for i in range(100):
    
    ##################################### Imbalancing: Class 0 Majority #####################################
    
    # Assuming df_table is your DataFrame and the label column is at index 150
    # Create a new label column with 300 zeros and 200 ones
    label1 = np.array([0] * 250 + [1] * 150)
    
    # Shuffle randomly the new label column
    np.random.seed(1)
    np.random.shuffle(label1)
    
    # Replace the label column with the new imbalanced labels
    train_combined['label_0_majority'] = label1
    #print(train_combined['label_0_majority'].value_counts())
    
    ##################################### Imbalancing: Class 1 Majority #####################################
    
    # Create a second imbalanced label column with 200 zeros and 300 ones
    label2 = np.array([0] * 150 + [1] * 250)
    
    # Shuffle randomly the new label column
    np.random.seed(1)
    np.random.shuffle(label2)
    
    # Add the new label column to the DataFrame
    train_combined['label_1_majority'] = label2
    
    #print(train_combined['label_1_majority'].value_counts())
    
    ##################################### Subsets #####################################
    
    # Subset for the 1 majority
    df_imbalance_0 = train_combined.drop(columns=[150, 'label_1_majority']) # Drop original label in Col 150 & label_1_majority
    df_imbalance_0.head()
    
    # Subset for the 0 majority
    df_imbalance_1 = train_combined.drop(columns=[150, 'label_0_majority']) # Drop original label in Col 150 & label_1_majority
    df_imbalance_1.head()
    
    ##################################### Data Augmentation: SMOTE #####################################
    
    ####### SMOTE applied to label_0_majority class
    
    # Creating x and y 
    x=df_imbalance_0.drop(['label_0_majority'],axis=1)
    y=df_imbalance_0['label_0_majority'] # creating imbalance
    
    # Applying SMOTE to balance classes
    smote=SMOTE(sampling_strategy='minority')  # generating synthetic samples for minority class
    x,y=smote.fit_resample(x,y) # re-sampling
    
    ####### Creating a new training set with the new 100 rows (Class Majority 0) we augmented and the old training set
    
    # Re-combine balanced x and y into a dataframe
    new_train_combined_0 = pd.concat([x, y], axis = 1)
    
    # Re-naming the 'label_0_majority' label by its index 150, which creates a duplicate, & dropping the remaining 'label_0_majority'
    new_train_combined_0[150] = new_train_combined_0['label_0_majority']
    new_train_combined_0 = new_train_combined_0.drop(columns=['label_0_majority']) 
    
    # Sanity checks: making sure the right columns are there & the correct # of class values
    # print(new_train_combined_0[150].value_counts()) 
    # new_train_combined_0.head()
    
    # Subset the last 100 rows that were augmented in new_train_combined_0
    aug_0 = new_train_combined_0.tail(100)
    
    # Dropping extra labels to have right dimensions
    train_combined = train_combined.drop(columns=['label_0_majority', 'label_1_majority']) 
    
    # Checking it concatenated properly
    # Check the shape of the original dataFrames
    # print("Shape of train_combined:", train_combined.shape)
    # print("Shape of aug_0:", aug_0.shape)
    
    # Concatenating the dataframes
    train_combined_PRE_FINAL = pd.concat([train_combined, aug_0], ignore_index=True)
    
    # Check the shape of the combined dataframe
    # print("Shape of train_combined_PRE_FINAL:", train_combined_PRE_FINAL.shape)
    
    # Print the combined dataframe
    # print(train_combined_PRE_FINAL)
    
    ####### SMOTE applied to label_1_majority class
    
    # Creating x and y 
    x=df_imbalance_1.drop(['label_1_majority'],axis=1)
    y=df_imbalance_1['label_1_majority'] # creating imbalance
    
    # Applying SMOTE to balance classes
    smote=SMOTE(sampling_strategy='minority')  # generating synthetic samples for minority class
    x,y=smote.fit_resample(x,y) # re-sampling
    
    ####### Creating a new training set with the new 100 rows (Class Majority 1) we augmented and the old training set
    
    # Re-combine balanced x and y into a dataframe
    new_train_combined_1 = pd.concat([x, y], axis = 1)
    
    # Re-naming the 'label_1_majority' label by its index 150, which creates a duplicate, & dropping the remaining 'label_1_majority'
    new_train_combined_1[150] = new_train_combined_1['label_1_majority']
    new_train_combined_1 = new_train_combined_1.drop(columns=['label_1_majority']) 
    
    # Sanity checks: making sure the right columns are there & the correct # of class values
    # print(new_train_combined_1[150].value_counts()) 
    # new_train_combined_1.head()
    
    # Subset the last 100 rows that were augmented in new_train_combined_0
    aug_1 = new_train_combined_1.tail(100)
    
    # Checking it concatenated properly
    # Check the shape of the original dataFrames
    # print("Shape of train_combined:", train_combined_PRE_FINAL.shape)
    # print("Shape of aug_0:", aug_1.shape)
    
    # Concatenating the DataFrames
    new_train_combined = pd.concat([train_combined_PRE_FINAL, aug_1], ignore_index=True)
    
    # Check the shape of the combined dataFrame
    # print("Shape of new_train_combined:", new_train_combined.shape)
    
    # Print the combined dataFrame
    # print(new_train_combined)
    
    ##################################### Classifier: XGBOOST #####################################
    
    # Prepping our new dataset by 'splitting' it
    X_train = new_train_combined.iloc[:,0:150]
    y_train = new_train_combined.iloc[:,-1]
    
    # First, put this prompt: "conda install -c conda-forge py-xgboost" in anaconda to download xgboost package
    # install xgboost in jupyter
    # !pip install xgboost
    
    # create model instance
    # n_estimators: number of trees(estimators) the model uses --> the more used, the more accurate the model is
    # max_depth: maximum depth of tree --> higher number makes model more complex, but too high can cause overfitting
    # learning_rate: quantifies each tree's contribution to total prediction --> lower number takes longer, but can lead to better generalization
    # objective: binary:logistic outputs probabilities. if classification is wanted, use binary:hinge
    bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic', enable_categorical=True)
    
    # fit model with the training data
    bst.fit(X_train, y_train)
    
    # make predictions for the test dataset
    preds = bst.predict(X_test)
    
    # print predictions
    # print(preds)
    
    # print model Accuracy (how often the classifier is correct)
    # print("Accuracy:",metrics.accuracy_score(y_test, preds))

    # Assign the accuracy directly to array
    f1_scores_array[i] = skm.f1_score(y_test, preds)  


In [116]:
print(f1_scores_array)

[0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148
 0.72131148 0.72131148 0.72131148 0.72131148 0.72131148 0.7213

### Saving full training and augmented dataset combo 

In [7]:
# Save dataframe 
# new_train_combined.to_csv('new_train_combined.csv', index = False)

# Timing 1 run of SMOTE Aug & XGBoost

In [147]:
# import time

# start_time = time.time()
# ##################################### Imbalancing: Class 0 Majority #####################################
# # Assuming df_table is your DataFrame and the label column is at index 150
# # Create a new label column with 300 zeros and 200 ones
# label1 = np.array([0] * 250 + [1] * 150)

# # Shuffle randomly the new label column
# np.random.seed(1)
# np.random.shuffle(label1)

# # Replace the label column with the new imbalanced labels
# train_combined['label_0_majority'] = label1
# #print(train_combined['label_0_majority'].value_counts())

# ##################################### Imbalancing: Class 1 Majority #####################################

# # Create a second imbalanced label column with 200 zeros and 300 ones
# label2 = np.array([0] * 150 + [1] * 250)

# # Shuffle randomly the new label column
# np.random.seed(1)
# np.random.shuffle(label2)

# # Add the new label column to the DataFrame
# train_combined['label_1_majority'] = label2

# #print(train_combined['label_1_majority'].value_counts())

# ##################################### Subsets #####################################

# # Subset for the 1 majority
# df_imbalance_0 = train_combined.drop(columns=[150, 'label_1_majority']) # Drop original label in Col 150 & label_1_majority
# df_imbalance_0.head()

# # Subset for the 0 majority
# df_imbalance_1 = train_combined.drop(columns=[150, 'label_0_majority']) # Drop original label in Col 150 & label_1_majority
# df_imbalance_1.head()

# ##################################### Data Augmentation: SMOTE #####################################

# ####### SMOTE applied to label_0_majority class

# # Creating x and y 
# x=df_imbalance_0.drop(['label_0_majority'],axis=1)
# y=df_imbalance_0['label_0_majority'] # creating imbalance

# # Applying SMOTE to balance classes
# smote=SMOTE(sampling_strategy='minority')  # generating synthetic samples for minority class
# x,y=smote.fit_resample(x,y) # re-sampling

# ####### Creating a new training set with the new 100 rows (Class Majority 0) we augmented and the old training set

# # Re-combine balanced x and y into a dataframe
# new_train_combined_0 = pd.concat([x, y], axis = 1)

# # Re-naming the 'label_0_majority' label by its index 150, which creates a duplicate, & dropping the remaining 'label_0_majority'
# new_train_combined_0[150] = new_train_combined_0['label_0_majority']
# new_train_combined_0 = new_train_combined_0.drop(columns=['label_0_majority']) 

# # Sanity checks: making sure the right columns are there & the correct # of class values
# # print(new_train_combined_0[150].value_counts()) 
# # new_train_combined_0.head()

# # Subset the last 100 rows that were augmented in new_train_combined_0
# aug_0 = new_train_combined_0.tail(100)

# # Dropping extra labels to have right dimensions
# train_combined = train_combined.drop(columns=['label_0_majority', 'label_1_majority']) 

# # Checking it concatenated properly
# # Check the shape of the original dataFrames
# # print("Shape of train_combined:", train_combined.shape)
# # print("Shape of aug_0:", aug_0.shape)

# # Concatenating the dataframes
# train_combined_PRE_FINAL = pd.concat([train_combined, aug_0], ignore_index=True)

# # Check the shape of the combined dataframe
# # print("Shape of train_combined_PRE_FINAL:", train_combined_PRE_FINAL.shape)

# # Print the combined dataframe
# # print(train_combined_PRE_FINAL)

# ####### SMOTE applied to label_1_majority class

# # Creating x and y 
# x=df_imbalance_1.drop(['label_1_majority'],axis=1)
# y=df_imbalance_1['label_1_majority'] # creating imbalance

# # Applying SMOTE to balance classes
# smote=SMOTE(sampling_strategy='minority')  # generating synthetic samples for minority class
# x,y=smote.fit_resample(x,y) # re-sampling

# ####### Creating a new training set with the new 100 rows (Class Majority 1) we augmented and the old training set

# # Re-combine balanced x and y into a dataframe
# new_train_combined_1 = pd.concat([x, y], axis = 1)

# # Re-naming the 'label_1_majority' label by its index 150, which creates a duplicate, & dropping the remaining 'label_1_majority'
# new_train_combined_1[150] = new_train_combined_1['label_1_majority']
# new_train_combined_1 = new_train_combined_1.drop(columns=['label_1_majority']) 

# # Sanity checks: making sure the right columns are there & the correct # of class values
# # print(new_train_combined_1[150].value_counts()) 
# # new_train_combined_1.head()

# # Subset the last 100 rows that were augmented in new_train_combined_0
# aug_1 = new_train_combined_1.tail(100)

# # Checking it concatenated properly
# # Check the shape of the original dataFrames
# # print("Shape of train_combined:", train_combined_PRE_FINAL.shape)
# # print("Shape of aug_0:", aug_1.shape)

# # Concatenating the DataFrames
# new_train_combined = pd.concat([train_combined_PRE_FINAL, aug_1], ignore_index=True)

# # Check the shape of the combined dataFrame
# # print("Shape of new_train_combined:", new_train_combined.shape)

# # Print the combined dataFrame
# # print(new_train_combined)

# ##################################### Classifier: XGBOOST #####################################

# # Prepping our new dataset by 'splitting' it
# X_train = new_train_combined.iloc[:,0:150]
# y_train = new_train_combined.iloc[:,-1]

# # First, put this prompt: "conda install -c conda-forge py-xgboost" in anaconda to download xgboost package
# # install xgboost in jupyter
# # !pip install xgboost

# # create model instance
# # n_estimators: number of trees(estimators) the model uses --> the more used, the more accurate the model is
# # max_depth: maximum depth of tree --> higher number makes model more complex, but too high can cause overfitting
# # learning_rate: quantifies each tree's contribution to total prediction --> lower number takes longer, but can lead to better generalization
# # objective: binary:logistic outputs probabilities. if classification is wanted, use binary:hinge
# bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic', enable_categorical=True)

# # fit model with the training data
# bst.fit(X_train, y_train)

# # make predictions for the test dataset
# preds = bst.predict(X_test)

# # print predictions
# # print(preds)

# # print model Accuracy (how often the classifier is correct)
# # print("Accuracy:",metrics.accuracy_score(y_test, preds))

# # Assign the accuracy directly to array
# skm.f1_score(y_test, preds)

# end_time = time.time()
# elapsed_time = end_time - start_time

# print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 0.30624890327453613 seconds


NameError: name 'result' is not defined