In [1]:
import pandas as pd  # import pandas library for data manipulation
from sklearn.naive_bayes import GaussianNB  # import Gaussian Naive Bayes model from scikit-learn
from sklearn.tree import DecisionTreeClassifier  # import Decision Tree Classifier from scikit-learn
from sklearn.metrics import f1_score, precision_score, recall_score  # import performance metrics from scikit-learn
from sklearn import preprocessing  # import preprocessing module for data preprocessing

def buildClassifiers(clf, X_train, X_test, y_train, y_test):
    # train the classifier with training data
    clf.fit(X_train, y_train)
    
    # make predictions on the test data
    y_pred = clf.predict(X_test)

    # evaluate the performance of the classifier using precision, recall, and f1 score
    evaluation_metrics = {
        'f1': f1_score(y_test, y_pred, average="micro"),
        'precision': precision_score(y_test, y_pred, average="micro"),
        'recall': recall_score(y_test, y_pred, average="micro")
    }
    return evaluation_metrics  # return the evaluation metrics as a dictionary


In [9]:
# load data from CSV files
X_train = pd.read_csv("X_train.csv")
X_validation = pd.read_csv("validation.csv")
X_test = pd.read_csv("test.csv")

y_train = pd.read_csv("first_label")
y_validation = pd.read_csv("validation_first_label")
y_test = pd.read_csv("test_first_label")

# reset indices to ensure data alignment after concatenation
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_validation.reset_index(drop=True, inplace=True)

# concatenate training and validation datasets for a larger training set
X_train = pd.concat([X_train, X_validation])
y_train = pd.concat([y_train, y_validation])

# create a copy of the combined training data for modifications without affecting the original data
df = X_train.copy()

# define classifier models for comparison
classifiers = {
    'Naive_Bayes': GaussianNB(),
    'Decision_Tree': DecisionTreeClassifier(random_state=0),
    # 'Random_Forest': RandomForestClassifier(random_state=0)  # uncomment to use
}

def build_classifiers(clf, X_train_enc, X_test_enc, y_train, y_test):
    # train classifier on the encoded training data
    clf.fit(X_train_enc, y_train)
    # predict using the trained classifier on the encoded test data
    y_pred = clf.predict(X_test_enc)

    # evaluate the classifier performance using f1, precision, and recall
    f1 = f1_score(y_test, y_pred, average="micro")
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")

    return f1, precision, recall  # return the performance metrics

# feature engineering: removing less relevant columns
for i in range(1, 8):
    column_name = "Column " + str(i)
    df[column_name] = "_"  # set the value to a placeholder to simulate removal
    print(f"Removing {column_name}")

    # apply ordinal encoding to handle categorical variables in a way that models can interpret
    le = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    le.fit(df.values.ravel().reshape(-1, 1))  # fit the encoder to the full dataset to capture all categories

    # transform the training and test datasets to numerical format using the fitted encoder
    X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
    X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)

    for name, clf in classifiers.items():
        print('Now classifying', name)  # notify current classifier in use
        f1, precision, recall = build_classifiers(clf, X_train_enc, X_test_enc, y_train, y_test)  # build and evaluate classifiers

        # print evaluation scores for the current classifier
        print("\tAverage F1 for {}:\t\t".format(name), f1)
        print("\tAverage Precision for {}:\t".format(name), precision)
        print("\tAverage Recall for {}:\t\t".format(name), recall)


Removing Column 1


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 2


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 3


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 4


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38009904780723963
	Average Precision for Naive_Bayes:	 0.38009904780723963
	Average Recall for Naive_Bayes:		 0.38009904780723963
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7772330011156258
	Average Precision for Decision_Tree:	 0.7772330011156258
	Average Recall for Decision_Tree:		 0.7772330011156258
Removing Column 5


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.3800537641048441
	Average Precision for Naive_Bayes:	 0.3800537641048441
	Average Recall for Naive_Bayes:		 0.3800537641048441
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7762902767657557
	Average Precision for Decision_Tree:	 0.7762902767657557
	Average Recall for Decision_Tree:		 0.7762902767657557
Removing Column 6


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.3789010880438676
	Average Precision for Naive_Bayes:	 0.3789010880438676
	Average Recall for Naive_Bayes:		 0.3789010880438676
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7843631258928093
	Average Precision for Decision_Tree:	 0.7843631258928093
	Average Recall for Decision_Tree:		 0.7843631258928093
Removing Column 7
Now classifying Naive_Bayes


  y = column_or_1d(y, warn=True)


	Average F1 for Naive_Bayes:		 0.08410830214932918
	Average Precision for Naive_Bayes:	 0.08410830214932918
	Average Recall for Naive_Bayes:		 0.08410830214932918
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.354126786133307
	Average Precision for Decision_Tree:	 0.354126786133307
	Average Recall for Decision_Tree:		 0.354126786133307


In [6]:
# load data from CSV files
X_train = pd.read_csv("X_train.csv")
X_validation = pd.read_csv("validation.csv")
X_test = pd.read_csv("test.csv")

y_train = pd.read_csv("first_label")
y_validation = pd.read_csv("validation_first_label")
y_test = pd.read_csv("test_first_label")

# reset indices to ensure data alignment after concatenation
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_validation.reset_index(drop=True, inplace=True)

# concatenate training and validation datasets for a larger training set
X_train = pd.concat([X_train, X_validation])
y_train = pd.concat([y_train, y_validation])

# create a copy of the combined training data for modifications without affecting the original data
df = X_train.copy()

# define classifier models for comparison
classifiers = {
    'Naive_Bayes': GaussianNB(),
    'Decision_Tree': DecisionTreeClassifier(random_state=0),
    # 'Random_Forest': RandomForestClassifier(random_state=0)  # uncomment to use
}

def build_classifiers(clf, X_train_enc, X_test_enc, y_train, y_test):
    # train classifier on the encoded training data
    clf.fit(X_train_enc, y_train)
    # predict using the trained classifier on the encoded test data
    y_pred = clf.predict(X_test_enc)

    # evaluate the classifier performance using f1, precision, and recall
    f1 = f1_score(y_test, y_pred, average="micro")
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")

    return f1, precision, recall  # return the performance metrics

# feature engineering: removing less relevant columns
for i in range(1, 10):
    column_name = "Column " + str(i)
    df[column_name] = "_"  # set the value to a placeholder to simulate removal
    print(f"Removing {column_name}")

    # apply ordinal encoding to handle categorical variables in a way that models can interpret
    le = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    le.fit(df.values.ravel().reshape(-1, 1))  # fit the encoder to the full dataset to capture all categories

    # transform the training and test datasets to numerical format using the fitted encoder
    X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
    X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)

    for name, clf in classifiers.items():
        print('Now classifying', name)  # notify current classifier in use
        f1, precision, recall = build_classifiers(clf, X_train_enc, X_test_enc, y_train, y_test)  # build and evaluate classifiers

        # print evaluation scores for the current classifier
        print("\tAverage F1 for {}:\t\t".format(name), f1)
        print("\tAverage Precision for {}:\t".format(name), precision)
        print("\tAverage Recall for {}:\t\t".format(name), recall)


Removing Column 1


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 2


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 3


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 4


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38009904780723963
	Average Precision for Naive_Bayes:	 0.38009904780723963
	Average Recall for Naive_Bayes:		 0.38009904780723963
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7772330011156258
	Average Precision for Decision_Tree:	 0.7772330011156258
	Average Recall for Decision_Tree:		 0.7772330011156258
Removing Column 5


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.3800537641048441
	Average Precision for Naive_Bayes:	 0.3800537641048441
	Average Recall for Naive_Bayes:		 0.3800537641048441
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7762902767657557
	Average Precision for Decision_Tree:	 0.7762902767657557
	Average Recall for Decision_Tree:		 0.7762902767657557
Removing Column 6


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.3789010880438676
	Average Precision for Naive_Bayes:	 0.3789010880438676
	Average Recall for Naive_Bayes:		 0.3789010880438676
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7843631258928093
	Average Precision for Decision_Tree:	 0.7843631258928093
	Average Recall for Decision_Tree:		 0.7843631258928093
Removing Column 7
Now classifying Naive_Bayes


  y = column_or_1d(y, warn=True)


	Average F1 for Naive_Bayes:		 0.08410830214932918
	Average Precision for Naive_Bayes:	 0.08410830214932918
	Average Recall for Naive_Bayes:		 0.08410830214932918
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.354126786133307
	Average Precision for Decision_Tree:	 0.354126786133307
	Average Recall for Decision_Tree:		 0.354126786133307
Removing Column 8
Now classifying Naive_Bayes


  y = column_or_1d(y, warn=True)


	Average F1 for Naive_Bayes:		 0.08410830214932918
	Average Precision for Naive_Bayes:	 0.08410830214932918
	Average Recall for Naive_Bayes:		 0.08410830214932918
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.354126786133307
	Average Precision for Decision_Tree:	 0.354126786133307
	Average Recall for Decision_Tree:		 0.354126786133307
Removing Column 9
Now classifying Naive_Bayes


  y = column_or_1d(y, warn=True)


	Average F1 for Naive_Bayes:		 0.08410830214932918
	Average Precision for Naive_Bayes:	 0.08410830214932918
	Average Recall for Naive_Bayes:		 0.08410830214932918
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.354126786133307
	Average Precision for Decision_Tree:	 0.354126786133307
	Average Recall for Decision_Tree:		 0.354126786133307


In [7]:
# load data from CSV files
X_train = pd.read_csv("X_train.csv")
X_validation = pd.read_csv("validation.csv")
X_test = pd.read_csv("test.csv")

y_train = pd.read_csv("first_label")
y_validation = pd.read_csv("validation_first_label")
y_test = pd.read_csv("test_first_label")

# reset indices to ensure data alignment after concatenation
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_validation.reset_index(drop=True, inplace=True)

# concatenate training and validation datasets for a larger training set
X_train = pd.concat([X_train, X_validation])
y_train = pd.concat([y_train, y_validation])

# create a copy of the combined training data for modifications without affecting the original data
df = X_train.copy()

# define classifier models for comparison
classifiers = {
    'Naive_Bayes': GaussianNB(),
    'Decision_Tree': DecisionTreeClassifier(random_state=0),
    # 'Random_Forest': RandomForestClassifier(random_state=0)  # uncomment to use
}

def build_classifiers(clf, X_train_enc, X_test_enc, y_train, y_test):
    # train classifier on the encoded training data
    clf.fit(X_train_enc, y_train)
    # predict using the trained classifier on the encoded test data
    y_pred = clf.predict(X_test_enc)

    # evaluate the classifier performance using f1, precision, and recall
    f1 = f1_score(y_test, y_pred, average="micro")
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")

    return f1, precision, recall  # return the performance metrics

# feature engineering: removing less relevant columns
for i in range(1, 6):
    column_name = "Column " + str(i)
    df[column_name] = "_"  # set the value to a placeholder to simulate removal
    print(f"Removing {column_name}")

    # apply ordinal encoding to handle categorical variables in a way that models can interpret
    le = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    le.fit(df.values.ravel().reshape(-1, 1))  # fit the encoder to the full dataset to capture all categories

    # transform the training and test datasets to numerical format using the fitted encoder
    X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
    X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)

    for name, clf in classifiers.items():
        print('Now classifying', name)  # notify current classifier in use
        f1, precision, recall = build_classifiers(clf, X_train_enc, X_test_enc, y_train, y_test)  # build and evaluate classifiers

        # print evaluation scores for the current classifier
        print("\tAverage F1 for {}:\t\t".format(name), f1)
        print("\tAverage Precision for {}:\t".format(name), precision)
        print("\tAverage Recall for {}:\t\t".format(name), recall)


Removing Column 1


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 2


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 3


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 4


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38009904780723963
	Average Precision for Naive_Bayes:	 0.38009904780723963
	Average Recall for Naive_Bayes:		 0.38009904780723963
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7772330011156258
	Average Precision for Decision_Tree:	 0.7772330011156258
	Average Recall for Decision_Tree:		 0.7772330011156258
Removing Column 5


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.3800537641048441
	Average Precision for Naive_Bayes:	 0.3800537641048441
	Average Recall for Naive_Bayes:		 0.3800537641048441
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7762902767657557
	Average Precision for Decision_Tree:	 0.7762902767657557
	Average Recall for Decision_Tree:		 0.7762902767657557


In [8]:
# load data from CSV files
X_train = pd.read_csv("X_train.csv")
X_validation = pd.read_csv("validation.csv")
X_test = pd.read_csv("test.csv")

y_train = pd.read_csv("first_label")
y_validation = pd.read_csv("validation_first_label")
y_test = pd.read_csv("test_first_label")

# reset indices to ensure data alignment after concatenation
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_validation.reset_index(drop=True, inplace=True)

# concatenate training and validation datasets for a larger training set
X_train = pd.concat([X_train, X_validation])
y_train = pd.concat([y_train, y_validation])

# create a copy of the combined training data for modifications without affecting the original data
df = X_train.copy()

# define classifier models for comparison
classifiers = {
    'Naive_Bayes': GaussianNB(),
    'Decision_Tree': DecisionTreeClassifier(random_state=0),
    # 'Random_Forest': RandomForestClassifier(random_state=0)  # uncomment to use
}

def build_classifiers(clf, X_train_enc, X_test_enc, y_train, y_test):
    # train classifier on the encoded training data
    clf.fit(X_train_enc, y_train)
    # predict using the trained classifier on the encoded test data
    y_pred = clf.predict(X_test_enc)

    # evaluate the classifier performance using f1, precision, and recall
    f1 = f1_score(y_test, y_pred, average="micro")
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")

    return f1, precision, recall  # return the performance metrics

# feature engineering: removing less relevant columns
for i in range(1, 4):
    column_name = "Column " + str(i)
    df[column_name] = "_"  # set the value to a placeholder to simulate removal
    print(f"Removing {column_name}")

    # apply ordinal encoding to handle categorical variables in a way that models can interpret
    le = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    le.fit(df.values.ravel().reshape(-1, 1))  # fit the encoder to the full dataset to capture all categories

    # transform the training and test datasets to numerical format using the fitted encoder
    X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
    X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)

    for name, clf in classifiers.items():
        print('Now classifying', name)  # notify current classifier in use
        f1, precision, recall = build_classifiers(clf, X_train_enc, X_test_enc, y_train, y_test)  # build and evaluate classifiers

        # print evaluation scores for the current classifier
        print("\tAverage F1 for {}:\t\t".format(name), f1)
        print("\tAverage Precision for {}:\t".format(name), precision)
        print("\tAverage Recall for {}:\t\t".format(name), recall)


Removing Column 1


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 2


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
Removing Column 3


  X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1)).squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1)).squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38020608201290174
	Average Precision for Naive_Bayes:	 0.38020608201290174
	Average Recall for Naive_Bayes:		 0.38020608201290174
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7557767595805905
	Average Precision for Decision_Tree:	 0.7557767595805905
	Average Recall for Decision_Tree:		 0.7557767595805905
