In [1]:
import os
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
from collections import defaultdict
import tensorflow as tf
from tensorflow.keras.datasets import cifar10
import json

2024-11-11 13:22:17.466765: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


# Training Tokenization and Training Data

In [3]:
with open('/pixel_document_train_cifar10.txt', 'r') as file:
    pixel_document = file.read()
    print(type(pixel_document))

<class 'str'>


In [4]:
document_set = []
for i in range(0, len(pixel_document), 32):
  chunk = pixel_document[i:i+32]
  document_set.append(chunk)

In [5]:
document_set[:16]

['FXXF2m333mmmmmmmmmm33m3mmIIIIINI',
 '1Ar22mm222K22333mmK2333333333KK2',
 '1r22KKK22mm222222222mmmm33222FXX',
 '22333mKK33K2FF11KZr112m2333222Kr',
 '3K33KK33K3m111rrNN32rrr123333K33',
 'I33333mm3mrK2K333Kq3qMq223333333',
 'q3333mF222KM44iM555MMMMqK3333333',
 '333331F22qMI35z6886iiz68iq333333',
 '33333r2KM89qMii6hzzG549h6433333K',
 '32K33KZb8h44zzUII3qI45i66Im222m3',
 '323mm3UGz94344333I333IUUUIKK33Kq',
 '3K33333344455M33333333mmm3333345',
 'qK333333444UGU943K3332kKK3333qMI',
 'M43KKm3333mK3I33333333333333M5IF',
 '5444qq333333333333333Km333mFI411',
 '4444MMMMqqqqqq3qq33333KKKKrLqMKK']

In [6]:
with open('ImageNet_ListVocabs_.json', 'r') as file:
    list_of_vocabs = json.load(file)
list_of_vocabs[10]

'2222m'

In [8]:
len(list_of_vocabs)

20736376

In [7]:
def build_trie(priority_lists):
    trie = {}
    for word in priority_lists:
        node = trie
        for char in word:
            node = node.setdefault(char, {})
        node['$'] = word
    return trie

def match_and_extract(text, trie):
    result = []
    i = 0
    while i < len(text):
        node = trie
        end = i
        last_match = None
        for j in range(i, len(text)):
            if text[j] not in node:
                break
            node = node[text[j]]
            if '$' in node:
                last_match = node['$']
                end = j + 1
        if last_match:
            result.append(last_match)
            i = end
        else:
            i += 1
    return result

def preprocess_vocabs(list_of_vocabs):
    trie = build_trie(list_of_vocabs)
    vocab_dict = defaultdict(list)
    for word in list_of_vocabs:
        if word:
            vocab_dict[word[0]].append(word)
    return trie, vocab_dict

def tokenize_documents(document_set, list_of_vocabs):
    trie, vocab_dict = preprocess_vocabs(list_of_vocabs)
    tokenized_documents = []

    for document in tqdm(document_set):
        tokenized_documents.append(match_and_extract(document, trie))

    return tokenized_documents

# Usage
tokenized_documents = tokenize_documents(document_set, list_of_vocabs)

100%|██████████| 800000/800000 [00:07<00:00, 106979.15it/s]


In [9]:
len(tokenized_documents)

800000

In [10]:
document = []
for i in range(0, len(tokenized_documents), 16):
  group = tokenized_documents[i:i+16]
  tokens = ''
  for j in group:
    tokens = ' '.join(j)
    document.append(tokens)

training_document = []
for i in range(0, len(document), 16):
  group = document[i:i+16]
  tokens = ' '.join(group)
  training_document.append(tokens)

In [11]:
document[:16]

['FXXF2 m333m mmmmm mmmm3 3m3mm IIIII NI',
 '1Ar22 mm222 K2233 3mmK2 3333K K2',
 '1r22K KK22m m2222 2222m mmm33 222FX',
 '22333 mKK33 K2FF1 1KZr1 12m23 33222 Kr',
 '3K33K K33K3 m111r rNN32 rrr12 3333K',
 'I3333 3mm3m rK2K3 33Kq3 qMq22',
 'q3333 mF222 KM44i M555M MMMqK',
 '33331 F22qM I35z6 886ii z68iq',
 '3333r 2KM89 qMii6 hzzG5 49h64 3333K',
 '32K33 KZb8h 44zzU II3qI 45i66 Im222 m3',
 '323mm 3UGz9 43443 33I33 3IUUU IKK33 Kq',
 '3K333 33344 455M3 3333m mm333 3345',
 'qK333 33344 4UGU9 43K33 32kKK 3333q MI',
 'M43KK m3333 mK3I3 3333M 5IF',
 '5444q q3333 3333K m333m FI411',
 '4444M MMMqq qqqq3 qq333 33KKK KrLqM KK']

In [12]:
training_document[0]

'FXXF2 m333m mmmmm mmmm3 3m3mm IIIII NI 1Ar22 mm222 K2233 3mmK2 3333K K2 1r22K KK22m m2222 2222m mmm33 222FX 22333 mKK33 K2FF1 1KZr1 12m23 33222 Kr 3K33K K33K3 m111r rNN32 rrr12 3333K I3333 3mm3m rK2K3 33Kq3 qMq22 q3333 mF222 KM44i M555M MMMqK 33331 F22qM I35z6 886ii z68iq 3333r 2KM89 qMii6 hzzG5 49h64 3333K 32K33 KZb8h 44zzU II3qI 45i66 Im222 m3 323mm 3UGz9 43443 33I33 3IUUU IKK33 Kq 3K333 33344 455M3 3333m mm333 3345 qK333 33344 4UGU9 43K33 32kKK 3333q MI M43KK m3333 mK3I3 3333M 5IF 5444q q3333 3333K m333m FI411 4444M MMMqq qqqq3 qq333 33KKK KrLqM KK'

In [13]:
len(training_document)

50000

# Testing Tokenization and Testing Data

In [15]:
with open('/pixel_document_test_cifar10.txt', 'r') as file:
    result_string_test = file.read()
    print(type(result_string_test))

<class 'str'>


In [16]:
test_document_set = []
for i in range(0, len(result_string_test), 32):
  chunk = result_string_test[i:i+32]
  test_document_set.append(chunk)

In [17]:
test_document_set[:16]

['33333333333333333333333333333332',
 '3333333333qqq33333mmm222mmm33333',
 '3333333333IGI3mg22233333K222m333',
 '3312333333333qK2K22I43333221Fm33',
 'MqKq4333Mblq44322222343K22221Fm3',
 '5345M3q3Uz545M33K21Ki44m23K221r2',
 '5345II43244q56Mq33r4543222mKK222',
 '5q55q234K3qif94II32494Im2m333mK3',
 '6M555K34qKIUII33m3qINIIm22m3K333',
 'z555542II4q233qMKmI3qmFF12K3q333',
 'I55554m4fuZZZl55MqK2F1rr22333mmm',
 'm9955Ml87hGNNNNNm2222222222mm222',
 'K2mI94i7YN22222222222222K3333KK3',
 '32222I6hIK222222222222mmm22233mm',
 '322222mUm222mmK22K2222222KK33212',
 '33222222222222FF2m333222334Im222']

In [18]:
len(test_document_set)

160000

In [19]:
test_tokenized_documents = tokenize_documents(test_document_set, list_of_vocabs)

100%|██████████| 160000/160000 [00:01<00:00, 108237.07it/s]


In [20]:
document = []
for i in range(0, len(test_tokenized_documents), 16):
  group = test_tokenized_documents[i:i+16]
  tokens = ''
  for j in group:
    tokens = ' '.join(j)
    document.append(tokens)

testing_document = []
for i in range(0, len(document), 16):
  group = document[i:i+16]
  tokens = ' '.join(group)
  testing_document.append(tokens)

In [21]:
document[:16]

['33332',
 '3333q qq333 33mmm 222mm m3333',
 '3333I GI3mg 22233 333K2 22m33',
 '33123 3333q K2K22 I4333 3221F m33',
 'MqKq4 333Mb lq443 22223 43K22 221Fm',
 '5345M 3q3Uz 545M3 3K21K i44m2 3K221 r2',
 '5345I I4324 4q56M q33r4 54322 2mKK2',
 '5q55q 234K3 qif94 II324 94Im2 m333m K3',
 '6M555 K34qK IUII3 3m3qI NIIm2 2m3K3',
 'z5555 42II4 q233q MKmI3 qmFF1 2K3q3',
 'I5555 4m4f uZZZl 55MqK 2F1rr 22333 mmm',
 'm9955 Ml87h GNNNN Nm222 2222m m222',
 'K2mI9 4i7YN 2222K 3333K K3',
 '32222 I6hI K2222 2222m mm222 33mm',
 '32222 2mUm2 22mmK 22K22 2222K K3321',
 '33222 2222F F2m33 32223 34Im2']

In [22]:
testing_document[0]

'33332 3333q qq333 33mmm 222mm m3333 3333I GI3mg 22233 333K2 22m33 33123 3333q K2K22 I4333 3221F m33 MqKq4 333Mb lq443 22223 43K22 221Fm 5345M 3q3Uz 545M3 3K21K i44m2 3K221 r2 5345I I4324 4q56M q33r4 54322 2mKK2 5q55q 234K3 qif94 II324 94Im2 m333m K3 6M555 K34qK IUII3 3m3qI NIIm2 2m3K3 z5555 42II4 q233q MKmI3 qmFF1 2K3q3 I5555 4m4f uZZZl 55MqK 2F1rr 22333 mmm m9955 Ml87h GNNNN Nm222 2222m m222 K2mI9 4i7YN 2222K 3333K K3 32222 I6hI K2222 2222m mm222 33mm 32222 2mUm2 22mmK 22K22 2222K K3321 33222 2222F F2m33 32223 34Im2'

In [23]:
len(testing_document)

10000

In [24]:
with open('training_document_cifar10_imagenet.json', 'w') as file:
    json.dump(training_document, file)
with open('testing_document_cifar10_imagenet.json', 'w') as file:
    json.dump(testing_document, file)

# Classification

In [26]:
y_train = np.load('/Cifar 10/y_train_cifar10.npy')

In [27]:
y_test = np.load('/y_test_cifar10.npy')

In [28]:
y_train = y_train.tolist()
y_train = [item for sublist in y_train for item in sublist]
y_train

[6,
 9,
 9,
 4,
 1,
 1,
 2,
 7,
 8,
 3,
 4,
 7,
 7,
 2,
 9,
 9,
 9,
 3,
 2,
 6,
 4,
 3,
 6,
 6,
 2,
 6,
 3,
 5,
 4,
 0,
 0,
 9,
 1,
 3,
 4,
 0,
 3,
 7,
 3,
 3,
 5,
 2,
 2,
 7,
 1,
 1,
 1,
 2,
 2,
 0,
 9,
 5,
 7,
 9,
 2,
 2,
 5,
 2,
 4,
 3,
 1,
 1,
 8,
 2,
 1,
 1,
 4,
 9,
 7,
 8,
 5,
 9,
 6,
 7,
 3,
 1,
 9,
 0,
 3,
 1,
 3,
 5,
 4,
 5,
 7,
 7,
 4,
 7,
 9,
 4,
 2,
 3,
 8,
 0,
 1,
 6,
 1,
 1,
 4,
 1,
 8,
 3,
 9,
 6,
 6,
 1,
 8,
 5,
 2,
 9,
 9,
 8,
 1,
 7,
 7,
 0,
 0,
 6,
 9,
 1,
 2,
 2,
 9,
 2,
 6,
 6,
 1,
 9,
 5,
 0,
 4,
 7,
 6,
 7,
 1,
 8,
 1,
 1,
 2,
 8,
 1,
 3,
 3,
 6,
 2,
 4,
 9,
 9,
 5,
 4,
 3,
 6,
 7,
 4,
 6,
 8,
 5,
 5,
 4,
 3,
 1,
 8,
 4,
 7,
 6,
 0,
 9,
 5,
 1,
 3,
 8,
 2,
 7,
 5,
 3,
 4,
 1,
 5,
 7,
 0,
 4,
 7,
 5,
 5,
 1,
 0,
 9,
 6,
 9,
 0,
 8,
 7,
 8,
 8,
 2,
 5,
 2,
 3,
 5,
 0,
 6,
 1,
 9,
 3,
 6,
 9,
 1,
 3,
 9,
 6,
 6,
 7,
 1,
 0,
 9,
 5,
 8,
 5,
 2,
 9,
 0,
 8,
 8,
 0,
 6,
 9,
 1,
 1,
 6,
 3,
 7,
 6,
 6,
 0,
 6,
 6,
 1,
 7,
 1,
 5,
 8,
 3,
 6,
 6,
 8,
 6,
 8,
 4,
 6,
 6,


In [29]:
y_test = y_test.tolist()
y_test = [item for sublist in y_test for item in sublist]
y_test

[3,
 8,
 8,
 0,
 6,
 6,
 1,
 6,
 3,
 1,
 0,
 9,
 5,
 7,
 9,
 8,
 5,
 7,
 8,
 6,
 7,
 0,
 4,
 9,
 5,
 2,
 4,
 0,
 9,
 6,
 6,
 5,
 4,
 5,
 9,
 2,
 4,
 1,
 9,
 5,
 4,
 6,
 5,
 6,
 0,
 9,
 3,
 9,
 7,
 6,
 9,
 8,
 0,
 3,
 8,
 8,
 7,
 7,
 4,
 6,
 7,
 3,
 6,
 3,
 6,
 2,
 1,
 2,
 3,
 7,
 2,
 6,
 8,
 8,
 0,
 2,
 9,
 3,
 3,
 8,
 8,
 1,
 1,
 7,
 2,
 5,
 2,
 7,
 8,
 9,
 0,
 3,
 8,
 6,
 4,
 6,
 6,
 0,
 0,
 7,
 4,
 5,
 6,
 3,
 1,
 1,
 3,
 6,
 8,
 7,
 4,
 0,
 6,
 2,
 1,
 3,
 0,
 4,
 2,
 7,
 8,
 3,
 1,
 2,
 8,
 0,
 8,
 3,
 5,
 2,
 4,
 1,
 8,
 9,
 1,
 2,
 9,
 7,
 2,
 9,
 6,
 5,
 6,
 3,
 8,
 7,
 6,
 2,
 5,
 2,
 8,
 9,
 6,
 0,
 0,
 5,
 2,
 9,
 5,
 4,
 2,
 1,
 6,
 6,
 8,
 4,
 8,
 4,
 5,
 0,
 9,
 9,
 9,
 8,
 9,
 9,
 3,
 7,
 5,
 0,
 0,
 5,
 2,
 2,
 3,
 8,
 6,
 3,
 4,
 0,
 5,
 8,
 0,
 1,
 7,
 2,
 8,
 8,
 7,
 8,
 5,
 1,
 8,
 7,
 1,
 3,
 0,
 5,
 7,
 9,
 7,
 4,
 5,
 9,
 8,
 0,
 7,
 9,
 8,
 2,
 7,
 6,
 9,
 4,
 3,
 9,
 6,
 4,
 7,
 6,
 5,
 1,
 5,
 8,
 8,
 0,
 4,
 0,
 5,
 5,
 1,
 1,
 8,
 9,
 0,
 3,
 1,
 9,
 2,
 2,


In [30]:
unique_elements = set(y_train)
unique_elements

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

## Feature-Based Naive Bayes

In [31]:
from collections import defaultdict
import math

class NaiveBayes:
    def __init__(self):
        self.class_counts = defaultdict(int)
        self.feature_counts = defaultdict(lambda: defaultdict(int))
        self.vocab = set()

    def train(self, documents, labels):
        for doc, label in zip(documents, labels):
            self.class_counts[label] += 1
            features = self.extract_features(doc)
            for feature in features:
                self.feature_counts[label][feature] += 1
                self.vocab.add(feature)

    def extract_features(self, document):
        return document.split()

    def predict(self, document):
        features = self.extract_features(document)
        best_label = None
        best_score = float('-inf')

        for label in self.class_counts:
            score = math.log(self.class_counts[label])
            for feature in features:
                if feature in self.vocab:
                    score += math.log((self.feature_counts[label][feature] + 1) /
                                      (self.class_counts[label] + len(self.vocab)))
            if score > best_score:
                best_score = score
                best_label = label

        return best_label

    def evaluate(self, test_documents, test_labels):
        predictions = [self.predict(doc) for doc in test_documents]

        # Calculate confusion matrix
        confusion_matrix = defaultdict(lambda: defaultdict(int))
        for true_label, pred_label in zip(test_labels, predictions):
            confusion_matrix[true_label][pred_label] += 1

        # Calculate metrics
        accuracy = sum(1 for true, pred in zip(test_labels, predictions) if true == pred) / len(test_labels)

        metrics = {}
        for label in set(test_labels):
            tp = confusion_matrix[label][label]
            fp = sum(confusion_matrix[other][label] for other in confusion_matrix if other != label)
            fn = sum(confusion_matrix[label][other] for other in confusion_matrix[label] if other != label)

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            metrics[label] = {
                'precision': precision,
                'recall': recall,
                'f1': f1
            }

        # Calculate macro-averaged metrics
        macro_precision = sum(m['precision'] for m in metrics.values()) / len(metrics)
        macro_recall = sum(m['recall'] for m in metrics.values()) / len(metrics)
        macro_f1 = sum(m['f1'] for m in metrics.values()) / len(metrics)

        return {
            'accuracy': accuracy,
            'class_metrics': metrics,
            'macro_precision': macro_precision,
            'macro_recall': macro_recall,
            'macro_f1': macro_f1
        }


classifier = NaiveBayes()
classifier.train(training_document, y_train)

evaluation_results = classifier.evaluate(testing_document, y_test)

print("Evaluation Results:")
print(f"Accuracy: {evaluation_results['accuracy']:.2f}")
print("\nClass-wise Metrics:")
for label, metrics in evaluation_results['class_metrics'].items():
    print(f"  {label}:")
    print(f"    Precision: {metrics['precision']:.2f}")
    print(f"    Recall: {metrics['recall']:.2f}")
    print(f"    F1-score: {metrics['f1']:.2f}")
print("\nMacro-averaged Metrics:")
print(f"Precision: {evaluation_results['macro_precision']:.2f}")
print(f"Recall: {evaluation_results['macro_recall']:.2f}")
print(f"F1-score: {evaluation_results['macro_f1']:.2f}")

Evaluation Results:
Accuracy: 0.30

Class-wise Metrics:
  0:
    Precision: 0.38
    Recall: 0.23
    F1-score: 0.28
  1:
    Precision: 0.41
    Recall: 0.35
    F1-score: 0.38
  2:
    Precision: 0.26
    Recall: 0.10
    F1-score: 0.14
  3:
    Precision: 0.22
    Recall: 0.23
    F1-score: 0.22
  4:
    Precision: 0.20
    Recall: 0.56
    F1-score: 0.30
  5:
    Precision: 0.34
    Recall: 0.27
    F1-score: 0.30
  6:
    Precision: 0.34
    Recall: 0.42
    F1-score: 0.37
  7:
    Precision: 0.43
    Recall: 0.18
    F1-score: 0.25
  8:
    Precision: 0.39
    Recall: 0.26
    F1-score: 0.31
  9:
    Precision: 0.35
    Recall: 0.40
    F1-score: 0.37

Macro-averaged Metrics:
Precision: 0.33
Recall: 0.30
F1-score: 0.29


## TF-IDF

In [32]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


# Step 3: Create TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Step 4: Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(training_document)

# Step 5: Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(testing_document)

### Naive Bayes

In [33]:
# Step 6: Create and train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Step 7: Make predictions on the test set
y_pred = nb_classifier.predict(X_test_tfidf)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.27

Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.15      0.23      1000
           1       0.48      0.20      0.28      1000
           2       0.22      0.12      0.16      1000
           3       0.23      0.23      0.23      1000
           4       0.17      0.72      0.28      1000
           5       0.37      0.23      0.28      1000
           6       0.49      0.22      0.30      1000
           7       0.56      0.07      0.12      1000
           8       0.29      0.46      0.35      1000
           9       0.34      0.28      0.31      1000

    accuracy                           0.27     10000
   macro avg       0.36      0.27      0.25     10000
weighted avg       0.36      0.27      0.25     10000



### SVM-linear

In [36]:
from sklearn.svm import SVC

# Step 6: Create and train the SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train_tfidf, y_train)

# Step 7: Make predictions on the test set
y_pred = svm_classifier.predict(X_test_tfidf)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.36

Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.47      0.41      1000
           1       0.41      0.42      0.42      1000
           2       0.22      0.22      0.22      1000
           3       0.23      0.22      0.22      1000
           4       0.35      0.35      0.35      1000
           5       0.33      0.31      0.32      1000
           6       0.43      0.48      0.45      1000
           7       0.45      0.32      0.37      1000
           8       0.40      0.40      0.40      1000
           9       0.43      0.40      0.41      1000

    accuracy                           0.36     10000
   macro avg       0.36      0.36      0.36     10000
weighted avg       0.36      0.36      0.36     10000



### SVM-rbf

In [35]:
from sklearn.svm import SVC

# Step 6: Create and train the SVM classifier
svm_classifier = SVC(kernel='rbf', random_state=42)
svm_classifier.fit(X_train_tfidf, y_train)

# Step 7: Make predictions on the test set
y_pred = svm_classifier.predict(X_test_tfidf)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.36

Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.44      0.41      1000
           1       0.39      0.40      0.40      1000
           2       0.27      0.23      0.25      1000
           3       0.26      0.19      0.22      1000
           4       0.34      0.34      0.34      1000
           5       0.34      0.34      0.34      1000
           6       0.37      0.52      0.44      1000
           7       0.39      0.32      0.35      1000
           8       0.44      0.41      0.42      1000
           9       0.40      0.42      0.41      1000

    accuracy                           0.36     10000
   macro avg       0.36      0.36      0.36     10000
weighted avg       0.36      0.36      0.36     10000



### Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

# Step 7: Make predictions on the test set
y_pred = lr_model.predict(X_test_tfidf)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.36

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.44      0.41      1000
           1       0.42      0.41      0.42      1000
           2       0.24      0.20      0.21      1000
           3       0.23      0.18      0.20      1000
           4       0.34      0.36      0.35      1000
           5       0.34      0.34      0.34      1000
           6       0.39      0.52      0.45      1000
           7       0.41      0.33      0.37      1000
           8       0.40      0.43      0.42      1000
           9       0.40      0.40      0.40      1000

    accuracy                           0.36     10000
   macro avg       0.36      0.36      0.36     10000
weighted avg       0.36      0.36      0.36     10000



### MLP

In [37]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler


# Step 6: Scale the features
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_tfidf)
X_test_scaled = scaler.transform(X_test_tfidf)

# Step 7: Create and train the MLP classifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100, 50),
                               max_iter=500,
                               activation='relu',
                               solver='adam',
                               random_state=42)
mlp_classifier.fit(X_train_scaled, y_train)

# Step 8: Make predictions on the test set
y_pred = mlp_classifier.predict(X_test_scaled)

# Step 9: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.23

Classification Report:
              precision    recall  f1-score   support

           0       0.27      0.16      0.20      1000
           1       0.29      0.18      0.22      1000
           2       0.16      0.12      0.14      1000
           3       0.19      0.22      0.21      1000
           4       0.22      0.45      0.29      1000
           5       0.23      0.26      0.25      1000
           6       0.31      0.23      0.27      1000
           7       0.20      0.18      0.19      1000
           8       0.31      0.25      0.27      1000
           9       0.23      0.28      0.25      1000

    accuracy                           0.23     10000
   macro avg       0.24      0.23      0.23     10000
weighted avg       0.24      0.23      0.23     10000



### Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_tfidf, y_train)

# Step 7: Make predictions on the test set
y_pred = rf_classifier.predict(X_test_tfidf)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.28

Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.60      0.35      1000
           1       0.33      0.32      0.32      1000
           2       0.22      0.26      0.24      1000
           3       0.23      0.14      0.17      1000
           4       0.29      0.26      0.27      1000
           5       0.28      0.12      0.17      1000
           6       0.37      0.36      0.36      1000
           7       0.29      0.11      0.16      1000
           8       0.29      0.36      0.32      1000
           9       0.32      0.29      0.31      1000

    accuracy                           0.28     10000
   macro avg       0.29      0.28      0.27     10000
weighted avg       0.29      0.28      0.27     10000



### XGBoost

In [39]:
import xgboost as xgb

xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train_tfidf, y_train)

y_pred = xgb_classifier.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.32

Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.48      0.40      1000
           1       0.38      0.33      0.35      1000
           2       0.24      0.23      0.24      1000
           3       0.22      0.17      0.19      1000
           4       0.27      0.29      0.28      1000
           5       0.30      0.28      0.29      1000
           6       0.36      0.43      0.39      1000
           7       0.34      0.23      0.27      1000
           8       0.35      0.42      0.38      1000
           9       0.36      0.33      0.34      1000

    accuracy                           0.32     10000
   macro avg       0.32      0.32      0.31     10000
weighted avg       0.32      0.32      0.31     10000

