In [1]:
import ast
from collections import Counter
import json
from scipy import sparse

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder

# Reading data

In [2]:
train_data = pd.read_csv("../data/train_data_cleaning.csv")
train_data['content_ready'] = train_data['content_ready'].apply(ast.literal_eval)

In [3]:
test_data = pd.read_csv("../data/test_data_cleaning.csv")
test_data['content_ready'] = test_data['content_ready'].apply(ast.literal_eval)

In [4]:
with open('../data/vocabulary.txt', 'r') as file:
    content = file.read()
    vocabulary = content.split('\n')

In [5]:
X_train_tfidf = sparse.load_npz('../data/X_train_tfidf.npz')
X_test_tfidf = sparse.load_npz('../data/X_test_tfidf.npz')

In [6]:
x_train_w2v = pd.read_csv('../data/x_train_w2v.csv')
x_test_w2v = pd.read_csv('../data/x_test_w2v.csv')

In [7]:
vectorized_train_data = np.load('../data/vectorized_train_data.npz')['data']
vectorized_test_data = np.load('../data/vectorized_test_data.npz')['data']

In [8]:
results = {}

# Naive bayes in hand

## Transforming data into numerical features using Bag-of-words

In [9]:
def create_frequency_table(documents, vocabulary) -> pd.DataFrame:
    """Create the bag of words matrix based on the list of documents.

    Args:
        docs_list (pd.Series): A series of list of words, where each row represent a document, 
        which contain a list of words.
        vocabulary (list): --
        is_list (bool, optional)
    Returns:
        pd.DataFrame: The bag of words table which contain the frequency of each word in each document.
    """
    
    # get all rows(documents) to a list of list instead of series
    documents = documents.tolist()

    # create a dictionary of vocabulary words with its indexes
    word_index = {word: i for i, word in enumerate(vocabulary)}

    # create the bag of words matrix in shape of
    # [number of documents, size of vocabulary]
    vocab_size = len(vocabulary)
    bow_matrix = [[0] * vocab_size for _ in range(len(documents))]

    # calculate the frequency for each word in each document
    for doc_idx, doc in enumerate(documents):
        for word in doc:
            if word in word_index:
                word_idx = word_index[word]
                bow_matrix[doc_idx][word_idx] += 1

    # turn the matrix to df for ease display
    df_bow = pd.DataFrame(bow_matrix, columns=word_index.keys())
    
    return df_bow

bow = create_frequency_table(train_data['content_ready'], vocabulary)
# 3 min to 4 min

In [10]:
# example of words occures in the first document 
bow.iloc[0][bow.iloc[0] > 0] 

stephen       1
decline       1
may           1
given         1
12            1
             ..
added         2
security      1
dropping      1
indirectly    1
deposit       1
Name: 0, Length: 231, dtype: int64

## Naive Bayes Algorithm

### Calculate Prior Probabilities
 For each class, calculate the prior probability of the class, which is the number of documents in the class divided by the total number of documents.

In [11]:
# add category class to each document
bow.insert(len(bow.columns), "class_category", train_data['category'].values)

bow

Unnamed: 0,191.15,turner,overbuilding,646.7,barometer,aprjun,943938,3.9375,francophone,scis,...,mt,13d,purchasing,icgs.l,nowcollapsed,soyfood,luxury,82.36,Unnamed: 20,class_category
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,interest
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,money-fx
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unknown
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sun-oil
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,corn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9131,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,earn
9132,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,lumber
9133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,corn
9134,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,grain


In [12]:
def calculate_prior_and_bigdoc(documents, classes) -> (dict, dict):
    # Initialize the count for each category and bigDoc structure
    category_counts = classes.value_counts()
    bigDoc = {}
    
    for document, category in zip(documents, classes):
        # Append document to the correct category in bigDoc
        if category in bigDoc:
            bigDoc[category].extend(document)
        else:
            bigDoc[category] = document
    
    # Calculate the prior probability for each category
    num_documents = len(documents)
    prior = {category: np.log(count / num_documents) for category, count in category_counts.items()}
        
    return prior, bigDoc

log_prior, big_doc = calculate_prior_and_bigdoc(train_data['content_ready'], train_data['category'])

### Calculate Likelihood with Laplace Smoothing
 For each word in your vocabulary, calculate the likelihood of the word given each class. 

In [13]:
# Separate features and target
freq_words = bow.drop('class_category', axis=1)
categories = bow['class_category']

In [14]:
def estimate_likelihood(texts, labels, vocabulary):
    # Initialize counters for each class
    documents = [' '.join(doc) for doc in texts]

    word_counts_per_class = {label: Counter() for label in set(labels)}
    total_words_per_class = {label: 0 for label in set(labels)}
    
    # Count words in each class
    for text, label in zip(documents, labels):
        counts = Counter(text.split())  # Assuming text is preprocessed
        word_counts_per_class[label] += counts
        total_words_per_class[label] += sum(counts.values())
    
    # Number of features (unique words in vocabulary)
    num_features = len(vocabulary)

    # Calculate likelihood with Laplace smoothing
    likelihood = {
        label: {word: np.log((word_counts_per_class[label][word] + 1) / (total_words_per_class[label] + num_features))
                for word in vocabulary} for label in word_counts_per_class
    }

    return likelihood

likelihood = estimate_likelihood(train_data['content_ready'], train_data['category'], vocabulary)


In [15]:
likelihood

{'lin-oil': {'191.15': -10.5947830197326,
  'turner': -10.5947830197326,
  'overbuilding': -10.5947830197326,
  '646.7': -10.5947830197326,
  'barometer': -10.5947830197326,
  'aprjun': -10.5947830197326,
  '943938': -10.5947830197326,
  '3.9375': -10.5947830197326,
  'francophone': -10.5947830197326,
  'scis': -10.5947830197326,
  '3288000': -10.5947830197326,
  'precise': -10.5947830197326,
  'blown': -10.5947830197326,
  '336': -10.5947830197326,
  '119967': -10.5947830197326,
  '700000': -10.5947830197326,
  'cpac': -10.5947830197326,
  'endorsing': -10.5947830197326,
  '109.3': -10.5947830197326,
  'mato': -10.5947830197326,
  'rmuc': -10.5947830197326,
  'carlucci': -10.5947830197326,
  'chiefly': -10.5947830197326,
  '4104000': -10.5947830197326,
  '10012': -10.5947830197326,
  '6805951': -10.5947830197326,
  '398.4': -10.5947830197326,
  'sterling': -10.5947830197326,
  'rich': -10.5947830197326,
  'firedamaged': -10.5947830197326,
  'huashan': -10.5947830197326,
  'goldbacked'

## testing and evaluating

In [16]:
def classify_new_document(document_tokens, vocabulary, prior, log_likelihood):
    # Count the frequencies of words in the document using Counter
    document_vector = Counter(document_tokens)
    
    # Filter out words not in the training vocabulary
    document_vector = {word: freq for word, freq in document_vector.items() if word in vocabulary}
    
    class_posteriors = {}
    
    for class_ in prior:
        # Start with the log prior probability
        class_posteriors[class_] = prior[class_]
        
        # Incrementally update the posterior probability for words in the document
        for word, freq in document_vector.items():
            if word in log_likelihood[class_]:
                class_posteriors[class_] += freq * log_likelihood[class_][word]
    
    # Predict the class with the highest posterior probability
    predicted_class = max(class_posteriors, key=class_posteriors.get)
    
    return predicted_class

predicted_class = test_data['content_ready'].apply(
    lambda doc: classify_new_document(doc, vocabulary, log_prior, likelihood)
)
# 4 min - 5 min 

In [17]:
test_data['predicted_class'] = predicted_class

In [18]:
true_labels = test_data['category'].tolist() 
predicted_labels = test_data['predicted_class'].tolist()

In [19]:
results['naive_bayes_in_hand'] = f1_score(true_labels, predicted_labels, average='macro')

# Naive bayes model, with tf-idf 

In [20]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, train_data['category'])

In [21]:
y_pred = nb_classifier.predict(X_test_tfidf)

In [22]:
results['naive_bayes_with_tfIdf'] = f1_score(test_data['category'], y_pred, average='macro')

# Naive bayes with embeding: word2vac

In [23]:
# Select the best feature and fit the data to Bayes Classifier

def select_best_features_and_fit(x_train, y_train, x_val):
    '''
    return predicted values
    '''
    selector = SelectKBest(f_classif, k = min(10, x_train.shape[1]))
    selector.fit(x_train, y_train)
    x_train_selector = selector.transform(x_train).astype('float32')
    x_val_selector = selector.transform(x_val).astype('float32')
    clf = GaussianNB()
    clf.fit(x_train_selector, y_train)
    y_preds = clf.predict(x_val_selector)
    return y_preds

In [24]:
y_preds = select_best_features_and_fit(x_train_w2v, 
                                       train_data['category'], 
                                       x_test_w2v)

In [25]:
results['naive_bayes_word2vac'] = f1_score(test_data['category'], y_preds, average='macro')

# Naive bayes with Glove

In [26]:
model = GaussianNB()

# Train the model
model.fit(vectorized_train_data, train_data['category'])

In [27]:
# Make predictions
y_pred = model.predict(vectorized_test_data)

In [28]:
results['naive_bayes_glove'] = f1_score(test_data['category'], y_pred, average='macro')

# Random forest with tf-idf

In [59]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=10, random_state=42)
rf_classifier.fit(X_train_tfidf, train_data['category'])

# Make predictions on the testing set
y_pred_rf = rf_classifier.predict(X_test_tfidf) 

In [60]:
# Calculate the F1 score of the Random Forest classifier
f1_rf = f1_score(test_data['category'], y_pred_rf, average='macro')
print("Random Forest F1 Score:", f1_rf)

Random Forest F1 Score: 0.1601978532054457


In [61]:
results["random_forest_tfidf"] = f1_rf

# Random forest with word2vec

In [62]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=10, random_state=42)

# Train the classifier using Word2Vec embeddings
rf_classifier.fit(x_train_w2v, train_data['category']) 

# Make predictions on the testing set
y_pred_rf = rf_classifier.predict(x_test_w2v)


In [63]:
# Calculate the F1 score of the Random Forest classifier
f1_rf = f1_score(test_data['category'], y_pred_rf, average='macro')
print("Random Forest F1 Score:", f1_rf)

Random Forest F1 Score: 0.17076505954331528


In [64]:
results["random_forest_word2vec"] = f1_rf

# Random forest with GloVe

In [65]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=10, random_state=42)

# Train the classifier using GloVe embeddings
rf_classifier.fit(vectorized_train_data, train_data['category'])

# Make predictions on the testing set
y_pred_rf = rf_classifier.predict(vectorized_test_data)

In [66]:
# Calculate the F1 score of the Random Forest classifier
f1_rf = f1_score(test_data['category'], y_pred_rf, average='macro')
print("Random Forest F1 Score:", f1_rf)

Random Forest F1 Score: 0.13331356714931292


In [67]:
results['random_forest_glove'] = f1_rf

# SVM with tf-idf

In [69]:
# Initialize and train the SVM classifier
svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)
svm_classifier.fit(X_train_tfidf, train_data['category'])

# Make predictions on the testing set
y_pred_svm = svm_classifier.predict(X_test_tfidf)

# Calculate the F1 score of the SVM classifier
f1_svm = f1_score(test_data['category'], y_pred_svm, average='macro')
print("SVM F1 Score:", f1_svm)

SVM F1 Score: 0.3266578424152612


In [70]:
results['SVM_tfidf'] = f1_svm

# SVM with word2vec

In [71]:
# Initialize and train the SVM classifier
svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)
svm_classifier.fit(x_train_w2v, train_data['category'])

# Make predictions on the testing set
y_pred_svm = svm_classifier.predict(x_test_w2v)

# Calculate the F1 score of the SVM classifier
f1_svm = f1_score(test_data['category'], y_pred_svm, average='macro')
print("SVM F1 Score:", f1_svm)

SVM F1 Score: 0.2552120297183989


In [72]:
results['SVM_word2vec'] = f1_svm

# SVM with GloVe

In [73]:
# Initialize and train the SVM classifier
svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)
svm_classifier.fit(vectorized_train_data, train_data['category'])

# Make predictions on the testing set
y_pred_svm = svm_classifier.predict(vectorized_test_data)

# Calculate the F1 score of the SVM classifier
f1_svm = f1_score(test_data['category'], y_pred_svm, average='macro')
print("SVM F1 Score:", f1_svm)

SVM F1 Score: 0.28820526018871345


In [74]:
results['SVM_glove'] = f1_svm

# LSTM with tf-idf

In [79]:
label_encoder = LabelEncoder()
encoded_classes = label_encoder.fit_transform(train_data['category'])
encoded_test_classes = label_encoder.transform(test_data['category'])

In [82]:
lstm_word2vec_embedding = Sequential()
lstm_word2vec_embedding.add(LSTM(units=128, input_shape=(x_train_w2v.shape[1], 1)))
lstm_word2vec_embedding.add(Dense(units=91, activation='softmax'))
lstm_word2vec_embedding.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_word2vec_embedding.fit(x_train_w2v, encoded_classes, epochs=10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1df7efbc0d0>

In [83]:
predictions = np.argmax(lstm_word2vec_embedding.predict(x_test_w2v), axis=-1)
f1_lstm = f1_score(encoded_test_classes, predictions, average='macro')
print("Macro F1 Score:", f1_lstm)

Macro F1 Score: 0.1040756439193718


In [84]:
results['LSTM_word2vec'] = f1_lstm

# LSTM with tf-idf

In [85]:
lstm_word2vec_embedding = Sequential()
lstm_word2vec_embedding.add(LSTM(units=128, input_shape=(X_train_tfidf.shape[1], 1)))
lstm_word2vec_embedding.add(Dense(units=91, activation='softmax'))
lstm_word2vec_embedding.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_word2vec_embedding.fit(X_train_tfidf, encoded_classes, epochs=10, batch_size=16)

Epoch 1/10


ValueError: in user code:

    File "c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\input_spec.py", line 235, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_3' (type Sequential).
    
    Input 0 of layer "lstm_3" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 39400)
    
    Call arguments received by layer 'sequential_3' (type Sequential):
      • inputs=SparseTensor(indices=Tensor("DeserializeSparse:0", shape=(None, 2), dtype=int64), values=Tensor("DeserializeSparse:1", shape=(None,), dtype=float32), dense_shape=Tensor("stack:0", shape=(2,), dtype=int64))
      • training=True
      • mask=None


In [None]:
predictions = np.argmax(lstm_word2vec_embedding.predict(X_test_tfidf), axis=-1)
f1_lstm = f1_score(encoded_test_classes, predictions, average='macro')
print("Macro F1 Score:", f1_lstm)

In [None]:
results['LSTM_tfidf'] = f1_lstm

# LSTM with GloVe

In [86]:
lstm_word2vec_embedding = Sequential()
lstm_word2vec_embedding.add(LSTM(units=128, input_shape=(vectorized_train_data.shape[1], 1)))
lstm_word2vec_embedding.add(Dense(units=91, activation='softmax'))
lstm_word2vec_embedding.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_word2vec_embedding.fit(vectorized_train_data, encoded_classes, epochs=10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1df81ce56c0>

In [87]:
predictions = np.argmax(lstm_word2vec_embedding.predict(vectorized_test_data), axis=-1)
f1_lstm = f1_score(encoded_test_classes, predictions, average='macro')
print("Macro F1 Score:", f1_lstm)

Macro F1 Score: 0.08620682245527397


In [88]:
results['LSTM_glove'] = f1_lstm

# Save results

In [89]:
# Write the updated results back to the file
with open('../data/result.json', 'w') as file:
    json.dump(results, file)