Initial preprocessing

In [None]:
# ----- PRE PROCESSING -----

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import json
import pandas as pd

# Make sure the NLTK resources used below are available locally:
# 'punkt_tab' for nltk.word_tokenize and 'stopwords' for the stop-word list.
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource)

# Porter stemmer instance shared by every get_tokens call
stemmer = PorterStemmer()
# Translation table for str.translate: every ASCII punctuation code point maps
# to None, which deletes that character from the string.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}
def get_tokens(text):
  """Normalize a document into a list of stemmed unigram tokens.

  Steps, in order: lowercase the text, delete the characters listed in
  ``remove_punctuation_map``, tokenize with ``nltk.word_tokenize``, drop
  English stop words, and stem what is left with ``stemmer``.

  Args:
    text: The document as a ``str``.

  Returns:
    list[str]: The stemmed tokens, in their original order.
  """
  # turn document into lowercase
  lowers = text.lower()
  # remove punctuations
  no_punctuation = lowers.translate(remove_punctuation_map)
  # tokenize document
  tokens = nltk.word_tokenize(no_punctuation)
  # Build the stop-word set once per call. Calling stopwords.words('english')
  # inside the comprehension rebuilt the list for every single token and then
  # scanned it linearly; a set gives constant-time membership checks.
  stop_words = set(stopwords.words('english'))
  filtered = [w for w in tokens if w not in stop_words]
  # stemming process -> final unigrams
  return [stemmer.stem(item) for item in filtered]

# Read the training articles; the first CSV row provides the column names.
traningData = pd.read_csv('24_train_3.csv', header='infer')
# Use HW 0 token filtering step: cast each article to str, tokenize/stem it,
# then glue the tokens back into one space-separated string.
traningData['Processed_Text'] = (
    traningData['Text']
    .apply(str)
    .apply(get_tokens)
    .apply(' '.join)
)
traningData

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nsano\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nsano\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,ArticleId,Text,Category,Processed_Text
0,1429,sfa awaits report over mikoliunas the scottish...,sport,sfa await report mikoliuna scottish footbal as...
1,1896,parmalat to return to stockmarket parmalat th...,business,parmalat return stockmarket parmalat italian d...
2,1633,edu blasts arsenal arsenal s brazilian midfiel...,sport,edu blast arsen arsen brazilian midfield edu h...
3,2178,henman decides to quit davis cup tim henman ha...,sport,henman decid quit davi cup tim henman retir gr...
4,194,french suitor holds lse meeting european stock...,business,french suitor hold lse meet european stock mar...
...,...,...,...,...
995,1250,blair damaged by blunkett row a majority of ...,politics,blair damag blunkett row major voter 68 believ...
996,1639,a november to remember last saturday one news...,sport,novemb rememb last saturday one newspap procla...
997,916,highbury tunnel players in clear the football ...,sport,highburi tunnel player clear footbal associ sa...
998,2217,top stars join us tsunami tv show brad pitt r...,entertainment,top star join us tsunami tv show brad pitt rob...


Feature extraction methods

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch

# ----------- TfidfVectorizer method - convert to number - use TFIDF value -----------
# Fit a TF-IDF vectorizer (default settings) on the processed articles and
# transform them in one step; X_tfidf has one row per article.
tfidf_vectorizer = TfidfVectorizer() # like hw 1
X_tfidf = tfidf_vectorizer.fit_transform(traningData['Processed_Text'])

In [None]:
# ----------- Bigram Unigram method - convert words into matrix of word counts -----------
# ngram_range=(1, 2) makes the vocabulary contain both single tokens and
# adjacent token pairs; X_count holds their raw counts per article.
count_vectorizer = CountVectorizer(ngram_range=(1, 2))  # shape = 1000, 181043 -- 181043 unique words and bigrams for all text
X_count = count_vectorizer.fit_transform(traningData['Processed_Text'])

In [None]:
# ----------- Bert method - convert words into vector -----------
# Load the pretrained 'bert-base-uncased' tokenizer and model once; both are
# module-level objects reused by get_bert_embedding for every article.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
def get_bert_embedding(text):
    """Embed one text as the mean of BERT's last-layer token vectors.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``bert_model`` without gradient tracking, and the last hidden state is
    averaged over the token dimension.

    Args:
        text: The text to embed.

    Returns:
        numpy.ndarray: The pooled embedding with singleton dimensions removed.
    """
    encoded = bert_tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        model_output = bert_model(**encoded)
    token_states = model_output.last_hidden_state
    # Average over dim=1 (the token axis), then drop the size-1 batch axis.
    pooled = token_states.mean(dim=1)
    return pooled.squeeze().numpy()

# One BERT embedding per processed article, stacked into a 2-D array
# (rows = articles).
X_bert_list = [get_bert_embedding(document) for document in traningData['Processed_Text']]
X_bert = np.array(X_bert_list)
print(pd.DataFrame(X_bert))

#  ----------- GLoVe vector method - convert words into vector -----------
# Maps each word to its embedding vector (a float32 numpy array).
glove_vectors = {}
# Load the GloVe vectors: every line is "<word> <v1> <v2> ... <vN>".
# The encoding is passed explicitly because open() otherwise uses the platform
# locale (e.g. cp1252 on Windows), which fails on the file's non-ASCII tokens.
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of raising IndexError on values[0]
            continue
        word = values[0]
        vector = np.array(values[1:], dtype='float32')  # everything after the word is the vector
        glove_vectors[word] = vector

def get_glove_embedding(text):
    """Embed a text as the mean of the GloVe vectors of its tokens.

    The text is tokenized with ``get_tokens``; tokens missing from
    ``glove_vectors`` are ignored.

    Args:
        text: The text to embed.

    Returns:
        numpy.ndarray: The element-wise mean of the matched vectors. If no
        token is found in ``glove_vectors``, a zero vector of the embedding
        dimension is returned, so every result has the same shape and can be
        stacked into one matrix (``np.mean`` of an empty list would yield a
        scalar NaN instead).
    """
    # NOTE(review): get_tokens stems its output; stemmed forms are presumably
    # often absent from the GloVe vocabulary -- confirm this is intended.
    words = get_tokens(text)
    vectors = [glove_vectors[word] for word in words if word in glove_vectors]
    if not vectors:
        # Infer the dimension from any stored vector (0 if the table is empty).
        dim = len(next(iter(glove_vectors.values()), ()))
        return np.zeros(dim, dtype='float32')
    return np.mean(vectors, axis=0)

# One GloVe embedding per article, stacked into a 2-D array (rows = articles).
# NOTE(review): 'Processed_Text' has already been through get_tokens, and
# get_glove_embedding runs get_tokens again, so each article is tokenized and
# stemmed twice -- confirm whether the raw 'Text' column was intended here.
X_glove_list = [get_glove_embedding(document) for document in traningData['Processed_Text']]
X_glove = np.array(X_glove_list)
print(pd.DataFrame(X_glove))



Train NN

In [None]:

# Encode labels into numbers
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset


# Map each category string to an integer class id; label_encoder is kept so
# the ids can be translated back to names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(traningData['Category'])

# Convert data to tensors
# The sparse TF-IDF matrix is densified first; features are float32 and the
# targets are int64 (torch.long), the dtype nn.CrossEntropyLoss expects for
# class-index targets.
X_tensor = torch.tensor(X_tfidf.toarray(), dtype=torch.float32)
y_tensor = torch.tensor(y_encoded, dtype=torch.long)

# Define Neural Network Model
class NewsClassifier(nn.Module):
    """Feed-forward classifier: two ReLU hidden layers and a linear output layer.

    Args:
        input_size: Number of input features per example.
        hidden_size: Width of both hidden layers. Defaults to 128.
        num_classes: Number of output logits. Defaults to 5 (the news
            categories used in this notebook).
    """

    def __init__(self, input_size, hidden_size=128, num_classes=5):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw class logits for a batch ``x`` of feature rows."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        # No softmax needed, CrossEntropyLoss does it
        return self.output(hidden)


Training with learning rate 0.0001
Avg Validation Accuracy for lr 0.0001: 0.2340
Training with learning rate 0.0003
Avg Validation Accuracy for lr 0.0003: 0.3810
Training with learning rate 0.001
Avg Validation Accuracy for lr 0.001: 0.6390
Training with learning rate 0.003
Avg Validation Accuracy for lr 0.003: 0.8720
Training with learning rate 0.01
Avg Validation Accuracy for lr 0.01: 0.9620
Training with learning rate 0.03
Avg Validation Accuracy for lr 0.03: 0.8820
Training with learning rate 0.1
Avg Validation Accuracy for lr 0.1: 0.9150


In [None]:

# K-Fold Cross-Validation: for each learning rate, train a fresh model on every
# fold and report the mean validation accuracy.
# NOTE(review): X_tensor comes from a TF-IDF vectorizer fitted on all rows, so
# validation folds influenced the vocabulary/IDF weights -- confirm acceptable.
CV_SEED = 42
kf = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)

# Loop-invariant settings, created once instead of once per fold.
criterion = nn.CrossEntropyLoss()
epochs = 10  # Can increase for better performance

for lr in [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1]:  # Test different learning rates
    print(f"Training with learning rate {lr}")
    accuracies = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_tensor)):
        X_train, X_val = X_tensor[train_idx], X_tensor[val_idx]
        y_train, y_val = y_tensor[train_idx], y_tensor[val_idx]

        # Seed before building the model so weight initialisation is
        # reproducible and every learning rate starts each fold from the same
        # initial weights (otherwise the comparison mixes lr and init noise).
        torch.manual_seed(CV_SEED + fold)
        model = NewsClassifier(input_size=X_tensor.shape[1])
        optimizer = optim.Adam(model.parameters(), lr=lr)  # Change to SGD or RMSprop to test

        # Full-batch training: one optimizer step per epoch on the whole fold.
        model.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            output = model(X_train)
            loss = criterion(output, y_train)
            loss.backward()
            optimizer.step()

        # Evaluate model
        model.eval()
        with torch.no_grad():
            val_preds = model(X_val).argmax(dim=1)
            accuracy = (val_preds == y_val).float().mean().item()
            accuracies.append(accuracy)

    print(f"Avg Validation Accuracy for lr {lr}: {sum(accuracies)/len(accuracies):.4f}")