## Loading the dataset

In [None]:
import pandas as pd
# Read the raw CSV; the code below relies on its "text" and "category" columns
dataset_raw = pd.read_csv('bbc-text.csv', encoding = 'unicode_escape')

# Turn the raw table into an array of instances:
# one (text, category) tuple per row, in file order
dataset_full = list(zip(dataset_raw["text"], dataset_raw["category"]))

print("Done")

## Splitting the dataset

In [None]:
import random

# Randomly split a dataset into train / dev / test subsets
# input: dataset (array of instances)
# output: train, dev, test sets (array of instances)
#         sizes: int(80%) for train, int(10%) for dev, whatever remains for test
def get_split_dataset(dataset):
    total = len(dataset)
    n_train = int(total * 0.8)
    n_dev = int(total * 0.1)
    n_test = total - n_train - n_dev

    # One slot marker per instance: 0 -> train, 1 -> dev, 2 -> test
    assignment = [0] * n_train + [1] * n_dev + [2] * n_test

    # Shuffling the markers decides which instance lands in which subset
    random.shuffle(assignment)

    train_set, dev_set, test_set = [], [], []
    buckets = (train_set, dev_set, test_set)
    for position in range(total):
        buckets[assignment[position]].append(dataset[position])

    # Shuffle each subset so the original dataset order is not preserved within it
    for bucket in buckets:
        random.shuffle(bucket)

    return train_set, dev_set, test_set

# Randomize the order of the dataset
# (get_split_dataset also shuffles internally, so this is an extra shuffle)
random.shuffle(dataset_full)
train_set, dev_set, test_set = get_split_dataset(dataset_full)

# Sanity check: size of the full dataset, then of the train, dev and test sets
print(len(dataset_full))
print(len(train_set))
print(len(dev_set))
print(len(test_set))

print("Done")

## Pre-processing

In [None]:
import numpy as np
import nltk
import sklearn
import operator
import requests

# Shared lemmatizer used by get_list_tokens
lemmatizer = nltk.stem.WordNetLemmatizer()

# NLTK's English stopword list, extended with punctuation tokens that should
# also be dropped during pre-processing
# NOTE: the NLTK stopwords corpus must already be available locally
stopwords = set(nltk.corpus.stopwords.words('english')) | {".", ",", "--", "``"}

# Function adapted from Session 1 of the Practicals
# Transforms a document into an array of tokens
# Input: document (string)
# Output: list of lowercased, lemmatized tokens, in document order
def get_list_tokens(document):
    list_tokens = []
    # Split the document into sentences, then each sentence into words
    for sentence in nltk.tokenize.sent_tokenize(document):
        for token in nltk.tokenize.word_tokenize(sentence):
            # Lowercase BEFORE lemmatizing: the WordNet lookup is case-sensitive,
            # so a capitalized token such as "Cats" would otherwise be returned
            # unchanged and end up as "cats" instead of "cat"
            list_tokens.append(lemmatizer.lemmatize(token.lower()))
    return list_tokens

# Function slightly modified from Session 2 of the Practicals
# Process a dataset's inputs
# Input: dataset (array of instances; instance[0] is the raw text, instance[1] the label)
# Output: array of [processed text, label] pairs, with each text being
#         pre-processed (lemmatized, lowercased, stopwords taken out)
def pre_process(dataset):
    # Delegate the per-document work to process_doc so that whole datasets and
    # single inputs are guaranteed to be pre-processed in exactly the same way
    return [[process_doc(instance[0]), instance[1]] for instance in dataset]

# Pre-process one raw input (the single-document counterpart of pre_process)
# Input: data input in its original form
# Output: pre-processed input: the tokens from get_list_tokens, minus
#         stopwords, joined with single spaces
def process_doc(instance_X):
    kept_words = [word for word in get_list_tokens(instance_X) if word not in stopwords]
    return " ".join(kept_words)

print("Done")

In [None]:
# Pre-process the training set
# (the result is used further down to fit the tf-idf vectorizer)
processed_train_set = pre_process(train_set)

print("Done")

In [None]:
# See the first instance
# (raw version first, then its pre-processed counterpart, each as a one-element list)
print(train_set[:1])
print(processed_train_set[:1])

print("Done")

## Tf-Idf vectorizing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a tf-idf vectorizer fitted on the texts of a training set
# Input: max_features (maximum number of features for this vectorizer),
#        train_set (array of instances; instance[0] is the text to fit on)
# Output: fitted tf-idf vectorizer
def get_tfidf_vectorizer(max_features, train_set):
    # Only the inputs are needed for fitting; the labels are ignored
    documents = [instance[0] for instance in train_set]

    vectorizer = TfidfVectorizer(max_features = max_features)
    vectorizer.fit(documents)
    return vectorizer

print("Done")

## Word2Vec

In [None]:
import gensim
from gensim.models import Word2Vec
import gensim.downloader as api

# Length of the w2vec part of the feature vector built in get_vector
W2VEC_NUM_FEATURES = 300

# Loads the 'word2vec-google-news-300' model through the gensim downloader API
# Output: the loaded w2vec model
def get_w2vec_vectorizer():
    model_name = 'word2vec-google-news-300'
    return api.load(model_name)

print("Done")

In [None]:
# Load the w2vec model once; get_vector reads it through this module-level name
w2vec = get_w2vec_vectorizer()

print("Done")

In [None]:
# Make sure the dimensions of w2vec embeddings are correct
# (prints the length of one embedding; it is expected to equal W2VEC_NUM_FEATURES)
print(len(w2vec['movie']))

print("Done")

## Pos_tag

In [None]:
# Pos tags for the nltk tagger with the universal tagset
# The position of a tag in this list is the position of its normalized count
# in the pos-tag part of the vector built by get_vector
pos_tags = [
    "ADJ",
    "ADP",
    "ADV",
    "CONJ",
    "DET",
    "NOUN",
    "NUM",
    "PRT",
    "PRON",
    "VERB",
    ".",
    "X"
]

print("Done")

In [None]:
# Download the NLTK resources needed to call nltk.pos_tag with
# tagset = 'universal' and lang = 'eng' (as done in get_vector)
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('universal_tagset')

# Output: the nltk.pos_tag function itself (it is returned, not called)
def get_pos_tagger():
    return nltk.pos_tag

print("Done")

In [None]:
# Get an instance of the pos tagger
# (passed explicitly to get_vector as its pos_tagger argument)
pos_tagger = get_pos_tagger()
print("Done")

In [None]:
# Given an input, process it into a vector
# Input: document input (raw text), tfidf vectorizer (already fitted), pos_tagger
# Output: 1-D numpy array made of, in this order:
#         - the tf-idf vector of the pre-processed document
#         - the average of the w2vec embeddings of its pre-processed tokens
#           (tokens unknown to w2vec count as zero vectors)
#         - the relative frequency of each tag listed in pos_tags
# Reads the module-level names pos_tags, W2VEC_NUM_FEATURES, w2vec and process_doc
def get_vector(instance_X, tfidf_vectorizer, pos_tagger):
    # get the pos_tag vector first, before pre-processing the input
    tags = pos_tagger(nltk.tokenize.word_tokenize(instance_X), tagset = 'universal', lang = 'eng')

    # Count the tags in a single pass over the document
    tag_positions = {tag: position for position, tag in enumerate(pos_tags)}
    pos_tag_vec = np.zeros(len(pos_tags))
    for _, word_tag in tags:
        position = tag_positions.get(word_tag)
        if position is not None:
            pos_tag_vec[position] += 1

    # Normalize the tag counts; a document without any tagged token keeps an
    # all-zero vector instead of turning into NaNs through a division by zero
    total_pos_tags = pos_tag_vec.sum()
    if total_pos_tags > 0:
        pos_tag_vec /= total_pos_tags

    # Pre-process the input for the tf-idf and w2vec representations
    processed_instance = process_doc(instance_X)
    # toarray() yields a plain ndarray, so no np.matrix-only attribute is needed
    tf_vec = tfidf_vectorizer.transform([processed_instance]).toarray().ravel()

    word_list = nltk.tokenize.word_tokenize(processed_instance)
    w2_vec = np.zeros(W2VEC_NUM_FEATURES)
    # Sum the w2vec vectors; out-of-vocabulary words contribute nothing
    for word in word_list:
        if word in w2vec.key_to_index:
            w2_vec += w2vec[word]
    # Average over ALL tokens (known or not); skip the division for an empty
    # document so the embedding stays all-zero rather than NaN
    if word_list:
        w2_vec /= len(word_list)

    # Combine the 3 representations
    return np.concatenate([tf_vec, w2_vec, pos_tag_vec])

print("Done")

## Vectorization

In [None]:
# Fit the tf-idf vectorizer (at most 1000 features) on the pre-processed training set
tfidf_vec = get_tfidf_vectorizer(1000, processed_train_set)

print("Done")

In [None]:
from sklearn.feature_selection import SelectKBest

# Vectorize the raw training inputs (get_vector does its own pre-processing)
# and collect the matching labels in the same order
X_vec_train = [get_vector(instance[0], tfidf_vec, pos_tagger) for instance in train_set]
Y_train = [instance[1] for instance in train_set]

# Length of one combined feature vector
print(len(X_vec_train[0]))

## Feature selection

In [None]:
# Input: num_features (how many features the selector should keep)
# Output: Feature selector that keeps num_features, fitted on the
#         module-level training data (X_vec_train, Y_train)
def feature_selection(num_features):
    selector = SelectKBest(k = num_features)
    selector.fit(X_vec_train, Y_train)
    return selector

print("Done")

## Model Training

In [None]:
# Process the dev and test sets
# Both are vectorized with the tf-idf vectorizer fitted on the training set

X_vec_test = [get_vector(instance[0], tfidf_vec, pos_tagger) for instance in test_set]
Y_test = [instance[1] for instance in test_set]

X_vec_dev = [get_vector(instance[0], tfidf_vec, pos_tagger) for instance in dev_set]
Y_dev = [instance[1] for instance in dev_set]

print("Done")

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# Candidate numbers of features to keep after feature selection
list_num_features = [300, 500, 750, 1000]

# Best candidate seen so far. The best score starts below any reachable
# accuracy so the first candidate is always recorded, even if it scores 0.0;
# this guarantees the *_bst names hold real objects after the loop.
log_reg_clf_bst = None
accuracy_score_bst = -1.0
feature_selector_bst = None
num_features_bst = None

# Train and tune the model

for num_features in list_num_features:
    # Fit the selector and the classifier on the training set only
    feature_selector = feature_selection(num_features)
    X_train_new = feature_selector.transform(X_vec_train)
    log_reg_clf = SGDClassifier(loss = 'log_loss').fit(X_train_new, Y_train)

    # Score this candidate on the dev set
    X_dev_new = feature_selector.transform(X_vec_dev)
    Y_dev_pred = log_reg_clf.predict(X_dev_new)
    accuracy = accuracy_score(Y_dev, Y_dev_pred)

    # Strict comparison: on a tie the earlier (smaller) feature count is kept
    if accuracy > accuracy_score_bst:
        log_reg_clf_bst = log_reg_clf
        accuracy_score_bst = accuracy
        feature_selector_bst = feature_selector
        num_features_bst = num_features

print("Done")

In [None]:
from sklearn.metrics import classification_report

# Use the tuned model to test
# Apply the feature selector that scored best on the dev set to the test vectors
X_test_new = feature_selector_bst.transform(X_vec_test)

Y_test_pred = log_reg_clf_bst.predict(X_test_new)


# Number of features kept by the selected configuration
print(num_features_bst)

# Classification report comparing the test labels with the predictions
print(classification_report(Y_test, Y_test_pred))