# TTDS 2024/25 Lecture 17: Text Classification Practical

Instructor: Björn Ross 20 November 2024

Created by Steve Wilson November 2020, modified by Björn Ross

## Let's build a text classifier!

### 1. Setup

In [1]:
import sklearn
print(sklearn.__version__)

1.5.2


In [2]:
# some prereqs:
import collections

# regular expressions
import re

# for string.punctuation: list of punctuation characters
import string

# import this for storing our BOW format
import scipy
from scipy import sparse

# numpy for more easily storing multidimensional data
import numpy as np

# scikit learn. Contains lots of ML models we can use
# import the library for support vector machines
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import classification_report

**Note:**
* Any package in the Python standard library (https://docs.python.org/3/library/) can be used in the coursework, as can NumPy and SciPy, any function in scikit-learn, etc. You are encouraged to experiment!
* If you are unsure, just ask on Piazza!

### 2. Check the data format

In [3]:
# check out the data (use ! for command line operation)
!cat Tweets.14cat.train | head -5

'cat' is not recognized as an internal or external command,
operable program or batch file.


### 3. Load and preprocess the lab data

In [20]:
# load our data
training_data = open('../data/collections/Tweets.14cat.train',encoding="latin-1").read()
test_data     = open('../data/collections/Tweets.14cat.test',encoding="latin-1").read()
# we will save the testing data for later...

In [21]:
training_data[:10]

'4502931410'

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [22]:
# example of how the tokenization part will work
# q: what important features might this remove?
invalid_chars = re.compile(f'[{string.punctuation}]')
invalid_chars.sub('',"Hello, World! #Tweets").lower().split()

['hello', 'world', 'tweets']

In [23]:
# convert to list of lists: documents containing tokens
# and return the list of categories
# also get the vocabulary
def preprocess_data(data):
    
    chars_to_remove = re.compile(f'[{string.punctuation}]')
    
    documents = []
    categories = []
    vocab = set([])
    
    lines = data.split('\n')
    
    for line in lines:
        # make a dictionary for each document
        # word_id -> count (could also be tf-idf score, etc.)
        line = line.strip()
        if line:
            # split on tabs, we have 3 columns in this tsv format file
            tweet_id, tweet, category = line.split('\t')

            # process the words
            words = chars_to_remove.sub('',tweet).lower().split()
            for word in words:
                vocab.add(word)
            # add the list of words to the documents list
            documents.append(words)
            # add the category to the categories list
            categories.append(category)
            
    return documents, categories, vocab

In [24]:
%time
# ^ see how long this takes
# preprocess the data
preprocessed_training_data, training_categories, train_vocab = preprocess_data(training_data)
preprocessed_test_data, test_categories, test_vocab = preprocess_data(test_data)

print(f"Training Data has {len(preprocessed_training_data)} " +
      f"documents and vocab size of {len(train_vocab)}")
print(f"Test Data has {len(preprocessed_test_data)} " +
      f"documents and vocab size of {len(test_vocab)}")
print(f"There were {len(set(training_categories))} " +
      f"categories in the training data and {len(set(test_categories))} in the test.")

CPU times: total: 0 ns
Wall time: 0 ns
Training Data has 2503 documents and vocab size of 12726
Test Data has 625 documents and vocab size of 4365
There were 14 categories in the training data and 14 in the test.


In [25]:
# check the most common categories in the training data
print(collections.Counter(training_categories).most_common())

[('Gaming', 220), ('Autos & Vehicles', 210), ('Howto & Style', 207), ('Sports', 203), ('Travel & Events', 196), ('Science & Technology', 189), ('Film & Animation', 178), ('Pets & Animals', 177), ('News & Politics', 168), ('Music', 160), ('Entertainment', 159), ('Comedy', 153), ('Education', 142), ('Nonprofits & Activism', 141)]


### 4. Set up mappings for word and category IDs

In [28]:
# convert the vocab to a word id lookup dictionary
# anything not in this will be considered "out of vocabulary" OOV
word2id = {}
for word_id,word in enumerate(train_vocab):
    word2id[word] = word_id
    
# and do the same for the categories
cat2id = {}
for cat_id,cat in enumerate(set(training_categories)):
    cat2id[cat] = cat_id
    
print("The word id for dog is",word2id['dog'])
print("The category id for Pets & Animals is",cat2id['Pets & Animals'])

The word id for dog is 1439
The category id for Pets & Animals is 0


In [29]:
cat2id

{'Pets & Animals': 0,
 'Nonprofits & Activism': 1,
 'Comedy': 2,
 'Film & Animation': 3,
 'Education': 4,
 'Autos & Vehicles': 5,
 'Music': 6,
 'Science & Technology': 7,
 'Travel & Events': 8,
 'Sports': 9,
 'Howto & Style': 10,
 'News & Politics': 11,
 'Entertainment': 12,
 'Gaming': 13}

### 5. Convert data to bag-of-words format

In [30]:
# build a BOW representation of the files: use the scipy 
# data is the preprocessed_data
# word2id maps words to their ids
def convert_to_bow_matrix(preprocessed_data, word2id):
    
    # matrix size is number of docs x vocab size + 1 (for OOV)
    matrix_size = (len(preprocessed_data),len(word2id)+1)
    oov_index = len(word2id)
    # matrix indexed by [doc_id, token_id]
    X = scipy.sparse.dok_matrix(matrix_size)

    # iterate through all documents in the dataset
    for doc_id,doc in enumerate(preprocessed_data):
        for word in doc:
            # default is 0, so just add to the count for this word in this doc
            # if the word is oov, increment the oov_index
            X[doc_id,word2id.get(word,oov_index)] += 1
    
    return X

In [31]:
%%time 
X_train = convert_to_bow_matrix(preprocessed_training_data, word2id)

CPU times: total: 266 ms
Wall time: 409 ms


In [32]:
# check some docs
print("First 3 documents are:",X_train[:3])

First 3 documents are:   (0, 5761)	1.0
  (0, 4313)	1.0
  (0, 10417)	1.0
  (0, 11926)	1.0
  (0, 10719)	1.0
  (0, 1748)	1.0
  (0, 9809)	1.0
  (0, 943)	1.0
  (0, 11930)	1.0
  (0, 3139)	1.0
  (0, 10366)	1.0
  (0, 8733)	1.0
  (0, 7440)	1.0
  (0, 6201)	1.0
  (0, 11635)	1.0
  (1, 5620)	1.0
  (1, 10516)	1.0
  (1, 2134)	1.0
  (1, 11300)	1.0
  (2, 397)	2.0
  (2, 981)	2.0
  (2, 9119)	1.0
  (2, 12604)	1.0
  (2, 6680)	1.0
  (2, 7876)	1.0
  (2, 3597)	1.0
  (2, 1501)	1.0
  (2, 4517)	1.0
  (2, 2097)	1.0
  (2, 7751)	1.0
  (2, 7660)	1.0


In [33]:
y_train = [cat2id[cat] for cat in training_categories]

In [34]:
# check the first 3 categories
print(y_train[:3])

[0, 0, 0]


In [35]:
X_train

<2503x12727 sparse matrix of type '<class 'numpy.float64'>'
	with 32496 stored elements in Dictionary Of Keys format>

### 6. Train an SVM model

In [36]:
# Let's train a model: now that the setup is done, it's a piece of cake!
%time
# instantiate an SVM classification model
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
# you can set various model hyperparamters here
model = sklearn.svm.SVC(C=1000, kernel ="linear")
# then train the model!
model.fit(X_train,y_train)

CPU times: total: 0 ns
Wall time: 0 ns


In [37]:
# make a prediction
sample_text = ['retweet','if','you','are','a','netflix','person']
# create just a single vector as input (as a 1 x V matrix)
sample_x_in = scipy.sparse.dok_matrix((1,len(word2id)+1))
for word in sample_text:
    sample_x_in[0,word2id[word]] += 1

# what does the example document look like?
print(sample_x_in)
prediction = model.predict(sample_x_in)
# what category was predicted?
print("Prediction was:",prediction[0])
# what category was that?
print(cat2id)

  (0, 3394)	1.0
  (0, 7590)	1.0
  (0, 4075)	1.0
  (0, 9385)	1.0
  (0, 8792)	1.0
  (0, 6827)	1.0
  (0, 8769)	1.0
Prediction was: 0
{'Pets & Animals': 0, 'Nonprofits & Activism': 1, 'Comedy': 2, 'Film & Animation': 3, 'Education': 4, 'Autos & Vehicles': 5, 'Music': 6, 'Science & Technology': 7, 'Travel & Events': 8, 'Sports': 9, 'Howto & Style': 10, 'News & Politics': 11, 'Entertainment': 12, 'Gaming': 13}


### 7. Evaluating the model

In [38]:
# evaluate on training data: how well did we fit to the data we trained on?
y_train_predictions = model.predict(X_train)

# now can compute any metrics we care about. Let's quickly do accuracy
def compute_accuracy(predictions, true_values):
    num_correct = 0
    num_total = len(predictions)
    for predicted,true in zip(predictions,true_values):
        if predicted==true:
            num_correct += 1
    return num_correct / num_total

accuracy = compute_accuracy(y_train_predictions,y_train)
print("Accuracy:",accuracy)
# how did we do?

Accuracy: 1.0


Is that a good score? The score can be informative, but it isn't hard to do well on the training data.

### 8. Using the test set

In [39]:
# prepare test data in the same was as training data
X_test = convert_to_bow_matrix(preprocessed_test_data, word2id)
y_test = [cat2id[cat] for cat in test_categories]

In [40]:
# now evaluate on test data: data the model has NOT seen during training time
# make sure you do NOT update the model, only get predictions from it
y_test_predictions = model.predict(X_test)
y_test_predictions

#accuracy = compute_accuracy(y_test_predictions,y_test)
#print("Accuracy:",accuracy)

array([ 0,  3,  9,  5,  2, 11, 13,  0,  3,  1,  8,  4, 11,  3,  5,  0, 12,
        7,  4,  0,  4,  7,  0, 11,  8,  6,  8,  5,  5,  1,  3, 11, 13,  2,
       10,  8,  9,  2,  2,  8,  6,  9,  9,  4,  8,  7,  7,  9,  0,  7,  8,
        7,  8,  0, 10, 13, 12, 12,  6, 10, 10, 10, 10, 10, 13,  0,  1,  6,
        6,  9, 12, 12, 12,  9,  0,  9, 12,  3,  9, 11,  0,  0,  0,  8,  5,
        9, 11, 12, 12,  3,  4, 11,  2,  1,  5, 10, 13,  8, 13,  5,  4,  8,
        8,  3, 10, 11,  0,  1,  8,  9,  2,  9,  2,  7, 12,  5,  5,  5,  5,
       12,  8,  2,  5,  6, 11, 13,  1,  3, 12, 12, 12, 12, 11,  5, 10,  8,
        5,  5,  7,  4,  5,  4,  8,  8,  6,  4, 13, 13,  4, 11,  0,  7,  0,
        0,  5,  3, 12, 10, 11,  3, 12, 13, 13, 13, 11,  0,  9,  9,  9, 11,
        7, 12,  8, 12, 12,  4,  1,  6,  1, 13, 13,  3, 11,  1, 12, 12,  5,
        7,  5,  8,  7, 12,  3, 12,  8,  7, 11, 12,  7,  8, 13, 11,  1, 12,
        1,  6,  6,  5,  2, 11,  0,  8,  7,  6, 12, 12,  1, 12,  0,  1,  5,
       13,  4,  6,  6,  1

In [41]:
cat_names = []
for cat,cid in sorted(cat2id.items(),key=lambda x:x[1]):
    cat_names.append(cat)
print(classification_report(y_test, y_test_predictions, target_names=cat_names))

                       precision    recall  f1-score   support

       Pets & Animals       0.73      0.80      0.77        45
Nonprofits & Activism       0.42      0.45      0.44        38
               Comedy       0.59      0.53      0.56        38
     Film & Animation       0.51      0.41      0.46        46
            Education       0.62      0.49      0.55        41
     Autos & Vehicles       0.82      0.78      0.80        51
                Music       0.57      0.50      0.53        40
 Science & Technology       0.36      0.37      0.37        43
      Travel & Events       0.55      0.57      0.56        54
               Sports       0.59      0.51      0.55        53
        Howto & Style       0.81      0.75      0.78        40
      News & Politics       0.28      0.54      0.37        37
        Entertainment       0.83      0.71      0.77        49
               Gaming       0.62      0.66      0.64        50

             accuracy                           0.58 

In [42]:
# what would a simple baseline be? How about most common category from before (Gaming)?
# we should *definitely* be doing better than this! Otherwise the model is not helping at all
baseline_predictions = [cat2id['Gaming']] * len(y_test)
baseline_accuracy = compute_accuracy(baseline_predictions,y_train)
print("Accuracy:",baseline_accuracy)

Accuracy: 0.0848


In [43]:
# trying a different model...
# how about a random forest classifier?
%time
model = sklearn.ensemble.RandomForestClassifier()
model.fit(X_train,y_train)

y_train_predictions = model.predict(X_train)
print("Train accuracy was:",compute_accuracy(y_train_predictions,y_train))
y_test_predictions = model.predict(X_test)
print("Test accuracy was:",compute_accuracy(y_test_predictions,y_test))

CPU times: total: 0 ns
Wall time: 0 ns
Train accuracy was: 1.0
Test accuracy was: 0.6384


In [44]:
cat_names = []
for cat,cid in sorted(cat2id.items(),key=lambda x:x[1]):
    cat_names.append(cat)
print(classification_report(y_test, y_test_predictions, target_names=cat_names))

                       precision    recall  f1-score   support

       Pets & Animals       0.79      0.84      0.82        45
Nonprofits & Activism       0.76      0.42      0.54        38
               Comedy       0.74      0.61      0.67        38
     Film & Animation       0.53      0.65      0.58        46
            Education       0.72      0.63      0.68        41
     Autos & Vehicles       0.87      0.88      0.87        51
                Music       0.59      0.60      0.59        40
 Science & Technology       0.38      0.37      0.38        43
      Travel & Events       0.47      0.52      0.50        54
               Sports       0.68      0.51      0.58        53
        Howto & Style       0.81      0.75      0.78        40
      News & Politics       0.30      0.70      0.42        37
        Entertainment       0.95      0.71      0.81        49
               Gaming       0.92      0.70      0.80        50

             accuracy                           0.64 

### 9. Other models to try?

* Check out all of the multiclass ready models!  https://scikit-learn.org/stable/modules/multiclass.html
* Define your own features, vary preprocessing steps, model parameters, .....
* Try fine-tuning transformer models for text classification instead of extracting traditional features: https://huggingface.co/docs/transformers/tasks/sequence_classification
* Try generating embeddings with a transformer models, then using them as additional features in your classifier (in addition to your own): https://www.sbert.net/