## Text Classification with Pytorch

In [2]:
import sys, os
import random
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pytorch imports
import torch

print(
    f"Using Pytorch version {torch.__version__}. "
    + f'GPU {"is available :)" if torch.cuda.is_available() else "is not available :("}'
)
import torch.nn as nn
from torchvision import datasets, transforms
import torchmetrics
import torchsummary

# My helper functions for training/evaluating etc.
import torch_training_toolkit as t3

SEED = t3.seed_all()
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Using Pytorch version 2.0.1+cu117. GPU is not available :(


In [4]:
DATASET_BASE_PATH = pathlib.Path(os.getcwd()) / "data" / "sentiment_labelled_sentences"
assert os.path.exists(DATASET_BASE_PATH), f"FATAL: {DATASET_BASE_PATH} - path does not exist!"

dataset_paths = {
    "yelp": DATASET_BASE_PATH / "yelp_labelled.txt",
    "amazon": DATASET_BASE_PATH / "amazon_cells_labelled.txt",
    "imdb": DATASET_BASE_PATH / "imdb_labelled.txt",
}

df_list = []
for source, filepath in dataset_paths.items():
    df = pd.read_csv(str(filepath), names=["sentence", "label"], sep="\t")
    df["source"] = source
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [5]:
# let's see how sklearn's CountVectorizer helps us build a vocab
from sklearn.feature_extraction.text import CountVectorizer

sentences = ["John likes ice cream!", "John hates chocolate."]
vectorizer = CountVectorizer(min_df=0.0, lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [7]:
# this is also called bag-of-words technique
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

Now let us perform the same action on our entire corpus

In [8]:
sentences = df["sentence"].values
labels = df["label"].values
sentences[:5], labels[:5]

(array(['Wow... Loved this place.', 'Crust is not good.', 'Not tasty and the texture was just nasty.', 'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.', 'The selection on the menu was great and so were the prices.'], dtype=object),
 array([1, 0, 0, 1, 1]))

Create the train & test datasets for model training

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.25,
    random_state=SEED,
)
print(
    f"X_train.shape: {X_train.shape} - y_train.shape: {y_train.shape} - "
    f"X_test.shape: {X_test.shape} - y_test.shape: {y_test.shape}"
)

X_train.shape: (2061,) - y_train.shape: (2061,) - X_test.shape: (687,) - y_test.shape: (687,)


In [10]:
# and we will vectorize the data
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)
X_train[0], X_test[0]

(<1x4475 sparse matrix of type '<class 'numpy.int64'>'
 	with 16 stored elements in Compressed Sparse Row format>,
 <1x4475 sparse matrix of type '<class 'numpy.int64'>'
 	with 11 stored elements in Compressed Sparse Row format>)

### Base `LogisticRegression` model

In [11]:
# base classification model
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print(f"LogisticRegression (base model) acc: {score:.3f}")

LogisticRegression (base model) acc: 0.806


### Build a deep-learning model with Pytorch

In [13]:
MODEL_SAVE_PATH = pathlib.Path(os.getcwd()) / "model_states" / "pyt_text_classfication.pt"
input_dim = X_train.shape[1]


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            t3.Linear(input_dim, 10),
            nn.ReLU(),
            t3.Linear(10, 1),
            nn.Sigmoid(),
        )

    def forward(self, inp):
        return self.net(inp)


loss_fn = nn.BCELoss()
metrics_map = {
    "acc": torchmetrics.classification.BinaryAccuracy(),
}
trainer = t3.Trainer(
    loss_fn=loss_fn,
    device=DEVICE,
    metrics_map=metrics_map,
    epochs=100,
    batch_size=64,
)

In [14]:
# train the model
model = Net()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
hist = trainer.fit(
    model,
    optimizer,
    train_dataset=(X_train, y_train),
)
hist.plot_metrics(
    title="Model Performance",
    fig_size=(16, 8),
)
t3.save_model(model, MODEL_SAVE_PATH)
del model

TypeError: expected np.ndarray (got csr_matrix)