In [47]:
import pandas as pd
from textblob import TextBlob
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
tqdm.pandas()
from statistics import mean
from sklearn.utils import resample
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
import scipy.sparse as sp

In [4]:
downsampled = pd.read_csv('./downsampled_fixed_spelling.csv')

In [68]:
%%time
#attempt a neural network approach
np.random.seed(10)
total_docs = 1500
X = downsampled['review_text']
y = downsampled['rating']-1#because nn

def track_progress(progress):
    return tqdm(progress, total=total_docs, desc="Vectorizing text data")

#only the 1500 most frequent tokens are used
tfidfconverter = CountVectorizer(max_features=1500, min_df=5, max_df=0.7)
vectorized_data = tfidfconverter.fit_transform(track_progress(X))

X_train, X_test, y_train, y_test = train_test_split(vectorized_data, y, test_size=0.2, random_state=0)


Vectorizing text data: 15000it [00:00, 31369.54it/s]           


CPU times: total: 359 ms
Wall time: 549 ms


In [69]:
%%time

class CNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, hidden_dim, output_dim, dropout):
        super().__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Convolutional layers
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(fs, embedding_dim)) 
            for fs in filter_sizes
        ])
        
        # Fully-connected layers
        self.fc = nn.Linear(len(filter_sizes) * num_filters, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # text shape: [batch_size, seq_len]
        embedded = self.embedding(text)
        # embedded shape: [batch_size, seq_len, embedding_dim]
        
        # Add channel dimension for convolution
        embedded = embedded.unsqueeze(1)
        # embedded shape: [batch_size, 1, seq_len, embedding_dim]
        
        # Apply convolutional layers
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # conved[i] shape: [batch_size, num_filters, seq_len - filter_sizes[i] + 1]
        
        # Apply max pooling
        pooled = [nn.functional.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        # pooled[i] shape: [batch_size, num_filters]
        
        # Concatenate pooled features
        cat = self.dropout(torch.cat(pooled, dim=1))
        # cat shape: [batch_size, len(filter_sizes) * num_filters]
        
        # Apply fully-connected layers
        hidden = nn.functional.relu(self.fc(cat))
        # hidden shape: [batch_size, hidden_dim]
        
        # Apply output layer
        output = self.output(hidden)
        # output shape: [batch_size, output_dim]
        
        return output

# Define hyperparameters
vocab_size = 10000
embedding_dim = 100
num_filters = 100
filter_sizes = [3, 4, 5]
hidden_dim = 256
output_dim = 5
dropout = 0.5
num_epochs =5
# Initialize model
model = CNNClassifier(vocab_size, embedding_dim, num_filters, filter_sizes, hidden_dim, output_dim, dropout)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# dataset = sp.hstack([X_train, sp.coo_matrix(y_train.to_numpy()).T]).toarray()
inputs = torch.tensor(X_train.toarray())
targets = torch.tensor(y_train.values)
dataset = TensorDataset(inputs, targets)
train_loader = DataLoader(dataset)

# Train model
for epoch in range(num_epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(data)
        loss = criterion(output, target)

        # Backward pass
        loss.backward()
        optimizer.step()
    print(f"epoch: {epoch}")

<torch.utils.data.dataloader.DataLoader object at 0x000001BE02C14A90>
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
CPU times: total: 3h 5min 39s
Wall time: 25min 34s


In [70]:
outputs = model(torch.LongTensor(X_test.todense()))
_, y_pred = torch.max(outputs.data, 1)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.59      0.24      0.34      1388
           1       0.01      0.71      0.02         7
           2       0.03      0.14      0.06       143
           3       0.00      0.00      0.00         2
           4       0.64      0.27      0.38      1460

    accuracy                           0.25      3000
   macro avg       0.25      0.27      0.16      3000
weighted avg       0.58      0.25      0.35      3000

