In [3]:
import torch

In [4]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd

In [5]:
# Load the labelled text dataset from a CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run on another machine without editing it — consider a
# path relative to a configurable data directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# presumably is a row index saved into the CSV; `index_col=0` would drop it —
# confirm before changing.
file_path =r"D:\Natural Language Processing\LSTM\data\binary_text_classification_dataset.csv"
data = pd.read_csv(file_path)

In [6]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,Text,Label
0,Terrible service and rude staff.,0
1,Highly recommend this to everyone!,1
2,I regret buying this.,0
3,Fantastic service and friendly staff.,1
4,Highly recommend this to everyone!,1


In [8]:
# Pull the raw sentences and their labels out of the DataFrame as NumPy arrays.
texts = data["Text"].values
labels = data["Label"].values

# Show the first ten of each as a quick sanity check.
texts[:10], labels[:10]

(array(['Terrible service and rude staff.',
        'Highly recommend this to everyone!', 'I regret buying this.',
        'Fantastic service and friendly staff.',
        'Highly recommend this to everyone!',
        'Not worth the money at all.',
        'Awful experience, never buying again.',
        'Not worth the money at all.',
        "I love this product, it's amazing!",
        'Very happy with the quality of the product.'], dtype=object),
 array([0, 1, 0, 1, 1, 0, 0, 0, 1, 1], dtype=int64))

In [11]:
# Bag-of-words features: one column per vocabulary word (at most 1000 words),
# holding that word's count in each sentence, converted to a dense array.
# NOTE(review): the vocabulary is fitted on every sentence, before the
# train/test split performed later, so test-set words influence the features.
vectorizer = CountVectorizer(max_features=1000)
x = vectorizer.fit_transform(texts).toarray()

In [16]:
# (number of sentences, number of vocabulary columns) of the count matrix.
x.shape

(500, 69)

In [17]:
# Inspect the count vector produced for the first sentence.
x[0]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int64)

In [18]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(x, labels, test_size=0.2, random_state=42)

# Confirm the training features and labels have matching row counts.
X_train.shape, y_train.shape

((400, 69), (400,))

In [19]:
class TextDataset(Dataset):
    """Map-style dataset pairing feature rows with their labels.

    Both ``inputs`` and ``targets`` are converted to ``float32`` tensors once,
    at construction time, so indexing only slices tensors that already exist.
    """

    def __init__(self, inputs, targets):
        # Convert features and labels the same way in a single pass.
        self.inputs, self.targets = (
            torch.tensor(array, dtype=torch.float32) for array in (inputs, targets)
        )

    def __len__(self):
        """Return the number of samples (length of the feature tensor)."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample, label = self.inputs[idx], self.targets[idx]
        return sample, label
    
# Wrap each split in a TextDataset.
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

# Batch in groups of 32; only the training loader is shuffled, so evaluation
# visits the test rows in a fixed order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [20]:
# Display the dataset object (TextDataset defines no custom repr, so this only
# shows the default object representation).
train_dataset

<__main__.TextDataset at 0x1798bf2c890>

In [21]:
class BinaryClassifier(torch.nn.Module):
    """Feed-forward network that outputs one probability per input row.

    Architecture: ``input_dim -> 128 -> 1`` with a ReLU after the hidden layer
    and a sigmoid on the output, so every value lies between 0 and 1.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_dim = 128
        layers = [
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 1),
            torch.nn.Sigmoid(),
        ]
        # Stored under the attribute name ``fc`` so parameter names
        # (``fc.0.weight`` ...) stay the same.
        self.fc = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack.

        For a ``(batch, input_dim)`` input the result has shape ``(batch, 1)``.
        """
        probabilities = self.fc(x)
        return probabilities
    
# Seed PyTorch's global RNG before any weights are created so that the layer
# initialisation — and the shuffling order later drawn by the training
# DataLoader — is the same on every Restart & Run All. The value matches the
# random_state used for the train/test split.
torch.manual_seed(42)

# The network's input width is the number of bag-of-words columns.
input_dim = X_train.shape[1]
model = BinaryClassifier(input_dim)

In [22]:
# Show the layer layout to confirm the input width matches the feature count.
model

BinaryClassifier(
  (fc): Sequential(
    (0): Linear(in_features=69, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=1, bias=True)
    (3): Sigmoid()
  )
)

In [24]:
# Binary cross-entropy on probabilities — pairs with the model's final Sigmoid.
criterion = torch.nn.BCELoss()
# Adam over all model parameters with a learning rate of 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Number of full passes over the training data.
epochs = 10

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
# Train for `epochs` full passes over the training set, reporting the mean
# per-batch loss after each pass.
for epoch in range(epochs):
    model.train()  # make sure training-mode behaviour is active
    epoch_loss = 0.0

    for inputs, targets in train_loader:
        # Drop only the trailing size-1 axis: (batch, 1) -> (batch,).
        # A bare .squeeze() would also remove the batch axis when a batch holds
        # a single sample, and BCELoss would then reject the shape mismatch
        # against the (1,)-shaped targets.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs} - Loss: {epoch_loss/len(train_loader):.4f}")

Epoch 1/10 - Loss: 0.3991
Epoch 2/10 - Loss: 0.0257
Epoch 3/10 - Loss: 0.0012
Epoch 4/10 - Loss: 0.0003
Epoch 5/10 - Loss: 0.0001
Epoch 6/10 - Loss: 0.0001
Epoch 7/10 - Loss: 0.0001
Epoch 8/10 - Loss: 0.0001
Epoch 9/10 - Loss: 0.0001
Epoch 10/10 - Loss: 0.0001


In [26]:
# Switch the model to evaluation mode before measuring accuracy; the cell
# output is just the module itself, which eval() returns.
model.eval()

BinaryClassifier(
  (fc): Sequential(
    (0): Linear(in_features=69, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=1, bias=True)
    (3): Sigmoid()
  )
)

In [None]:
# Measure accuracy on the held-out test set.
# NOTE(review): the dataset preview contains repeated sentences, so identical
# texts can land in both splits — treat a perfect score with caution.
correct = 0
total = 0

with torch.no_grad():  # gradients are not needed for evaluation
    for inputs, targets in test_loader:
        # squeeze(-1) keeps the batch axis even for a final batch of one sample.
        outputs = model(inputs).squeeze(-1)
        predicted = (outputs >= 0.5).float()  # threshold probabilities at 0.5
        # .item() keeps `correct` a plain Python int instead of a 0-d tensor,
        # so `accuracy` below is an ordinary float.
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

accuracy = correct / total
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [31]:
def predict_text(texts, model, vectorizer):
    """Classify a batch of raw texts with a trained model.

    Args:
        texts: Iterable of strings to classify.
        model: Module mapping a ``(n, n_features)`` float tensor to ``(n, 1)``
            probabilities.
        vectorizer: Fitted vectorizer whose ``transform(texts)`` result exposes
            ``toarray()``.

    Returns:
        Tuple ``(predicted, probabilities)`` of 1-D NumPy arrays with one entry
        per input text; ``predicted`` is 1.0 where the probability is >= 0.5
        and 0.0 otherwise.
    """
    processed_texts = vectorizer.transform(texts).toarray()
    inputs = torch.tensor(processed_texts, dtype=torch.float32)

    model.eval()  # note: the model is left in eval mode afterwards

    with torch.no_grad():
        # Remove only the trailing size-1 axis so that a single text still
        # yields length-1 arrays rather than 0-d scalars that callers cannot
        # iterate or zip over.
        outputs = model(inputs).squeeze(-1)
        predicted = (outputs >= 0.5).float()

    return predicted.numpy(), outputs.numpy()
    

In [32]:
# Hand-written example sentences for a manual check of the classifier.
new_texts = [
    "I absolutely love this product, it's fantastic!",
    "This is the most terrible experience I've ever had.",
    "The quality is okay, but not the best.",
    "Highly satisfied with the service!"
]

In [33]:
# Get the thresholded 0/1 predictions and the underlying probabilities.
predicted, probability = predict_text(new_texts, model, vectorizer)

In [34]:
# Print one line per sentence: the text, its label, and the model probability.
for text, pred, prob in zip(new_texts, predicted, probability):
    if pred == 1:
        label = "positive"
    else:
        label = "negative"

    print(f"{text} : {label} : {prob:.4f}")

I absolutely love this product, it's fantastic! : positive : 1.0000
This is the most terrible experience I've ever had. : negative : 0.0000
The quality is okay, but not the best. : negative : 0.0021
Highly satisfied with the service! : positive : 0.9958


In [36]:
# Walk through a single prediction step by step, starting from one sentence.
input_query = "I love this item very much."

# Vectorize with the already-fitted vocabulary; transform takes an iterable of
# texts, hence the one-element list.
bow = vectorizer.transform([input_query]).toarray()

bow

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0]], dtype=int64)

In [38]:
# Convert the count vector to float32, the dtype used for the training inputs.
input_tensor = torch.tensor(bow, dtype=torch.float32)

input_tensor

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [41]:
# NOTE: despite the name, this holds a probability — the model ends in a Sigmoid.
# It is called outside torch.no_grad(), so the result still tracks gradients.
logits = model(input_tensor)

In [42]:
# Collapse the (1, 1) output to a scalar tensor for display.
logits.squeeze()

tensor(0.9850, grad_fn=<SqueezeBackward0>)

In [43]:
# Apply the same 0.5 threshold used during evaluation to obtain a label.
sentiment = "positive" if logits.squeeze().item() >= 0.5 else "negative"

In [45]:
# Report the sentence together with its predicted sentiment.
print(f"{input_query} : {sentiment}")

I love this item very much. : positive


In [47]:
def predict_sentiment(text, model=model, vectorizer=vectorizer):
    """Print the predicted sentiment and probability for a single text.

    Args:
        text: Sentence to classify.
        model: Classifier returning a probability. The default is the
            notebook-level ``model`` as bound when this function was defined.
        vectorizer: Fitted vectorizer. The default is likewise captured at
            definition time.

    The result is printed as ``<text> : <sentiment> : <probability>``;
    nothing is returned.
    """
    features = torch.tensor(
        vectorizer.transform([text]).toarray(), dtype=torch.float32
    )

    model.eval()

    with torch.no_grad():
        # The model output is already a probability (post-sigmoid).
        score = model(features).squeeze().item()

    if score >= 0.5:
        verdict = "positive"
    else:
        verdict = "negative"

    print(f"{text} : {verdict} : {score:.4f}")

In [48]:
# Try the helper on the same sentence used in the step-by-step walk-through.
predict_sentiment("I love this item very much.")

I love this item very much. : positive : 0.9850


In [49]:
# Try a negative example (the input text contains the misspelling "helful").
predict_sentiment("Staff are not helful at all.")

Staff are not helful at all. : negative : 0.0000
