In [12]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from collections import OrderedDict
from sklearn.feature_extraction.text import CountVectorizer

# Load the pre-trained word embeddings.
# The file is parsed as space-separated rows with no header; the first column
# (the token) becomes the index and the remaining columns form the vector.
# quoting=3 is csv.QUOTE_NONE, so quote characters that occur as tokens are
# read literally. keep_default_na=False stops pandas from converting tokens
# such as "null", "nan" or "na" into a float NaN index entry, which would
# make those words unreachable by string lookup in word2vec.
word_embeddings = pd.read_csv(
    'datasets/glove.6B.50d.txt',
    header=None,
    sep=' ',
    index_col=0,
    quoting=3,
    keep_default_na=False,
)
word_list = word_embeddings.index.values.tolist()
# Map each token to its embedding row (a NumPy array), preserving file order.
word2vec = OrderedDict(zip(word_list, word_embeddings.values))

# Load the input files: raw text for both splits, and the binary label column
# for the training split as a NumPy array.
x_train = pd.read_csv('datasets/x_train.csv')
y_train = pd.read_csv('datasets/y_train.csv')['is_positive_sentiment'].values
x_test = pd.read_csv('datasets/x_test.csv')

# Normalise the text: lowercase it and drop tokens that have no pre-trained embedding
def preprocess_text(text, vocab=None):
    """Lowercase *text* and keep only the tokens found in the vocabulary.

    The text is split on whitespace only. No punctuation stripping or
    stopword removal is performed, so a token survives only when its exact
    lowercase form is a member of *vocab*.

    Args:
        text: Raw input string.
        vocab: Any container supporting ``in`` (for example a dict or set of
            known tokens). Defaults to the module-level ``word2vec`` mapping.

    Returns:
        The surviving tokens joined by single spaces, or an empty string
        when no token is known.
    """
    if vocab is None:
        # Resolved at call time so the module-level table is used by default.
        vocab = word2vec
    return ' '.join(token for token in text.lower().split() if token in vocab)

# Overwrite the 'text' column of both splits with its preprocessed form.
for frame in (x_train, x_test):
    frame['text'] = frame['text'].apply(preprocess_text)

# Convert text to word embeddings
def text_to_embeddings(text, word_vectors=None, dim=None):
    """Return the mean word vector of *text* as a float32 tensor.

    Args:
        text: String of whitespace-separated tokens.
        word_vectors: Mapping from token to a 1-D vector. Defaults to the
            module-level ``word2vec`` mapping.
        dim: Length of the zero vector returned when no token of *text* is
            known. When omitted it is taken from the first vector in
            *word_vectors*, falling back to 50 if the mapping is empty.

    Returns:
        A 1-D ``torch.float32`` tensor: the element-wise mean of the vectors
        of all known tokens, or all zeros when none of the tokens is known.
    """
    if word_vectors is None:
        # Resolved at call time so the module-level table is used by default.
        word_vectors = word2vec

    vectors = [word_vectors[token] for token in text.split() if token in word_vectors]
    if vectors:
        return torch.tensor(np.mean(vectors, axis=0), dtype=torch.float32)

    if dim is None:
        # Size the fallback to match the table rather than assuming a width.
        sample = next(iter(word_vectors.values()), None)
        dim = 50 if sample is None else len(sample)
    return torch.zeros(dim, dtype=torch.float32)

# Embed every text and stack the per-text vectors along a new leading dimension.
X_train_embeddings = torch.stack(list(map(text_to_embeddings, x_train['text'])))
X_test_embeddings = torch.stack(list(map(text_to_embeddings, x_test['text'])))

# Define the sentiment classifier model
class SentimentClassifier(nn.Module):
    """LSTM followed by a linear layer and a sigmoid, giving one score per input row.

    Args:
        input_dim: Feature size of each input vector (LSTM ``input_size``).
        hidden_dim: LSTM hidden state size; also the linear layer's input size.
        num_layers: Number of stacked LSTM layers.
        dropout: Value forwarded to ``nn.LSTM``'s ``dropout`` argument.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid scores computed from the last layer's final hidden state.

        A singleton dimension is inserted at position 1 before the LSTM, so
        with ``batch_first=True`` each row is processed as a one-step
        sequence (assumes ``x`` is ``(batch, input_dim)`` -- TODO confirm for
        other callers).
        """
        as_sequence = torch.unsqueeze(x, 1)
        _, (h_n, _) = self.lstm(as_sequence)
        # h_n[-1] is the final hidden state of the top LSTM layer.
        logits = self.fc(h_n[-1])
        return self.sigmoid(logits)

# Hyperparameters used for every cross-validation fold
hidden_dim, num_layers = 128, 2  # LSTM hidden size and number of stacked layers
dropout = 0.5                    # forwarded to the classifier's dropout argument
learning_rate = 0.001            # Adam learning rate
num_epochs = 200                 # optimisation steps per fold (one full-batch step each)

# Perform 5-fold cross-validation: train a fresh model on each training split
# and record its accuracy on the held-out split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

# Size the model from the data instead of assuming a fixed embedding width.
input_dim = X_train_embeddings.shape[1]

for fold, (train_index, val_index) in enumerate(kf.split(X_train_embeddings), start=1):
    print(f"Fold {fold}:")

    # Split the data into train and validation sets
    X_train, X_val = X_train_embeddings[train_index], X_train_embeddings[val_index]
    y_train_fold, y_val = y_train[train_index], y_train[val_index]

    # Convert labels to float column tensors so they match the model's
    # (batch, 1) output shape expected by BCELoss.
    y_train_fold = torch.tensor(y_train_fold, dtype=torch.float32).unsqueeze(1)
    y_val = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)

    # Initialize a fresh model, loss and optimizer for this fold
    model = SentimentClassifier(input_dim, hidden_dim, num_layers, dropout)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model: each epoch is a single full-batch gradient step.
    model.train()
    for epoch in range(1, num_epochs + 1):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train_fold)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")

    # Evaluate the model on the validation set.
    # eval() switches off the LSTM's inter-layer dropout; without it the
    # validation predictions are randomly perturbed and accuracy is distorted.
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_preds = (val_outputs > 0.5).float()
        val_accuracy = (val_preds == y_val).float().mean()
        accuracies.append(val_accuracy.item())
        print(f"Validation Accuracy: {val_accuracy.item():.4f}\n")

print(f"Average Cross-Validation Accuracy: {sum(accuracies) / len(accuracies):.4f}")

# Run the model on the test set and save the predicted probabilities.
# NOTE(review): `model` is whatever the last cross-validation fold left
# behind, so it was fitted on that fold's training split only -- consider
# refitting on all training data before producing final predictions.
model.eval()  # disable dropout for inference
with torch.no_grad():
    test_outputs = model(X_test_embeddings)
    # Drop only the trailing size-1 output dimension. A bare squeeze() would
    # also collapse a single-row test set to a 0-d array, which np.savetxt
    # rejects.
    test_probs = test_outputs.squeeze(1).numpy()

# Save the predicted probabilities to a file, one value per line
np.savetxt('datasets/yprob_test.txt', test_probs)
print("Predicted probabilities for the test set saved to 'datasets/yprob_test.txt'")

Fold 1:
Epoch [10/200], Loss: 0.6898
Epoch [20/200], Loss: 0.6797
Epoch [30/200], Loss: 0.6535
Epoch [40/200], Loss: 0.6126
Epoch [50/200], Loss: 0.5825
Epoch [60/200], Loss: 0.5639
Epoch [70/200], Loss: 0.5575
Epoch [80/200], Loss: 0.5466
Epoch [90/200], Loss: 0.5440
Epoch [100/200], Loss: 0.5370
Epoch [110/200], Loss: 0.5349
Epoch [120/200], Loss: 0.5289
Epoch [130/200], Loss: 0.5333
Epoch [140/200], Loss: 0.5255
Epoch [150/200], Loss: 0.5233
Epoch [160/200], Loss: 0.5200
Epoch [170/200], Loss: 0.5210
Epoch [180/200], Loss: 0.5168
Epoch [190/200], Loss: 0.5172
Epoch [200/200], Loss: 0.5125
Validation Accuracy: 0.7167

Fold 2:
Epoch [10/200], Loss: 0.6892
Epoch [20/200], Loss: 0.6763
Epoch [30/200], Loss: 0.6424
Epoch [40/200], Loss: 0.6018
Epoch [50/200], Loss: 0.5803
Epoch [60/200], Loss: 0.5630
Epoch [70/200], Loss: 0.5478
Epoch [80/200], Loss: 0.5441
Epoch [90/200], Loss: 0.5364
Epoch [100/200], Loss: 0.5339
Epoch [110/200], Loss: 0.5304
Epoch [120/200], Loss: 0.5258
Epoch [130/20