In [7]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from collections import OrderedDict

# Load the pre-trained word embeddings (one token followed by its vector per row).
# quoting=3 (csv.QUOTE_NONE) treats quote characters as ordinary text.
# keep_default_na=False stops pandas from converting tokens spelled like
# "nan", "null" or "n/a" into a missing (NaN) index entry, which would make
# those words impossible to look up by name in word2vec.
word_embeddings = pd.read_csv(
    'datasets/glove.6B.50d.txt',
    header=None,
    sep=' ',
    index_col=0,
    quoting=3,
    keep_default_na=False,
)
word_list = word_embeddings.index.values.tolist()
# Map each token to its embedding row, keeping the order of the file.
word2vec = OrderedDict(zip(word_list, word_embeddings.values))

# Read the raw splits from disk: training texts, training labels, test texts.
x_train = pd.read_csv('datasets/x_train.csv')
# Only the 'is_positive_sentiment' column is kept, as a plain NumPy array.
y_train = pd.read_csv('datasets/y_train.csv').loc[:, 'is_positive_sentiment'].to_numpy()
x_test = pd.read_csv('datasets/x_test.csv')

# Convert text to word embeddings
def text_to_embeddings(text, vocab=None, dim=50):
    """Return the mean embedding of the known words in *text*.

    The text is lower-cased and split on whitespace; tokens that are not
    present in the lookup are ignored.

    Args:
        text: The sentence to embed. Non-string values (for example the float
            NaN that pandas produces for an empty cell) are treated as text
            with no known words.
        vocab: Optional mapping from token to embedding vector. When omitted,
            the module-level ``word2vec`` mapping is used.
        dim: Length of the zero vector returned when no token is found. It
            should equal the length of the vectors in the lookup so that the
            results can be stacked into one matrix.

    Returns:
        A 1-D ``torch.float32`` tensor: the element-wise mean of the matched
        vectors, or ``dim`` zeros when nothing matched.
    """
    lookup = word2vec if vocab is None else vocab

    # Guard against missing values: calling .lower() on a float would raise.
    tokens = text.lower().split() if isinstance(text, str) else []

    vectors = [lookup[token] for token in tokens if token in lookup]
    if not vectors:
        return torch.zeros(dim, dtype=torch.float32)
    return torch.tensor(np.mean(vectors, axis=0), dtype=torch.float32)

# Embed every sentence and stack the per-sentence vectors into one matrix
# per split (one row per entry of the 'text' column).
train_vectors = list(map(text_to_embeddings, x_train['text']))
X_train_embeddings = torch.stack(train_vectors)
test_vectors = list(map(text_to_embeddings, x_test['text']))
X_test_embeddings = torch.stack(test_vectors)

# Define the logistic regression model
class LogisticRegression(nn.Module):
    """Single linear layer followed by a sigmoid.

    Maps inputs with ``input_dim`` features to one value in (0, 1) per row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # One output unit: the logit for the single predicted class.
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for *x*."""
        logits = self.linear(x)
        return logits.sigmoid()

# Perform 5-fold cross-validation
def _fit_logistic_regression(features, targets, num_epochs=100, lr=0.01):
    """Train a fresh LogisticRegression with full-batch Adam and return it.

    Args:
        features: 2-D float tensor, one row per example.
        targets: Float tensor of 0/1 labels with the same shape as the model
            output, i.e. one column.
        num_epochs: Number of full-batch gradient steps.
        lr: Adam learning rate.

    Returns:
        The trained model. The loss is printed every 10th epoch.
    """
    model = LogisticRegression(features.shape[1])
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    model.train()
    for epoch in range(1, num_epochs + 1):
        optimizer.zero_grad()
        loss = criterion(model(features), targets)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")
    return model


def _binary_accuracy(model, features, targets, threshold=0.5):
    """Return the fraction of rows whose thresholded prediction equals the target."""
    model.eval()
    with torch.no_grad():
        predictions = (model(features) > threshold).float()
        return (predictions == targets).float().mean().item()


kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

for fold, (train_index, val_index) in enumerate(kf.split(X_train_embeddings), start=1):
    print(f"Fold {fold}:")

    # The fold split is seeded above; seed the weight initialisation as well
    # so that the reported accuracies are reproducible from run to run.
    torch.manual_seed(42 + fold)

    # Split the data into train and validation sets
    X_train, X_val = X_train_embeddings[train_index], X_train_embeddings[val_index]

    # Labels become column tensors so they match the model's output shape
    y_train_fold = torch.tensor(y_train[train_index], dtype=torch.float32).unsqueeze(1)
    y_val = torch.tensor(y_train[val_index], dtype=torch.float32).unsqueeze(1)

    # Train on this fold's training part, then score the held-out part
    model = _fit_logistic_regression(X_train, y_train_fold)
    val_accuracy = _binary_accuracy(model, X_val, y_val)
    accuracies.append(val_accuracy)
    print(f"Validation Accuracy: {val_accuracy:.4f}\n")

print(f"Average Cross-Validation Accuracy: {sum(accuracies) / len(accuracies):.4f}")

# Fit the final model on ALL labelled data before predicting the test set.
# The model left over from the cross-validation loop was trained on only the
# training part of the last fold, so it is an arbitrary choice for the final
# predictions. Settings below mirror the per-fold training (Adam, lr=0.01,
# 100 full-batch epochs).
final_targets = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
model = LogisticRegression(X_train_embeddings.shape[1])
final_criterion = nn.BCELoss()
final_optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model.train()
for _ in range(100):
    final_optimizer.zero_grad()
    final_loss = final_criterion(model(X_train_embeddings), final_targets)
    final_loss.backward()
    final_optimizer.step()

# Run the model on the test set and save the predicted probabilities
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_embeddings)
    # squeeze(1) drops only the output column, so a single-row test set still
    # gives a 1-D array (a bare squeeze() would give a 0-d array, which
    # np.savetxt rejects).
    test_probs = test_outputs.squeeze(1).numpy()

# Save the predicted probabilities to a file, one value per line
np.savetxt('datasets/yprob_test.txt', test_probs)
print("Predicted probabilities for the test set saved to 'datasets/yprob_test.txt'")

Fold 1:
Epoch [10/100], Loss: 0.6742
Epoch [20/100], Loss: 0.6432
Epoch [30/100], Loss: 0.6225
Epoch [40/100], Loss: 0.6083
Epoch [50/100], Loss: 0.5978
Epoch [60/100], Loss: 0.5899
Epoch [70/100], Loss: 0.5836
Epoch [80/100], Loss: 0.5785
Epoch [90/100], Loss: 0.5743
Epoch [100/100], Loss: 0.5708
Validation Accuracy: 0.7104

Fold 2:
Epoch [10/100], Loss: 0.6546
Epoch [20/100], Loss: 0.6264
Epoch [30/100], Loss: 0.6077
Epoch [40/100], Loss: 0.5947
Epoch [50/100], Loss: 0.5851
Epoch [60/100], Loss: 0.5781
Epoch [70/100], Loss: 0.5727
Epoch [80/100], Loss: 0.5685
Epoch [90/100], Loss: 0.5652
Epoch [100/100], Loss: 0.5625
Validation Accuracy: 0.7063

Fold 3:
Epoch [10/100], Loss: 0.6559
Epoch [20/100], Loss: 0.6255
Epoch [30/100], Loss: 0.6064
Epoch [40/100], Loss: 0.5935
Epoch [50/100], Loss: 0.5842
Epoch [60/100], Loss: 0.5774
Epoch [70/100], Loss: 0.5721
Epoch [80/100], Loss: 0.5679
Epoch [90/100], Loss: 0.5646
Epoch [100/100], Loss: 0.5618
Validation Accuracy: 0.7500

Fold 4:
Epoch [1