In [35]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer # For feature engineering

# Load the input files
x_train = pd.read_csv('datasets/x_train.csv')
y_train = pd.read_csv('datasets/y_train.csv')
x_test = pd.read_csv('datasets/x_test.csv')

# Initialize the TfidfVectorizer
tfidf = TfidfVectorizer(max_features=10000)  # Set the maximum number of features to 10,000

# Fit and transform the training data
X_train = tfidf.fit_transform(x_train['text'])
X_train = X_train.toarray()

# Transform the test data
X_test = tfidf.transform(x_test['text'])
X_test = X_test.toarray()

In [15]:
# Viewing shape of input data
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
# print(np.array(X_train)) # View new feature represented data

(2400, 2)
(2400, 1)
(600, 2)


In [21]:
# Viewing shape of processed data
print(X_train.shape[0])
print(y_train.shape[0])

2400
2400


In [36]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def cost_function(theta, X, y):
    m = len(y)
    h = sigmoid(X @ theta) # @ = matrix multiplication
    J = (-1 / m) * (y.T @ np.log(h) + (1 - y).T @ np.log(1 - h))
    return J[0]

def gradient_descent(theta, X, y, alpha, num_iters):
    m = len(y)
    J_history = np.zeros(num_iters) # J_history = history of the cost function J per iteration

    for i in range(num_iters):
        h = sigmoid(X @ theta)
        theta = theta - (alpha / m) * (X.T @ (h - y.reshape(-1, 1)))
        J_history[i] = cost_function(theta, X, y)

    return theta, J_history

In [37]:
# Train the Logistic Regression model
theta = np.zeros((X_train.shape[1], 1))
alpha = 0.01 # learning rate/step size
num_iters = 400
theta, J_history = gradient_descent(theta, X_train, y_train['is_positive_sentiment'].values, alpha, num_iters)

# Generate predicted probabilities for the test set
y_prob_test = sigmoid(X_test @ theta)

# Convert probabilities to binary predictions (0 or 1)
# y_pred_test = (y_prob_test >= 0.5).astype(int)
# np.savetxt('yprob_test.txt', y_pred_test, fmt='%d')

# Save the predicted labels
np.savetxt('yprob_test.txt', y_prob_test)

In [33]:
# Cross-validation for testing the model. Currently very poor results, likely due to text input being such short snippets 
import numpy as np
from sklearn.model_selection import train_test_split

# Load the training data
x_train = pd.read_csv('datasets/x_train.csv')
y_train = pd.read_csv('datasets/y_train.csv')['is_positive_sentiment'].values

# Split the training set in half
X_train, X_val, y_train, y_val = train_test_split(x_train['text'], y_train, test_size=0.5, random_state=42)

# Initialize the TfidfVectorizer
tfidf = TfidfVectorizer(max_features=10000)

# Fit and transform the training and validation data
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

# Train the Logistic Regression model
theta = np.zeros((X_train_tfidf.shape[1], 1))
alpha = 0.01
num_iters = 400
theta, J_history = gradient_descent(theta, X_train_tfidf.toarray(), y_train, alpha, num_iters)

# Evaluate on the validation set
y_val_prob = sigmoid(X_val_tfidf.toarray() @ theta)
y_val_pred = (y_val_prob >= 0.5).astype(int)
val_accuracy = np.mean(y_val_pred == y_val)
print(f"Validation Accuracy: {val_accuracy:.4f}")

Validation Accuracy: 0.4993
