# bow

In [2]:
import numpy as np
import pandas as pd

def sigmoid(z):
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))

# def compute_gradient(X, y, w, C):
#     m = X.shape[0]
#     y_pred = sigmoid(np.dot(X, w)) * 2 - 1  # Transform sigmoid output to -1 and +1
#     gradient = np.dot(X.T, (y_pred - y)) / m + C * w
#     return gradient
def compute_svm_gradient(X, y, w, C):
    m = X.shape[0]
    distances = 1 - y * (np.dot(X, w))
    dw = np.zeros(len(w))
    for ind, d in enumerate(distances):
        if max(0, d) == 0:
            di = w
        else:
            di = w - (C * y[ind] * X[ind])
        dw += di
    dw = dw/m  # Average
    return dw
def logistic_regression_SGD(X_train, y_train, learning_rate, epochs, C, batch_size=32):
    w = np.zeros(X_train.shape[1])
    num_samples = X_train.shape[0]
    
    for epoch in range(epochs):
        indices = np.random.permutation(num_samples)
        X_train_shuffled = X_train[indices]
        y_train_shuffled = y_train[indices]
        
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            X_batch = X_train_shuffled[start:end]
            y_batch = y_train_shuffled[start:end]
            
            gradient = compute_gradient(X_batch, y_batch, w, C)
            w -= learning_rate * gradient

        learning_rate /= (1 + epoch)  # Decaying learning rate

    return w

def predict(X, weights):
    z = np.dot(X, weights)
    return np.where(sigmoid(z) >= 0.5, 1, -1)

def calculate_metrics(y_true, y_pred):
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    false_positives = np.sum((y_pred == 1) & (y_true == -1))
    false_negatives = np.sum((y_pred == -1) & (y_true == 1))
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return precision, recall, f1

def calculate_accuracy(y_true, y_pred):
    correct_predictions = np.sum(y_true == y_pred)
    total_predictions = len(y_true)
    accuracy = correct_predictions / total_predictions
    return accuracy

def k_fold_split(num_samples, k=5):
    indices = np.arange(num_samples)
    np.random.shuffle(indices)
    fold_sizes = np.full(k, num_samples // k, dtype=int)
    fold_sizes[:num_samples % k] += 1
    current = 0
    folds = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        folds.append(indices[start:stop])
        current = stop
    return folds

def perform_cross_validation(X, y, k, learning_rates, Cs, epochs, batch_size):
    num_samples = len(y)
    folds = k_fold_split(num_samples, k)
    best_params = {}
    best_f1 = -np.inf

    for gamma in learning_rates:
        for C in Cs:
            f1_scores = []

            for i in range(k):
                test_idx = folds[i]
                train_idx = np.hstack([folds[j] for j in range(k) if j != i])

                X_train, X_valid = X[train_idx], X[test_idx]
                y_train, y_valid = y[train_idx], y[test_idx]
                
                weights = logistic_regression_SGD(X_train, y_train, gamma, epochs, C, batch_size)
                y_pred = predict(X_valid, weights)
                
                _, _, f1 = calculate_metrics(y_valid, y_pred)
                f1_scores.append(f1)

            avg_f1 = np.mean(f1_scores)
            if avg_f1 > best_f1:
                best_f1 = avg_f1
                best_params = {'gamma': gamma, 'C': C}

    return best_params

# Load datasets
train_data = pd.read_csv('/home/u1472278/ML/Project/project_data/new.bow.train.csv')
dev_data = pd.read_csv('/home/u1472278/ML/Project/project_data/data/bag-of-words/bow.eval.anon.csv')
test_data = pd.read_csv('/home/u1472278/ML/Project/project_data/new.bow.test.csv')

X_train = train_data.iloc[:, 1:].values
y_train = train_data.iloc[:, 0].values
X_dev = dev_data.iloc[:, 1:].values
y_dev = dev_data.iloc[:, 0].values
X_test = test_data.iloc[:, 1:].values
y_test = test_data.iloc[:, 0].values

# Hyperparameters and training configuration
learning_rates = [0.1, 0.01, 0.001]  # Simplified hyperparameter space
Cs = [0.01, 0.1, 1]
epochs = 50
batch_size = 100
k = 5  # Number of folds

# Perform cross-validation to find the best hyperparameters
best_hyperparameters = perform_cross_validation(X_train, y_train, k, learning_rates, Cs, epochs, batch_size)
print("Best Hyperparameters:", best_hyperparameters)

# Train final model on the full training data
final_weights = logistic_regression_SGD(X_train, y_train, best_hyperparameters['gamma'], epochs, best_hyperparameters['C'], batch_size)

# Predict and evaluate on the dev set
y_pred_dev = predict(X_dev, final_weights)
dev_precision, dev_recall, dev_f1 = calculate_metrics(y_dev, y_pred_dev)
dev_accuracy = calculate_accuracy(y_dev, y_pred_dev)

predicted_labels_int = y_pred_dev.astype(int)
predicted_labels_df = pd.DataFrame(predicted_labels_int, columns=['label'])
eval_ids = pd.read_csv("data/eval.ids", header=None, names=['example_id'])

submission_df = pd.concat([eval_ids['example_id'], predicted_labels_df], axis=1)

# Save the concatenated DataFrame to a CSV file
submission_df.to_csv('submission_svm_new_bow.csv', index=False)


print(f"Dev Precision: {dev_precision:.4f}")
print(f"Dev Recall: {dev_recall:.4f}")
print(f"Dev F1 Score: {dev_f1:.4f}")
print(f"Dev Accuracy: {dev_accuracy:.4f}")

# Predict and evaluate on the test set
y_pred_test = predict(X_test, final_weights)
test_precision, test_recall, test_f1 = calculate_metrics(y_test, y_pred_test)
test_accuracy = calculate_accuracy(y_test, y_pred_test)

print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Best Hyperparameters: {'gamma': 0.1, 'C': 0.01}
Dev Precision: 1.0000
Dev Recall: 0.4844
Dev F1 Score: 0.6526
Dev Accuracy: 0.4844
Test Precision: 0.6973
Test Recall: 0.7107
Test F1 Score: 0.7040
Test Accuracy: 0.7107


# Glove

In [2]:
import numpy as np
import pandas as pd

def sigmoid(z):
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))

def compute_gradient(X, y, w, C):
    m = X.shape[0]
    y_pred = sigmoid(np.dot(X, w)) * 2 - 1  # Transform sigmoid output to -1 and +1
    gradient = np.dot(X.T, (y_pred - y)) / m + C * w
    return gradient

def logistic_regression_SGD(X_train, y_train, learning_rate, epochs, C, batch_size=32):
    w = np.zeros(X_train.shape[1])
    num_samples = X_train.shape[0]
    
    for epoch in range(epochs):
        indices = np.random.permutation(num_samples)
        X_train_shuffled = X_train[indices]
        y_train_shuffled = y_train[indices]
        
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            X_batch = X_train_shuffled[start:end]
            y_batch = y_train_shuffled[start:end]
            
            gradient = compute_gradient(X_batch, y_batch, w, C)
            w -= learning_rate * gradient

        learning_rate /= (1 + epoch)  # Decaying learning rate

    return w

def predict(X, weights):
    z = np.dot(X, weights)
    return np.where(sigmoid(z) >= 0.5, 1, -1)

def calculate_metrics(y_true, y_pred):
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    false_positives = np.sum((y_pred == 1) & (y_true == -1))
    false_negatives = np.sum((y_pred == -1) & (y_true == 1))
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return precision, recall, f1

def calculate_accuracy(y_true, y_pred):
    correct_predictions = np.sum(y_true == y_pred)
    total_predictions = len(y_true)
    accuracy = correct_predictions / total_predictions
    return accuracy

def k_fold_split(num_samples, k=5):
    indices = np.arange(num_samples)
    np.random.shuffle(indices)
    fold_sizes = np.full(k, num_samples // k, dtype=int)
    fold_sizes[:num_samples % k] += 1
    current = 0
    folds = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        folds.append(indices[start:stop])
        current = stop
    return folds

def perform_cross_validation(X, y, k, learning_rates, Cs, epochs, batch_size):
    num_samples = len(y)
    folds = k_fold_split(num_samples, k)
    best_params = {}
    best_f1 = -np.inf

    for gamma in learning_rates:
        for C in Cs:
            f1_scores = []

            for i in range(k):
                test_idx = folds[i]
                train_idx = np.hstack([folds[j] for j in range(k) if j != i])

                X_train, X_valid = X[train_idx], X[test_idx]
                y_train, y_valid = y[train_idx], y[test_idx]
                
                weights = logistic_regression_SGD(X_train, y_train, gamma, epochs, C, batch_size)
                y_pred = predict(X_valid, weights)
                
                _, _, f1 = calculate_metrics(y_valid, y_pred)
                f1_scores.append(f1)

            avg_f1 = np.mean(f1_scores)
            if avg_f1 > best_f1:
                best_f1 = avg_f1
                best_params = {'gamma': gamma, 'C': C}

    return best_params

# Load datasets
train_data = pd.read_csv('/home/u1472278/ML/Project/project_data/new.glove.train.csv')
dev_data = pd.read_csv('/home/u1472278/ML/Project/project_data/data/glove/glove.eval.anon.csv')
test_data = pd.read_csv('/home/u1472278/ML/Project/project_data/new.glove.test.csv')

X_train = train_data.iloc[:, 1:].values
y_train = train_data.iloc[:, 0].values
X_dev = dev_data.iloc[:, 1:].values
y_dev = dev_data.iloc[:, 0].values
X_test = test_data.iloc[:, 1:].values
y_test = test_data.iloc[:, 0].values

# Hyperparameters and training configuration
learning_rates = [0.1, 0.01, 0.001]  # Simplified hyperparameter space
Cs = [0.01, 0.1, 1]
epochs = 100
batch_size = 32
k = 5  # Number of folds

# Perform cross-validation to find the best hyperparameters
best_hyperparameters = perform_cross_validation(X_train, y_train, k, learning_rates, Cs, epochs, batch_size)
print("Best Hyperparameters:", best_hyperparameters)

# Train final model on the full training data
final_weights = logistic_regression_SGD(X_train, y_train, best_hyperparameters['gamma'], epochs, best_hyperparameters['C'], batch_size)

# Predict and evaluate on the dev set
y_pred_dev = predict(X_dev, final_weights)
dev_precision, dev_recall, dev_f1 = calculate_metrics(y_dev, y_pred_dev)
dev_accuracy = calculate_accuracy(y_dev, y_pred_dev)

predicted_labels_int = y_pred_dev.astype(int)
predicted_labels_df = pd.DataFrame(predicted_labels_int, columns=['label'])
eval_ids = pd.read_csv("data/eval.ids", header=None, names=['example_id'])

submission_df = pd.concat([eval_ids['example_id'], predicted_labels_df], axis=1)

# Save the concatenated DataFrame to a CSV file
submission_df.to_csv('submission_svm_new_glove.csv', index=False)


print(f"Dev Precision: {dev_precision:.4f}")
print(f"Dev Recall: {dev_recall:.4f}")
print(f"Dev F1 Score: {dev_f1:.4f}")
print(f"Dev Accuracy: {dev_accuracy:.4f}")

# Predict and evaluate on the test set
y_pred_test = predict(X_test, final_weights)
test_precision, test_recall, test_f1 = calculate_metrics(y_test, y_pred_test)
test_accuracy = calculate_accuracy(y_test, y_pred_test)

print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Best Hyperparameters: {'gamma': 0.001, 'C': 0.1}
Dev Precision: 1.0000
Dev Recall: 0.4924
Dev F1 Score: 0.6599
Dev Accuracy: 0.4924
Test Precision: 0.6434
Test Recall: 0.6345
Test F1 Score: 0.6389
Test Accuracy: 0.6529
