In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.random.seed(42)

class Perceptron:
    def __init__(self, learning_rate):
        self.learning_rate = learning_rate
        self.w = None
        self.b = None
        self.total_updates = 0

    def train(self, X, y, epochs):
        n_samples, n_features = X.shape
        # Initialize weights and bias
        self.w = np.random.uniform(-0.01, 0.01, size=n_features)
        self.b = np.random.uniform(-0.01, 0.01)

        # Training loop
        for _ in range(epochs):
            for i in range(n_samples):
                if y[i] * (np.dot(X[i], self.w) + self.b) <= 0:
                    # Update weights and bias
                    self.w += self.learning_rate * y[i] * X[i]
                    self.b += self.learning_rate * y[i]
                    self.total_updates += 1

        return self

    def predict(self, X):
        return np.sign(np.dot(X, self.w) + self.b)

def split_data(X_train, y_train, fold, num_folds):
    fold_size = len(X_train) // num_folds
    start = fold * fold_size
    end = (fold + 1) * fold_size

    X_val_fold = X_train[start:end]
    y_val_fold = y_train[start:end]

    X_train_fold = np.concatenate([X_train[:start], X_train[end:]])
    y_train_fold = np.concatenate([y_train[:start], y_train[end:]])

    return X_train_fold, y_train_fold, X_val_fold, y_val_fold

def cross_validation(learning_rates, epochs_cv, X_train, y_train):
    results = []

    for eta in learning_rates:
        fold_accuracies = []

        for _ in range(epochs_cv):
            fold_accuracy = 0

            for fold in range(5):
                X_train_fold, y_train_fold, X_val_fold, y_val_fold = split_data(X_train, y_train, fold, 5)

                perceptron = Perceptron(eta)
                perceptron.train(X_train_fold, y_train_fold, epochs=1)
                fold_accuracy += evaluate(X_val_fold, y_val_fold, perceptron)

            fold_accuracy /= 5  # Average accuracy across folds
            fold_accuracies.append(fold_accuracy)

        avg_accuracy = np.mean(fold_accuracies)
        results.append((eta, avg_accuracy))

    best_eta, best_accuracy = max(results, key=lambda x: x[1])
    return best_eta, best_accuracy

def evaluate(X, y, perceptron):
    y_pred = perceptron.predict(X)
    accuracy = np.mean(y_pred == y)
    return accuracy

# Load data
train_data = pd.read_csv("new.bow.train.csv", header=None, skiprows=1)
X_train = train_data.iloc[:, 1:].values.astype(float)
y_train = train_data.iloc[:, 0].values.astype(int)

dev_data = pd.read_csv("data/bag-of-words/bow.eval.anon.csv", header=None, skiprows=1)
X_dev = dev_data.iloc[:, 1:].values.astype(float)
y_dev = dev_data.iloc[:, 0].values.astype(int)

# Define hyperparameters
learning_rates = [1, 0.1, 0.01]
epochs_cv = 10

# Cross-validation to find the best learning rate
best_eta, best_accuracy = cross_validation(learning_rates, epochs_cv, X_train, y_train)
print("Best learning rate found:", best_eta)
print("Best accuracy: ", best_accuracy)

# Train the model with the best learning rate for more epochs
epochs_train = 20
perceptron = Perceptron(best_eta)
best_perceptron = perceptron.train(X_train, y_train, epochs_train)
print("Total number of updates on the training set:", perceptron.total_updates)

# Evaluate the best-performing perceptron on the test set
test_data = pd.read_csv("/home/u1472278/ML/Project/project_data/new.bow.test.csv", header=None, skiprows=1)
X_test = test_data.iloc[:, 1:].values.astype(float)
y_test = test_data.iloc[:, 0].values.astype(int)
test_accuracy = evaluate(X_test, y_test, best_perceptron)
print("Test accuracy using the best-performing perceptron:", test_accuracy)

# Evaluate the best-performing perceptron on the dev set
# dev_accuracy = evaluate(X_dev, y_dev, perceptron)
# print("Dev accuracy:", dev_accuracy)

# Predict labels for the dev set and save to submission file
predicted_labels = best_perceptron.predict(X_dev)
predicted_labels_int = predicted_labels.astype(int)

predicted_labels_df = pd.DataFrame(predicted_labels_int, columns=['label'])
eval_ids = pd.read_csv("data/eval.ids", header=None, names=['example_id'])

submission_df = pd.concat([eval_ids['example_id'], predicted_labels_df], axis=1)

# Save the concatenated DataFrame to a CSV file
#submission_df.to_csv('submission.csv', index=False)


Best learning rate found: 0.01
Best accuracy:  0.6364914285714285
Total number of updates on the training set: 87955
Test accuracy using the best-performing perceptron: 0.6777777777777778


# Enhanced perceptron

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron

# Load data
train_data = pd.read_csv("new.glove.train.csv", header=None, skiprows=1)
X_train = train_data.iloc[:, 1:].values.astype(float)
y_train = train_data.iloc[:, 0].values.astype(int)

# Normalize and scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Split data into train and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Define and train Perceptron model
perceptron = Perceptron()
perceptron.fit(X_train_split, y_train_split)

# Evaluate model on validation set
y_pred_val = perceptron.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", val_accuracy)

# Train model on full training set
perceptron.fit(X_train_scaled, y_train)

# Load test data
test_data = pd.read_csv("new.glove.test.csv", header=None, skiprows=1)
X_test = test_data.iloc[:, 1:].values.astype(float)
X_test_scaled = scaler.transform(X_test)  # Scale test data using the same scaler

# Predict labels for test set
y_pred_test = perceptron.predict(X_test_scaled)

# Save predictions to submission file
eval_data = pd.read_csv("/home/u1472278/ML/Project/project_data/data/glove/glove.eval.anon.csv", header=None, skiprows=1)
X_eval = eval_data.iloc[:, 1:].values.astype(float)
X_eval_scaled = scaler.transform(X_eval)  # Scale evaluation data using the same scaler

# Predict labels for evaluation set
y_pred_eval = perceptron.predict(X_eval_scaled)

predicted_labels_int = y_pred_eval.astype(int)

predicted_labels_df = pd.DataFrame(predicted_labels_int, columns=['label'])
eval_ids = pd.read_csv("data/eval.ids", header=None, names=['example_id'])

submission_df = pd.concat([eval_ids['example_id'], predicted_labels_df], axis=1)

# Save the concatenated DataFrame to a CSV file
submission_df.to_csv('solution_enhanced_perceptron_glove.csv', index=False)

Validation Accuracy: 0.5528571428571428


In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron

# Load data
train_data = pd.read_csv("/home/u1472278/ML/Project/project_data/glove_misc_train.csv", header=None, skiprows=1)
X_train = train_data.iloc[:, 1:].values.astype(float)
y_train = train_data.iloc[:, 0].values.astype(int)

# Normalize and scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Split data into train and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Define and train Perceptron model
perceptron = Perceptron()
perceptron.fit(X_train_split, y_train_split)

# Evaluate model on validation set
y_pred_val = perceptron.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", val_accuracy)

# Train model on full training set
perceptron.fit(X_train_scaled, y_train)

# Load test data
test_data = pd.read_csv("/home/u1472278/ML/Project/project_data/glove_misc_test.csv", header=None, skiprows=1)
X_test = test_data.iloc[:, 1:].values.astype(float)
X_test_scaled = scaler.transform(X_test)  # Scale test data using the same scaler

# Predict labels for test set
y_pred_test = perceptron.predict(X_test_scaled)

# Save predictions to submission file
eval_data = pd.read_csv("/home/u1472278/ML/Project/project_data/glove_misc_eval.csv", header=None, skiprows=1)
X_eval = eval_data.iloc[:, 1:].values.astype(float)
X_eval_scaled = scaler.transform(X_eval)  # Scale evaluation data using the same scaler

# Predict labels for evaluation set
y_pred_eval = perceptron.predict(X_eval_scaled)

predicted_labels_int = y_pred_eval.astype(int)

predicted_labels_df = pd.DataFrame(predicted_labels_int, columns=['label'])
eval_ids = pd.read_csv("data/eval.ids", header=None, names=['example_id'])

submission_df = pd.concat([eval_ids['example_id'], predicted_labels_df], axis=1)

# Save the concatenated DataFrame to a CSV file
submission_df.to_csv('solution_enhanced_perceptron_glove_misc.csv', index=False)

Validation Accuracy: 0.5528571428571428


In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron

# Load data
train_data = pd.read_csv("/home/u1472278/ML/Project/project_data/bow_misc_train.csv", header=None, skiprows=1)
X_train = train_data.iloc[:, 1:].values.astype(float)
y_train = train_data.iloc[:, 0].values.astype(int)

# Normalize and scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Split data into train and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Define and train Perceptron model
perceptron = Perceptron()
perceptron.fit(X_train_split, y_train_split)

# Evaluate model on validation set
y_pred_val = perceptron.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", val_accuracy)

# Train model on full training set
perceptron.fit(X_train_scaled, y_train)

# Load test data
test_data = pd.read_csv("/home/u1472278/ML/Project/project_data/bow_misc_test.csv", header=None, skiprows=1)
X_test = test_data.iloc[:, 1:].values.astype(float)
X_test_scaled = scaler.transform(X_test)  # Scale test data using the same scaler

# Predict labels for test set
y_pred_test = perceptron.predict(X_test_scaled)

# Save predictions to submission file
eval_data = pd.read_csv("/home/u1472278/ML/Project/project_data/bow_misc_eval.csv", header=None, skiprows=1)
X_eval = eval_data.iloc[:, 1:].values.astype(float)
X_eval_scaled = scaler.transform(X_eval)  # Scale evaluation data using the same scaler

# Predict labels for evaluation set
y_pred_eval = perceptron.predict(X_eval_scaled)

predicted_labels_int = y_pred_eval.astype(int)

predicted_labels_df = pd.DataFrame(predicted_labels_int, columns=['label'])
eval_ids = pd.read_csv("data/eval.ids", header=None, names=['example_id'])

submission_df = pd.concat([eval_ids['example_id'], predicted_labels_df], axis=1)

# Save the concatenated DataFrame to a CSV file
submission_df.to_csv('solution_enhanced_perceptron_bow_misc.csv', index=False)

Validation Accuracy: 0.6594285714285715
