# adaboost bow

In [5]:
import pandas as pd
import numpy as np

# Implementing the Perceptron and AdaBoostPerceptron classes
class Perceptron:
    """Binary perceptron with optional per-sample weights.

    The update rule multiplies each label by the raw score, so labels are
    expected to be -1 / +1.

    Attributes:
        learning_rate: Step size applied to every mistake-driven update.
        random_state: Optional seed used for the weight initialisation.
        w: Weight vector, one entry per feature (``None`` until ``train``).
        b: Scalar bias (``None`` until ``train``).
    """

    def __init__(self, learning_rate=0.01, random_state=None):
        """Store the hyper-parameters; nothing is trained here.

        Args:
            learning_rate: Step size for each update.
            random_state: Seed for the weight initialisation. With the
                default ``None`` the global ``np.random`` state is used, so
                ``np.random.seed`` keeps controlling the result.
        """
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.w = None
        self.b = None

    def train(self, X, y, epochs=1, sample_weights=None):
        """Fit the weights from scratch with the perceptron rule.

        Every call re-initialises ``w`` and ``b`` to small random values in
        [-0.01, 0.01) and then sweeps the samples in order ``epochs`` times.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of -1 / +1 labels, one per row of ``X``.
            epochs: Number of full passes over the data.
            sample_weights: Optional per-sample factors that scale each
                update; defaults to 1 for every sample.

        Raises:
            ValueError: If ``y`` or ``sample_weights`` does not have exactly
                one entry per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if len(y) != n_samples:
            raise ValueError(
                f"y has {len(y)} entries but X has {n_samples} rows"
            )

        # Fall back to the global generator when no seed was given so that
        # existing callers relying on np.random.seed see no change.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        self.w = rng.uniform(-0.01, 0.01, size=n_features)
        self.b = rng.uniform(-0.01, 0.01)

        if sample_weights is None:
            sample_weights = np.ones(n_samples)
        else:
            sample_weights = np.asarray(sample_weights)
            if len(sample_weights) != n_samples:
                raise ValueError(
                    f"sample_weights has {len(sample_weights)} entries "
                    f"but X has {n_samples} rows"
                )

        for _ in range(epochs):
            for x_i, y_i, weight_i in zip(X, y, sample_weights):
                # A non-positive margin means the sample is misclassified
                # (or sits exactly on the boundary), so nudge towards it.
                if y_i * (np.dot(x_i, self.w) + self.b) <= 0:
                    step = self.learning_rate * weight_i * y_i
                    self.w += step * x_i
                    self.b += step

    def predict(self, X):
        """Return +1.0 / -1.0 for each row of ``X``.

        A score of exactly zero is mapped to +1.0 so that the result is
        always a valid label (``np.sign`` would yield 0 there).
        """
        scores = np.dot(X, self.w) + self.b
        return np.where(scores >= 0, 1.0, -1.0)

class AdaBoostPerceptron:
    """AdaBoost ensemble whose weak learners are single-epoch perceptrons.

    Labels are expected to be -1 / +1: the re-weighting step multiplies the
    labels by the weak learner's predictions.

    Attributes:
        n_clf: Maximum number of boosting rounds.
        clfs: Trained weak learners, in training order.
        clf_weights: Vote weight of each weak learner, aligned with ``clfs``.
    """

    def __init__(self, n_clf=10):
        """Create an empty ensemble that will run at most ``n_clf`` rounds."""
        self.n_clf = n_clf
        self.clfs = []
        self.clf_weights = []

    def fit(self, X, y):
        """Train the ensemble on ``X`` / ``y``, discarding any earlier fit.

        Each round trains ``Perceptron(learning_rate=1.0)`` for one epoch
        using the current sample distribution as per-sample weights, then
        re-weights the samples so that mistakes gain mass.

        Args:
            X: Feature matrix passed unchanged to the weak learners.
            y: 1-D array-like of -1 / +1 labels.
        """
        y = np.asarray(y)
        n_samples = len(y)

        # Start from a clean slate so a second fit() does not keep voting
        # with learners from the previous one.
        self.clfs = []
        self.clf_weights = []

        D = np.ones(n_samples) / n_samples
        # Largest error for which the log below stays finite.
        max_error = 1.0 - np.finfo(float).eps

        for _ in range(self.n_clf):
            clf = Perceptron(learning_rate=1.0)
            clf.train(X, y, epochs=1, sample_weights=D)
            pred = clf.predict(X)
            error = np.dot(D, (pred != y))

            if error <= 0:
                # Perfect on the weighted training set: keep it with unit
                # weight and stop, since alpha would be infinite.
                self.clfs.append(clf)
                self.clf_weights.append(1)
                break

            # An error of 1 (or slightly above it through rounding) would
            # give log(0) or log(<0) and fill D with inf/NaN.
            error = min(error, max_error)
            alpha = 0.5 * np.log((1 - error) / error)
            self.clfs.append(clf)
            self.clf_weights.append(alpha)

            # Shrink the mass of correctly classified samples, grow the
            # mass of mistakes, then renormalise to a distribution.
            D *= np.exp(-alpha * y * pred)
            D /= D.sum()

    def predict(self, X):
        """Return the weighted-majority vote (+1.0 / -1.0) for each row.

        An exact tie in the vote is resolved to +1.0 so that the output is
        always a valid label.

        Raises:
            RuntimeError: If the ensemble holds no weak learners yet.
        """
        if not self.clfs:
            raise RuntimeError(
                "AdaBoostPerceptron has no weak learners; call fit() first"
            )
        clf_preds = np.array([clf.predict(X) for clf in self.clfs])
        scores = np.dot(self.clf_weights, clf_preds)
        return np.where(scores >= 0, 1.0, -1.0)

# ---- Load the bag-of-words splits ----
# In the labelled files column 0 is read as the label and every remaining
# column as a feature.
train_data = pd.read_csv("/home/u1472278/ML/Project/project_data/new.bow.train.csv")
X_train, y_train = train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values

test_data = pd.read_csv("/home/u1472278/ML/Project/project_data/new.bow.test.csv")
X_test, y_test = test_data.iloc[:, 1:].values, test_data.iloc[:, 0].values

# The anonymised evaluation file: its first column is skipped as well, only
# the features are used.
dev_data = pd.read_csv("/home/u1472278/ML/Project/project_data/data/bag-of-words/bow.eval.anon.csv")
X_dev = dev_data.iloc[:, 1:].values

# ---- Fit the boosted perceptron (10 rounds) ----
adaboost_perceptron = AdaBoostPerceptron(n_clf=10)
adaboost_perceptron.fit(X_train, y_train)

# ---- Accuracy on the held-out test split ----
y_pred_test = adaboost_perceptron.predict(X_test)
test_accuracy = np.mean(y_pred_test == y_test)
print("Test accuracy:", test_accuracy)

# ---- Label the evaluation examples and build the submission ----
y_pred_dev = adaboost_perceptron.predict(X_dev)
predicted_labels_int = y_pred_dev.astype(int)

# One example id per line, no header row in the ids file.
eval_ids = pd.read_csv("data/eval.ids", header=None, names=['example_id'])
predicted_labels_df = pd.DataFrame(predicted_labels_int, columns=['label'])

# Ids and predictions are paired side by side by row position.
submission_df = pd.concat([eval_ids['example_id'], predicted_labels_df], axis=1)
submission_df.to_csv('submission_adaboost_bow.csv', index=False)


Test accuracy: 0.6506666666666666


# tfidf

In [2]:
import pandas as pd
import numpy as np

# Implementing the Perceptron and AdaBoostPerceptron classes
class Perceptron:
    """Binary perceptron with optional per-sample weights.

    The update rule multiplies each label by the raw score, so labels are
    expected to be -1 / +1.

    Attributes:
        learning_rate: Step size applied to every mistake-driven update.
        random_state: Optional seed used for the weight initialisation.
        w: Weight vector, one entry per feature (``None`` until ``train``).
        b: Scalar bias (``None`` until ``train``).
    """

    def __init__(self, learning_rate=0.01, random_state=None):
        """Store the hyper-parameters; nothing is trained here.

        Args:
            learning_rate: Step size for each update.
            random_state: Seed for the weight initialisation. With the
                default ``None`` the global ``np.random`` state is used, so
                ``np.random.seed`` keeps controlling the result.
        """
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.w = None
        self.b = None

    def train(self, X, y, epochs=1, sample_weights=None):
        """Fit the weights from scratch with the perceptron rule.

        Every call re-initialises ``w`` and ``b`` to small random values in
        [-0.01, 0.01) and then sweeps the samples in order ``epochs`` times.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of -1 / +1 labels, one per row of ``X``.
            epochs: Number of full passes over the data.
            sample_weights: Optional per-sample factors that scale each
                update; defaults to 1 for every sample.

        Raises:
            ValueError: If ``y`` or ``sample_weights`` does not have exactly
                one entry per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if len(y) != n_samples:
            raise ValueError(
                f"y has {len(y)} entries but X has {n_samples} rows"
            )

        # Fall back to the global generator when no seed was given so that
        # existing callers relying on np.random.seed see no change.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        self.w = rng.uniform(-0.01, 0.01, size=n_features)
        self.b = rng.uniform(-0.01, 0.01)

        if sample_weights is None:
            sample_weights = np.ones(n_samples)
        else:
            sample_weights = np.asarray(sample_weights)
            if len(sample_weights) != n_samples:
                raise ValueError(
                    f"sample_weights has {len(sample_weights)} entries "
                    f"but X has {n_samples} rows"
                )

        for _ in range(epochs):
            for x_i, y_i, weight_i in zip(X, y, sample_weights):
                # A non-positive margin means the sample is misclassified
                # (or sits exactly on the boundary), so nudge towards it.
                if y_i * (np.dot(x_i, self.w) + self.b) <= 0:
                    step = self.learning_rate * weight_i * y_i
                    self.w += step * x_i
                    self.b += step

    def predict(self, X):
        """Return +1.0 / -1.0 for each row of ``X``.

        A score of exactly zero is mapped to +1.0 so that the result is
        always a valid label (``np.sign`` would yield 0 there).
        """
        scores = np.dot(X, self.w) + self.b
        return np.where(scores >= 0, 1.0, -1.0)

class AdaBoostPerceptron:
    """AdaBoost ensemble whose weak learners are single-epoch perceptrons.

    Labels are expected to be -1 / +1: the re-weighting step multiplies the
    labels by the weak learner's predictions.

    Attributes:
        n_clf: Maximum number of boosting rounds.
        clfs: Trained weak learners, in training order.
        clf_weights: Vote weight of each weak learner, aligned with ``clfs``.
    """

    def __init__(self, n_clf=10):
        """Create an empty ensemble that will run at most ``n_clf`` rounds."""
        self.n_clf = n_clf
        self.clfs = []
        self.clf_weights = []

    def fit(self, X, y):
        """Train the ensemble on ``X`` / ``y``, discarding any earlier fit.

        Each round trains ``Perceptron(learning_rate=1.0)`` for one epoch
        using the current sample distribution as per-sample weights, then
        re-weights the samples so that mistakes gain mass.

        Args:
            X: Feature matrix passed unchanged to the weak learners.
            y: 1-D array-like of -1 / +1 labels.
        """
        y = np.asarray(y)
        n_samples = len(y)

        # Start from a clean slate so a second fit() does not keep voting
        # with learners from the previous one.
        self.clfs = []
        self.clf_weights = []

        D = np.ones(n_samples) / n_samples
        # Largest error for which the log below stays finite.
        max_error = 1.0 - np.finfo(float).eps

        for _ in range(self.n_clf):
            clf = Perceptron(learning_rate=1.0)
            clf.train(X, y, epochs=1, sample_weights=D)
            pred = clf.predict(X)
            error = np.dot(D, (pred != y))

            if error <= 0:
                # Perfect on the weighted training set: keep it with unit
                # weight and stop, since alpha would be infinite.
                self.clfs.append(clf)
                self.clf_weights.append(1)
                break

            # An error of 1 (or slightly above it through rounding) would
            # give log(0) or log(<0) and fill D with inf/NaN.
            error = min(error, max_error)
            alpha = 0.5 * np.log((1 - error) / error)
            self.clfs.append(clf)
            self.clf_weights.append(alpha)

            # Shrink the mass of correctly classified samples, grow the
            # mass of mistakes, then renormalise to a distribution.
            D *= np.exp(-alpha * y * pred)
            D /= D.sum()

    def predict(self, X):
        """Return the weighted-majority vote (+1.0 / -1.0) for each row.

        An exact tie in the vote is resolved to +1.0 so that the output is
        always a valid label.

        Raises:
            RuntimeError: If the ensemble holds no weak learners yet.
        """
        if not self.clfs:
            raise RuntimeError(
                "AdaBoostPerceptron has no weak learners; call fit() first"
            )
        clf_preds = np.array([clf.predict(X) for clf in self.clfs])
        scores = np.dot(self.clf_weights, clf_preds)
        return np.where(scores >= 0, 1.0, -1.0)

# Load the tf-idf splits. In the labelled files column 0 is read as the label
# and every remaining column as a feature.
train_data = pd.read_csv("/home/u1472278/ML/Project/project_data/new.tfidf.train.csv")
X_train = train_data.iloc[:, 1:].values
y_train = train_data.iloc[:, 0].values

test_data = pd.read_csv("/home/u1472278/ML/Project/project_data/new.tfidf.test.csv")
X_test = test_data.iloc[:, 1:].values
y_test = test_data.iloc[:, 0].values

# Anonymised evaluation file: the first column is skipped here too and only
# the features are used.
dev_data = pd.read_csv("/home/u1472278/ML/Project/project_data/data/tfidf/tfidf.eval.anon.csv")
X_dev = dev_data.iloc[:, 1:].values

# Train AdaBoost Perceptron (at most 10 boosting rounds)
adaboost_perceptron = AdaBoostPerceptron(n_clf=10)
adaboost_perceptron.fit(X_train, y_train)

# Evaluate accuracy on the test set
y_pred_test = adaboost_perceptron.predict(X_test)
test_accuracy = np.mean(y_pred_test == y_test)
print("Test accuracy:", test_accuracy)

# Predict labels for the evaluation examples
y_pred_dev = adaboost_perceptron.predict(X_dev)
predicted_labels_int = y_pred_dev.astype(int)

predicted_labels_df = pd.DataFrame(predicted_labels_int, columns=['label'])
# The ids file has no header row: one example id per line.
eval_ids = pd.read_csv("data/eval.ids", header=None, names=['example_id'])

# Ids and predictions are paired side by side by row position.
submission_df = pd.concat([eval_ids['example_id'], predicted_labels_df], axis=1)

# Save under a tf-idf specific name; this cell previously wrote to
# 'submission_adaboost_bow.csv' and so overwrote the bag-of-words submission.
submission_df.to_csv('submission_adaboost_tfidf.csv', index=False)


Test accuracy: 0.5488888888888889


# TFIDF misc

In [1]:
import pandas as pd
import numpy as np

# Implementing the Perceptron and AdaBoostPerceptron classes
class Perceptron:
    """Binary perceptron with optional per-sample weights.

    The update rule multiplies each label by the raw score, so labels are
    expected to be -1 / +1.

    Attributes:
        learning_rate: Step size applied to every mistake-driven update.
        random_state: Optional seed used for the weight initialisation.
        w: Weight vector, one entry per feature (``None`` until ``train``).
        b: Scalar bias (``None`` until ``train``).
    """

    def __init__(self, learning_rate=0.01, random_state=None):
        """Store the hyper-parameters; nothing is trained here.

        Args:
            learning_rate: Step size for each update.
            random_state: Seed for the weight initialisation. With the
                default ``None`` the global ``np.random`` state is used, so
                ``np.random.seed`` keeps controlling the result.
        """
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.w = None
        self.b = None

    def train(self, X, y, epochs=1, sample_weights=None):
        """Fit the weights from scratch with the perceptron rule.

        Every call re-initialises ``w`` and ``b`` to small random values in
        [-0.01, 0.01) and then sweeps the samples in order ``epochs`` times.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of -1 / +1 labels, one per row of ``X``.
            epochs: Number of full passes over the data.
            sample_weights: Optional per-sample factors that scale each
                update; defaults to 1 for every sample.

        Raises:
            ValueError: If ``y`` or ``sample_weights`` does not have exactly
                one entry per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if len(y) != n_samples:
            raise ValueError(
                f"y has {len(y)} entries but X has {n_samples} rows"
            )

        # Fall back to the global generator when no seed was given so that
        # existing callers relying on np.random.seed see no change.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        self.w = rng.uniform(-0.01, 0.01, size=n_features)
        self.b = rng.uniform(-0.01, 0.01)

        if sample_weights is None:
            sample_weights = np.ones(n_samples)
        else:
            sample_weights = np.asarray(sample_weights)
            if len(sample_weights) != n_samples:
                raise ValueError(
                    f"sample_weights has {len(sample_weights)} entries "
                    f"but X has {n_samples} rows"
                )

        for _ in range(epochs):
            for x_i, y_i, weight_i in zip(X, y, sample_weights):
                # A non-positive margin means the sample is misclassified
                # (or sits exactly on the boundary), so nudge towards it.
                if y_i * (np.dot(x_i, self.w) + self.b) <= 0:
                    step = self.learning_rate * weight_i * y_i
                    self.w += step * x_i
                    self.b += step

    def predict(self, X):
        """Return +1.0 / -1.0 for each row of ``X``.

        A score of exactly zero is mapped to +1.0 so that the result is
        always a valid label (``np.sign`` would yield 0 there).
        """
        scores = np.dot(X, self.w) + self.b
        return np.where(scores >= 0, 1.0, -1.0)

class AdaBoostPerceptron:
    """AdaBoost ensemble whose weak learners are single-epoch perceptrons.

    Labels are expected to be -1 / +1: the re-weighting step multiplies the
    labels by the weak learner's predictions.

    Attributes:
        n_clf: Maximum number of boosting rounds.
        clfs: Trained weak learners, in training order.
        clf_weights: Vote weight of each weak learner, aligned with ``clfs``.
    """

    def __init__(self, n_clf=10):
        """Create an empty ensemble that will run at most ``n_clf`` rounds."""
        self.n_clf = n_clf
        self.clfs = []
        self.clf_weights = []

    def fit(self, X, y):
        """Train the ensemble on ``X`` / ``y``, discarding any earlier fit.

        Each round trains ``Perceptron(learning_rate=1.0)`` for one epoch
        using the current sample distribution as per-sample weights, then
        re-weights the samples so that mistakes gain mass.

        Args:
            X: Feature matrix passed unchanged to the weak learners.
            y: 1-D array-like of -1 / +1 labels.
        """
        y = np.asarray(y)
        n_samples = len(y)

        # Start from a clean slate so a second fit() does not keep voting
        # with learners from the previous one.
        self.clfs = []
        self.clf_weights = []

        D = np.ones(n_samples) / n_samples
        # Largest error for which the log below stays finite.
        max_error = 1.0 - np.finfo(float).eps

        for _ in range(self.n_clf):
            clf = Perceptron(learning_rate=1.0)
            clf.train(X, y, epochs=1, sample_weights=D)
            pred = clf.predict(X)
            error = np.dot(D, (pred != y))

            if error <= 0:
                # Perfect on the weighted training set: keep it with unit
                # weight and stop, since alpha would be infinite.
                self.clfs.append(clf)
                self.clf_weights.append(1)
                break

            # An error of 1 (or slightly above it through rounding) would
            # give log(0) or log(<0) and fill D with inf/NaN.
            error = min(error, max_error)
            alpha = 0.5 * np.log((1 - error) / error)
            self.clfs.append(clf)
            self.clf_weights.append(alpha)

            # Shrink the mass of correctly classified samples, grow the
            # mass of mistakes, then renormalise to a distribution.
            D *= np.exp(-alpha * y * pred)
            D /= D.sum()

    def predict(self, X):
        """Return the weighted-majority vote (+1.0 / -1.0) for each row.

        An exact tie in the vote is resolved to +1.0 so that the output is
        always a valid label.

        Raises:
            RuntimeError: If the ensemble holds no weak learners yet.
        """
        if not self.clfs:
            raise RuntimeError(
                "AdaBoostPerceptron has no weak learners; call fit() first"
            )
        clf_preds = np.array([clf.predict(X) for clf in self.clfs])
        scores = np.dot(self.clf_weights, clf_preds)
        return np.where(scores >= 0, 1.0, -1.0)

# ---- Load the tf-idf + misc splits ----
# Labelled files: column 0 is taken as the label, the rest as features.
train_data = pd.read_csv("/home/u1472278/ML/Project/project_data/tfidf_misc_train.csv")
y_train = train_data.iloc[:, 0].values
X_train = train_data.iloc[:, 1:].values

test_data = pd.read_csv("/home/u1472278/ML/Project/project_data/tfidf_misc_test.csv")
y_test = test_data.iloc[:, 0].values
X_test = test_data.iloc[:, 1:].values

# Evaluation file: only the columns after the first are used.
dev_data = pd.read_csv("/home/u1472278/ML/Project/project_data/tfidf_misc_eval.csv")
X_dev = dev_data.iloc[:, 1:].values

# ---- Fit the boosted perceptron (10 rounds) ----
adaboost_perceptron = AdaBoostPerceptron(n_clf=10)
adaboost_perceptron.fit(X_train, y_train)

# ---- Score the held-out test split ----
y_pred_test = adaboost_perceptron.predict(X_test)
test_accuracy = np.mean(y_pred_test == y_test)
print("Test accuracy:", test_accuracy)

# ---- Predict the evaluation examples ----
y_pred_dev = adaboost_perceptron.predict(X_dev)
predicted_labels_int = y_pred_dev.astype(int)

# ---- Assemble and write the submission ----
# The ids file has no header row, so the single column is named here.
eval_ids = pd.read_csv("data/eval.ids", header=None, names=['example_id'])
predicted_labels_df = pd.DataFrame(predicted_labels_int, columns=['label'])

# Ids and predicted labels are placed side by side, matched by row position.
submission_df = pd.concat([eval_ids['example_id'], predicted_labels_df], axis=1)
submission_df.to_csv('submission_adaboost_tfidf_misc.csv', index=False)


Test accuracy: 0.4875555555555556


# glove + misc

In [3]:
import pandas as pd
import numpy as np

# Implementing the Perceptron and AdaBoostPerceptron classes
class Perceptron:
    """Binary perceptron with optional per-sample weights.

    The update rule multiplies each label by the raw score, so labels are
    expected to be -1 / +1.

    Attributes:
        learning_rate: Step size applied to every mistake-driven update.
        random_state: Optional seed used for the weight initialisation.
        w: Weight vector, one entry per feature (``None`` until ``train``).
        b: Scalar bias (``None`` until ``train``).
    """

    def __init__(self, learning_rate=0.01, random_state=None):
        """Store the hyper-parameters; nothing is trained here.

        Args:
            learning_rate: Step size for each update.
            random_state: Seed for the weight initialisation. With the
                default ``None`` the global ``np.random`` state is used, so
                ``np.random.seed`` keeps controlling the result.
        """
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.w = None
        self.b = None

    def train(self, X, y, epochs=1, sample_weights=None):
        """Fit the weights from scratch with the perceptron rule.

        Every call re-initialises ``w`` and ``b`` to small random values in
        [-0.01, 0.01) and then sweeps the samples in order ``epochs`` times.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of -1 / +1 labels, one per row of ``X``.
            epochs: Number of full passes over the data.
            sample_weights: Optional per-sample factors that scale each
                update; defaults to 1 for every sample.

        Raises:
            ValueError: If ``y`` or ``sample_weights`` does not have exactly
                one entry per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if len(y) != n_samples:
            raise ValueError(
                f"y has {len(y)} entries but X has {n_samples} rows"
            )

        # Fall back to the global generator when no seed was given so that
        # existing callers relying on np.random.seed see no change.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        self.w = rng.uniform(-0.01, 0.01, size=n_features)
        self.b = rng.uniform(-0.01, 0.01)

        if sample_weights is None:
            sample_weights = np.ones(n_samples)
        else:
            sample_weights = np.asarray(sample_weights)
            if len(sample_weights) != n_samples:
                raise ValueError(
                    f"sample_weights has {len(sample_weights)} entries "
                    f"but X has {n_samples} rows"
                )

        for _ in range(epochs):
            for x_i, y_i, weight_i in zip(X, y, sample_weights):
                # A non-positive margin means the sample is misclassified
                # (or sits exactly on the boundary), so nudge towards it.
                if y_i * (np.dot(x_i, self.w) + self.b) <= 0:
                    step = self.learning_rate * weight_i * y_i
                    self.w += step * x_i
                    self.b += step

    def predict(self, X):
        """Return +1.0 / -1.0 for each row of ``X``.

        A score of exactly zero is mapped to +1.0 so that the result is
        always a valid label (``np.sign`` would yield 0 there).
        """
        scores = np.dot(X, self.w) + self.b
        return np.where(scores >= 0, 1.0, -1.0)

class AdaBoostPerceptron:
    """AdaBoost ensemble whose weak learners are single-epoch perceptrons.

    Labels are expected to be -1 / +1: the re-weighting step multiplies the
    labels by the weak learner's predictions.

    Attributes:
        n_clf: Maximum number of boosting rounds.
        clfs: Trained weak learners, in training order.
        clf_weights: Vote weight of each weak learner, aligned with ``clfs``.
    """

    def __init__(self, n_clf=10):
        """Create an empty ensemble that will run at most ``n_clf`` rounds."""
        self.n_clf = n_clf
        self.clfs = []
        self.clf_weights = []

    def fit(self, X, y):
        """Train the ensemble on ``X`` / ``y``, discarding any earlier fit.

        Each round trains ``Perceptron(learning_rate=1.0)`` for one epoch
        using the current sample distribution as per-sample weights, then
        re-weights the samples so that mistakes gain mass.

        Args:
            X: Feature matrix passed unchanged to the weak learners.
            y: 1-D array-like of -1 / +1 labels.
        """
        y = np.asarray(y)
        n_samples = len(y)

        # Start from a clean slate so a second fit() does not keep voting
        # with learners from the previous one.
        self.clfs = []
        self.clf_weights = []

        D = np.ones(n_samples) / n_samples
        # Largest error for which the log below stays finite.
        max_error = 1.0 - np.finfo(float).eps

        for _ in range(self.n_clf):
            clf = Perceptron(learning_rate=1.0)
            clf.train(X, y, epochs=1, sample_weights=D)
            pred = clf.predict(X)
            error = np.dot(D, (pred != y))

            if error <= 0:
                # Perfect on the weighted training set: keep it with unit
                # weight and stop, since alpha would be infinite.
                self.clfs.append(clf)
                self.clf_weights.append(1)
                break

            # An error of 1 (or slightly above it through rounding) would
            # give log(0) or log(<0) and fill D with inf/NaN.
            error = min(error, max_error)
            alpha = 0.5 * np.log((1 - error) / error)
            self.clfs.append(clf)
            self.clf_weights.append(alpha)

            # Shrink the mass of correctly classified samples, grow the
            # mass of mistakes, then renormalise to a distribution.
            D *= np.exp(-alpha * y * pred)
            D /= D.sum()

    def predict(self, X):
        """Return the weighted-majority vote (+1.0 / -1.0) for each row.

        An exact tie in the vote is resolved to +1.0 so that the output is
        always a valid label.

        Raises:
            RuntimeError: If the ensemble holds no weak learners yet.
        """
        if not self.clfs:
            raise RuntimeError(
                "AdaBoostPerceptron has no weak learners; call fit() first"
            )
        clf_preds = np.array([clf.predict(X) for clf in self.clfs])
        scores = np.dot(self.clf_weights, clf_preds)
        return np.where(scores >= 0, 1.0, -1.0)

# ---- Load the GloVe + misc splits ----
# For the labelled files, column 0 becomes the label vector and the
# remaining columns become the feature matrix.
train_data = pd.read_csv("/home/u1472278/ML/Project/project_data/glove_misc_train.csv")
test_data = pd.read_csv("/home/u1472278/ML/Project/project_data/glove_misc_test.csv")
dev_data = pd.read_csv("/home/u1472278/ML/Project/project_data/glove_misc_eval.csv")

X_train, y_train = train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values
X_test, y_test = test_data.iloc[:, 1:].values, test_data.iloc[:, 0].values
# Evaluation file: the first column is skipped, only the features are kept.
X_dev = dev_data.iloc[:, 1:].values

# ---- Fit the boosted perceptron (10 rounds) ----
adaboost_perceptron = AdaBoostPerceptron(n_clf=10)
adaboost_perceptron.fit(X_train, y_train)

# ---- Report accuracy on the held-out test split ----
y_pred_test = adaboost_perceptron.predict(X_test)
test_accuracy = np.mean(y_pred_test == y_test)
print("Test accuracy:", test_accuracy)

# ---- Predict the evaluation examples as integer labels ----
y_pred_dev = adaboost_perceptron.predict(X_dev)
predicted_labels_int = y_pred_dev.astype(int)

# ---- Build and save the submission ----
# The ids file carries no header; its only column is named on load.
eval_ids = pd.read_csv("data/eval.ids", header=None, names=['example_id'])
predicted_labels_df = pd.DataFrame(predicted_labels_int, columns=['label'])

# Column-wise concat pairs each id with the prediction at the same row index.
submission_df = pd.concat([eval_ids['example_id'], predicted_labels_df], axis=1)
submission_df.to_csv('submission_adaboost_glove_misc.csv', index=False)


Test accuracy: 0.6222222222222222
