In [13]:
import math
import json
import random
import numpy as np
import pandas as pd

In [14]:
# Load the three splits (train / test / eval). Each split has a GloVe feature
# file and a misc-attributes file.
glove_df_train = pd.read_csv("../project_data/data/glove/glove.train.csv")
misc_df_train = pd.read_csv("../project_data/data/misc/misc-attributes-train.csv")

glove_df_test = pd.read_csv("../project_data/data/glove/glove.test.csv")
misc_df_test = pd.read_csv("../project_data/data/misc/misc-attributes-test.csv")

glove_df_eval = pd.read_csv("../project_data/data/glove/glove.eval.anon.csv")
misc_df_eval = pd.read_csv("../project_data/data/misc/misc-attributes-eval.csv")

for misc_frame, glove_frame in (
    (misc_df_train, glove_df_train),
    (misc_df_test, glove_df_test),
    (misc_df_eval, glove_df_eval),
):
    # Copy the label column of the matching GloVe split into the misc frame.
    misc_frame["label"] = glove_frame["label"]
    # Append a constant-1 "bias" column as the last GloVe feature.
    glove_frame["bias"] = 1

# Last expression of the cell: display the eval frame.
glove_df_eval

Unnamed: 0,label,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x291,x292,x293,x294,x295,x296,x297,x298,x299,bias
0,1,-4.400295,4.717408,-7.981161,0.582802,6.156548,0.865143,3.980694,13.271554,-5.734209,...,-0.417537,2.444274,-4.106550,-8.098214,-2.555818,-1.243116,1.151501,-6.447711,3.859399,1
1,1,-4.358865,-2.167632,-7.009697,2.813710,13.745421,-1.438060,5.279384,16.380932,-13.599655,...,-3.715164,1.483463,-5.156900,-10.468790,0.202495,5.681094,-3.124561,-12.166863,4.527374,1
2,1,-5.584966,0.501010,-1.244940,2.081082,7.261350,-1.760596,-1.112437,9.394181,-6.677829,...,0.604660,2.254350,3.355893,-5.993471,4.496184,-1.573408,-4.589787,-1.782930,1.634312,1
3,1,-13.807071,-2.762292,-15.260910,3.593135,10.570365,-1.137067,4.011957,23.625479,-9.519619,...,2.428061,3.599397,-4.811395,-11.250913,5.724420,-0.567043,-1.668291,-9.897473,1.486454,1
4,1,-13.159473,-6.476247,-9.394270,-0.055009,17.871582,4.610767,5.522694,29.599262,-5.629232,...,-5.451225,-7.821697,2.730226,-16.286798,5.933157,6.767034,-0.118671,-10.951599,3.276476,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5245,1,-1.847650,-0.982201,-3.223992,0.838675,5.136714,-4.103040,-0.536969,11.362982,-4.183874,...,3.121930,1.301225,-5.088677,-7.643860,1.056108,2.345474,-0.817544,-3.914944,3.014992,1
5246,1,-3.905437,-2.460167,-3.065652,-2.345944,3.024367,-1.337712,0.920609,2.999982,-0.509687,...,-2.717637,-0.901527,0.322936,-3.523556,1.131708,3.314598,0.713785,-1.761670,-0.890852,1
5247,1,-0.558323,-1.632193,-1.525511,-1.881319,4.102118,-2.785541,4.119842,7.296921,1.344908,...,0.858594,-2.504808,0.119857,-6.171572,1.333121,3.166763,0.863815,-2.314376,2.651689,1
5248,1,-7.203439,-2.136733,-8.628859,-0.759335,5.817169,3.166624,2.425720,3.574980,-1.045218,...,2.184598,2.267007,-4.621272,-5.403078,1.864315,7.287250,-3.210342,-4.946033,9.009615,1


In [15]:
def get_max_key_by_value(map):
    """Return the key of ``map`` whose value is the largest.

    The comparison is strict, so on ties the key seen first (in iteration
    order) wins. If ``map`` is empty, or no value is greater than negative
    infinity, the empty string ``""`` is returned.
    """
    best_key, best_value = "", float("-inf")

    for candidate_key, candidate_value in map.items():
        # Strictly greater only: keeps the earliest key when values tie.
        if candidate_value > best_value:
            best_key, best_value = candidate_key, candidate_value

    return best_key

def initialize_weights_bias(rand_start, rand_end, feature_count):
    """Create an initial weight list and bias from a single random draw.

    One value is sampled with ``random.uniform(rand_start, rand_end)`` and
    reused for the bias and for every one of the ``feature_count`` weights.

    Returns:
        (weights, bias): ``weights`` is a list of length ``feature_count``
        and ``bias`` is a float; all of them hold the same sampled value.
    """
    shared_value = random.uniform(rand_start, rand_end)

    # Every weight deliberately shares the one sampled value with the bias.
    weights = [shared_value for _ in range(feature_count)]

    return weights, shared_value

def predict(example, weights):
    """Classify ``example`` by the sign of its dot product with ``weights``.

    Returns 1 when the dot product is strictly positive and -1 otherwise
    (a dot product of exactly zero is classified as -1).
    """
    score = np.dot(weights, example)
    if score > 0:
        return 1
    return -1

def test_accuracy(df, weights, store_eval=False):
    """Measure how often ``predict`` agrees with the labels stored in ``df``.

    Every row is read as ``[label, feature_0, feature_1, ...]``: the first
    column is the true label and the remaining columns are the feature vector
    handed to ``predict`` together with ``weights``.

    Args:
        df: DataFrame whose first column is the label.
        weights: Weight vector, one entry per feature column.
        store_eval: When true, the predicted label of every row is collected.

    Returns:
        (accuracy, predictions): the fraction of rows predicted correctly and
        the list of predictions (empty unless ``store_eval`` is true).

    Raises:
        ZeroDivisionError: if ``df`` has no rows.
    """
    row_count = df.shape[0]
    hits = 0
    predictions = []

    for _, record in df.iterrows():
        values = record.tolist()
        actual_label, features = values[0], values[1:]  # y, x

        guess = predict(features, weights)
        if store_eval:
            predictions.append(guess)

        if guess == actual_label:
            hits += 1

    return hits / row_count, predictions

def perceptron(df, learning_rate, weights):
    """Run one pass of the perceptron update rule over the rows of ``df``.

    Every row is read as ``[label, feature_0, feature_1, ...]``: the first
    column is the label ``y`` and the remaining columns form the feature
    vector ``x``, which must have as many entries as ``weights``.

    A row triggers the update ``w = w + r * y * x`` when ``y * (w . x) <= 0``.
    A margin of exactly zero counts as a mistake, so training can make
    progress from all-zero weights.

    Args:
        df: DataFrame of numeric rows, label first.
            Labels are presumably +1 / -1 (the values ``predict`` returns);
            confirm against the dataset.
        learning_rate: Step size ``r`` of the update.
        weights: Starting weight vector. It is copied, never modified, so the
            same starting point can be reused across several runs.

    Returns:
        (weights, update_count): the weights after the pass as a new list of
        floats, and the number of updates that were applied.
    """
    # Work on a private copy so the caller's list is left untouched.
    current = np.array(weights, dtype=float)
    update_count = 0

    for row in df.to_numpy(dtype=float):
        actual_label, example = row[0], row[1:]  # y, x

        if actual_label * np.dot(current, example) <= 0:
            update_count += 1
            # w = w + r * y * x
            current += learning_rate * actual_label * example

    return current.tolist(), update_count

In [16]:
rand_start = -0.01
rand_end = 0.01

# One weight per feature column: every column except the leading label,
# which includes the trailing constant "bias" column.
initial_weights, _ = initialize_weights_bias(
    rand_start=rand_start, rand_end=rand_end, feature_count=glove_df_train.shape[1] - 1
)

accuracy_dict = {}  # learning rate -> accuracy on the eval frame
prediction_list_dict = {}  # learning rate -> predicted labels on the eval frame
learning_rates = [1, 0.1, 0.01]

for learning_rate in learning_rates:
    # Give each run its own copy of the starting weights so that every
    # learning rate is trained from the same initialization.
    weights, _ = perceptron(
        df=glove_df_train, learning_rate=learning_rate, weights=list(initial_weights)
    )

    # store_eval=True is needed to get the per-example predictions back;
    # without it the returned list is empty and the CSV below has no rows.
    accuracy, prediction_list = test_accuracy(
        df=glove_df_eval, weights=weights, store_eval=True
    )
    # Key by learning rate: keying by accuracy makes runs with equal accuracy
    # overwrite each other and loses which learning rate produced the result.
    accuracy_dict[learning_rate] = accuracy
    prediction_list_dict[learning_rate] = prediction_list

# NOTE(review): the eval file is named "glove.eval.anon.csv"; if its labels are
# anonymised placeholders, this accuracy is not a meaningful selection
# criterion and the test split should be used instead - confirm.
best_learning_rate = get_max_key_by_value(accuracy_dict)

# NOTE(review): the message says "tree" although this cell evaluates the perceptron.
print(f"Accuracy of tree on eval dataset: ", accuracy_dict[best_learning_rate])

# Save the predictions of the best run (not simply the last one in the loop).
df = pd.DataFrame(prediction_list_dict[best_learning_rate])
df.to_csv("perceptron_glove_eval_dataset_prediction.csv", index=True, header=False)

Accuracy of tree on eval dataset:  1.0
