# logreg_train.py

In [1]:
import MyLogisticRegression as LR
import pandas
import describe
import sys

In [2]:
def normalize_data(df):
    """Standardize every column of ``df`` to zero mean and unit spread.

    For each column the mean and standard deviation are read from the
    'mean' and 'std' rows of ``describe.describe(df, [column])`` and the
    column is replaced by ``(value - mean) / std``.

    Parameters:
        df: pandas DataFrame of numeric columns. It is modified in place.

    Returns:
        The same DataFrame object, with every column standardized.
    """
    for c in df.columns:
        describe_df = describe.describe(df, [c])
        # .iloc makes the "first value of the row" lookup explicitly
        # positional; plain [0] on a label-indexed Series is deprecated.
        m = describe_df.loc['mean'].iloc[0]
        std = describe_df.loc['std'].iloc[0]
        # A constant column (std == 0) or an undefined std (NaN) would turn
        # the whole column into NaN/inf or raise ZeroDivisionError; scaling
        # by 1 keeps such a column at its centred value instead.
        if pandas.isna(std) or std == 0:
            std = 1.0
        # Vectorized arithmetic instead of a per-element Python lambda.
        df[c] = (df[c] - m) / std
    return df

In [3]:
def prepare_data(dataset):
    """Build the training matrix and integer class targets from ``dataset``.

    Parameters:
        dataset: pandas DataFrame with a 'Hogwarts House' column. Every
            column from position 6 onwards is used as a feature
            (assumes the first six columns are non-feature metadata --
            TODO confirm against the CSV layout).

    Returns:
        A tuple ``(trainset, real_values, labels)`` where ``trainset`` is the
        normalized feature array (missing values replaced by 0 before
        normalization), ``real_values`` holds for each row the index of its
        house in ``labels``, and ``labels`` lists the distinct houses in
        order of first appearance.
    """
    house_column = dataset['Hogwarts House']
    labels = house_column.unique().tolist()
    # Derive the class indices from the labels actually present rather than
    # assuming there are exactly four of them.
    label_to_index = {label: index for index, label in enumerate(labels)}

    trainset_df = dataset[dataset.columns[6:]].fillna(0)
    trainset_df = normalize_data(trainset_df)
    trainset = trainset_df.to_numpy()

    # An explicit mapping yields integer targets directly, without relying on
    # replace() downcasting an object column to integers.
    real_values = house_column.map(label_to_index).to_numpy()
    return (trainset, real_values, labels)

In [7]:
if __name__ == "__main__":
    # Train the classifiers on the CSV given as first argument and write
    # their weights to "weights.txt", one classifier per line.
    # Only a missing argument is rejected: extra argv entries are tolerated
    # so the script still runs where argv carries additional items.
    if len(sys.argv) < 2:
        sys.exit("USAGE: logreg_train.py <CSV FILE>")
    dataset = pandas.read_csv(sys.argv[1])
    X, y, labels = prepare_data(dataset)
    lr = LR.mylogisticregression()
    lr.fit(X, y)
    lr.train(len(labels), iterations=1000, alpha=0.01, tolerance=0.0005)
    with open("weights.txt", "w") as f:
        # Each weight is followed by a single space and each classifier ends
        # with a newline, exactly as before; the writes are just batched.
        f.writelines(
            "".join(f"{weight} " for weight in classifier) + "\n"
            for classifier in lr.classifiers
        )

# logreg_predict.py

In [1]:
import pandas
import sys
import describe
import numpy as np
import MyLogisticRegression as LR

In [2]:
def normalize_data(df):
    """Standardize every column of ``df`` to zero mean and unit spread.

    For each column the mean and standard deviation are read from the
    'mean' and 'std' rows of ``describe.describe(df, [column])`` and the
    column is replaced by ``(value - mean) / std``.

    Parameters:
        df: pandas DataFrame of numeric columns. It is modified in place.

    Returns:
        The same DataFrame object, with every column standardized.
    """
    for c in df.columns:
        describe_df = describe.describe(df, [c])
        # .iloc makes the "first value of the row" lookup explicitly
        # positional; plain [0] on a label-indexed Series is deprecated.
        m = describe_df.loc['mean'].iloc[0]
        std = describe_df.loc['std'].iloc[0]
        # A constant column (std == 0) or an undefined std (NaN) would turn
        # the whole column into NaN/inf or raise ZeroDivisionError; scaling
        # by 1 keeps such a column at its centred value instead.
        if pandas.isna(std) or std == 0:
            std = 1.0
        # Vectorized arithmetic instead of a per-element Python lambda.
        df[c] = (df[c] - m) / std
    return df

In [3]:
def prepare_data(dataset):
    """Turn ``dataset`` into a design matrix with a leading column of ones.

    Every column from position 6 onwards is taken as a feature, missing
    values are replaced by 0 and the result is passed through
    ``normalize_data``.

    Returns:
        A tuple ``(X, labels)``: ``X`` has one row per dataset row, a first
        column filled with 1 and the normalized features after it;
        ``labels`` is the fixed list of house names.
    """
    # NOTE(review): this order presumably has to match the class indices
    # used when the weights were trained -- confirm against the trainer.
    labels = ['Ravenclaw', 'Slytherin', 'Gryffindor', 'Hufflepuff']

    feature_columns = dataset.columns[6:]
    features = normalize_data(dataset[feature_columns].fillna(0)).to_numpy()

    n_rows, n_features = features.shape
    X = np.empty(shape=(n_rows, n_features + 1))
    X[:, 0] = 1.0        # constant column placed in front of the features
    X[:, 1:] = features
    return (X, labels)

In [4]:
def get_theta_from_file(filename):
    """Load a weight matrix from a whitespace-separated text file.

    Each non-blank line of the file holds the weights of one classifier.
    The shape of the result is taken from the file itself, so the function
    does not depend on any module-level variable being defined beforehand.

    Parameters:
        filename: path of the weights file to read.

    Returns:
        A 2-D float numpy array with one row per non-blank line and one
        column per weight.

    Raises:
        ValueError: if the file contains no weights, if the lines do not all
            hold the same number of weights, or if a token is not a number.
    """
    with open(filename, "r") as f:
        # split() without arguments ignores the trailing space left at the
        # end of each line as well as the final newline.
        rows = [line.split() for line in f if line.strip()]
    if not rows:
        raise ValueError(f"no weights found in {filename}")
    width = len(rows[0])
    if any(len(row) != width for row in rows):
        raise ValueError(f"inconsistent number of weights per line in {filename}")
    return np.array(rows, dtype=float)

In [5]:
# Forces the CSV path argument to a fixed value instead of taking it from
# the command line.
# NOTE(review): this overwrites whatever the user passed and raises
# IndexError when sys.argv has fewer than two entries -- presumably a
# notebook-only convenience to drop for script use; confirm.
sys.argv[1] = "datasets/dataset_test.csv"

In [6]:
# Forces the weights-file argument to a fixed value instead of taking it
# from the command line.
# NOTE(review): this overwrites whatever the user passed and raises
# IndexError when sys.argv has fewer than three entries -- presumably a
# notebook-only convenience to drop for script use; confirm.
sys.argv[2] = "weights.txt"

In [9]:
if __name__ == "__main__":
    # Predict a house for every row of the CSV given as first argument,
    # using the weights file given as second argument, and save the result
    # to "houses.csv".
    if len(sys.argv) == 3:
        dataset = pandas.read_csv(sys.argv[1])
        X, labels = prepare_data(dataset)
        theta = get_theta_from_file(sys.argv[2])
        lr = LR.mylogisticregression()
        y = lr.predict(X, theta)
        # Translate each predicted class index into its house name.
        houses = pandas.DataFrame(
            {"Hogwarts House": [labels[i] for i in y]}
        )
        houses.to_csv("houses.csv", index=False)
    else:
        print("USAGE: logreg_predict.py <CSV FILE> <WEIGHTS FILE>")