In [106]:
#!/usr/bin/env python3

# References:
# https://towardsdatascience.com/a-beginners-guide-to-linear-regression-in-python-with-scikit-learn-83a8f7ae2b4f
# https://www.kaggle.com/marknagelberg/rmsle-function


import csv
import math
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# os.chdir("/home/minion/Desktop/ML/HW3")


def process_data(filename, feature_map):
    """Binarize a comma-separated file into a feature matrix and a target vector.

    The first line of the file is treated as a header and skipped. On every
    other non-blank line the first field and the last field are excluded from
    the features; each remaining ``(position, raw value)`` pair that is a key
    of ``feature_map`` sets the mapped column of that row to 1. Pairs that are
    not in ``feature_map`` are ignored.

    Args:
        filename: Path of the comma-separated file to read.
        feature_map: Mapping from ``(field_index, value)`` to a column index,
            as returned by ``binarize``.

    Returns:
        A tuple ``(X, Y)`` of numpy arrays. ``X`` has one row per data line and
        ``len(feature_map)`` columns. ``Y`` holds ``int(last field)`` for every
        line whose last field consists only of digits, so it is empty when the
        file carries no numeric target column.
    """
    X, Y = [], []
    # The context manager closes the file even if parsing raises.
    with open(filename) as handle:
        next(handle, None)  # skip the header line
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                # A blank line would otherwise yield an all-zero row with no
                # target, leaving X and Y with different lengths.
                continue
            features = line.split(",")
            feat_vec = np.zeros(len(feature_map))
            for i, fv in enumerate(features[1:-1]):  # last one is target
                column = feature_map.get((i, fv))
                if column is not None:  # ignore unobserved features
                    feat_vec[column] = 1

            X.append(feat_vec)
            if features[-1].isdigit():
                Y.append(int(features[-1]))  # absent when the last field is not numeric

    return np.array(X), np.array(Y)


# A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    """Return the Root Mean Squared Logarithmic Error of two equal-length sequences.

    For every position the squared difference of ``log(value + 1)`` between
    ``y_pred`` and ``y`` is taken; the result is the square root of the mean
    of those squared differences.

    Args:
        y: Indexable sequence of reference values.
        y_pred: Indexable sequence of predicted values, same length as ``y``.

    Returns:
        The RMSLE as a float.
    """
    count = len(y)
    assert count == len(y_pred)
    squared_log_diffs = (
        (math.log(y_pred[k] + 1) - math.log(y[k] + 1)) ** 2.0
        for k in range(len(y_pred))
    )
    mean_squared = sum(squared_log_diffs) / count
    return mean_squared ** 0.5


def binarize(filename):
    """Build the one-hot column mapping for a comma-separated training file.

    The first line is treated as a header and skipped. On every other line the
    first and last fields are excluded; each distinct ``(position, raw value)``
    pair seen among the remaining fields is given its own column index, in
    order of first appearance by field and then by value. The number of
    columns is printed as ``dimensionality: N``.

    Args:
        filename: Path of the comma-separated file to scan.

    Returns:
        A tuple ``(feature_map, feature_remap)`` where ``feature_map`` maps
        ``(field_index, value)`` to a column index and ``feature_remap`` is
        the inverse mapping from column index to ``(field_index, value)``.
    """
    field_value_freqs = defaultdict(lambda: defaultdict(int))  # field_id -> value -> freq
    # The context manager closes the file even if parsing raises.
    with open(filename) as handle:
        next(handle, None)  # skip the header line
        for line in handle:
            fields = line.strip().split(",")[1:-1]  # exclude first field and target label
            for field_idx, value in enumerate(fields):
                field_value_freqs[field_idx][value] += 1

    feature_map = {}
    feature_remap = {}
    for field_idx, value_freqs in field_value_freqs.items():
        for value in value_freqs:
            column = len(feature_map)  # next free column index
            feature_map[field_idx, value] = column
            feature_remap[column] = field_idx, value
    print("dimensionality: %d" % len(feature_map))

    return feature_map, feature_remap

In [107]:
# _________________________ Start with my_train and my_dev for calibration.
print("\nExperiment with my_train and my_dev (for calibration)")
# Create feature map.
# The map is built from my_train.csv only; process_data ignores any
# (field, value) pair in my_dev.csv that is not a key of this map.
feature_map, feature_remap = binarize("my_train.csv")

# Binarize data.
train_X, train_Y = process_data("my_train.csv",feature_map)
dev_X, dev_Y = process_data("my_dev.csv",feature_map)

# Sanity checks on the encoding.
df = pd.DataFrame(dev_X)
# Show the (field index, raw value) pair that column 6 stands for.
print(feature_remap[6])
# Number of entries equal to 1 in the dev row at position 4; as the last
# expression of the cell, this value is what the notebook displays.
list(df.iloc[4,:]).count(1)
# train_X


Experiment with my_train and my_dev (for calibration)
dimensionality: 7227
(0, '120')


76

In [110]:
# Run linear regression.
# The `normalize=True` argument of LinearRegression was removed from
# scikit-learn, so the scaling is done by an explicit pipeline step instead.
# `with_mean=False` leaves centering to LinearRegression's own intercept fit.
lm = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
lm.fit(train_X, train_Y)

# Get predictions.
dev_predictions = lm.predict(dev_X)

# Score the dev predictions with the rmsle function by comparing against
# dev_Y; arguments follow the function's (y, y_pred) order.
print("rmsle", rmsle(dev_Y, dev_predictions))

rmsle 0.16951898652376277


In [97]:
# _________________________ After calibration, run regression on actual Kaggle data.
print("\nExperiment with train and test (for Kaggle submission)")

# Create feature map.
# Rebinds feature_map / feature_remap, this time built from train.csv.
feature_map, feature_remap = binarize("train.csv")

# Same number that binarize already printed as "dimensionality".
print(len(feature_map))
# Binarize data.
# Rebinds train_X / train_Y to the train.csv encoding.
train_X, train_Y = process_data("train.csv", feature_map)
# The target array returned for test.csv is discarded.
# NOTE(review): binding `_` at notebook top level may shadow IPython's
# last-output variable — confirm that is acceptable.
test_X, _ = process_data("test.csv", feature_map)


# Run linear regression.
# Rebinds lm to a LinearRegression with default settings.
lm = LinearRegression()
lm.fit(train_X, train_Y)

# Get predictions.
predictions = lm.predict(test_X)


Experiment with train and test (for Kaggle submission)
dimensionality: 7648
7648


In [98]:
# Once the Kaggle predictions are ready, write them along with the house ids
# to match the submission format.

# Collect the ids from the first column of test.csv. Lines whose first field
# is not all digits (for example a header line) are skipped.
house_ids = []
with open("test.csv") as test_file:
    for line in test_file:
        house_id = line.strip().split(",")[0]
        if house_id.isdigit():
            house_ids.append(int(house_id))

# zip() would silently drop the unmatched tail, producing a short or
# misaligned submission, so fail loudly instead.
if len(house_ids) != len(predictions):
    raise ValueError(
        "test.csv yielded %d house ids but there are %d predictions"
        % (len(house_ids), len(predictions))
    )

results = dict(zip(house_ids, predictions))
with open('predictions.csv', 'w') as csv_file:
    csv_file.write("Id,SalePrice\n")
    for key, value in results.items():
        # No space after the comma, matching the header row.
        csv_file.write(f"{key},{value}\n")


# Extra code, probably not needed now.
#     print(rmsle(dev_Y, predictions))
#     print(np.sqrt(np.mean((predictions - log_yDev) ** 2)))

#     from sklearn.metrics import mean_squared_error
#     from math import sqrt
#     rmse = sqrt(mean_squared_error(log_yDev, predictions))
#     print(rmse)

#     lm = LinearRegression()
#     lm.fit(xTrain, yTrain)

#     predictions = lm.predict(xDev)
#     print(np.sqrt(np.mean((np.log(predictions) - np.log(yDev)) ** 2)))
#     rmsle = sqrt(mean_squared_error(np.log(predictions), np.log(yDev)))
#     print(rmsle)

#     lm = LinearRegression()
#     lm.fit(xTrain, yTrain)
#     predictions = lm.predict(xTest)