In [1]:
# imports
import re
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [134]:
%%bash
pip install missingno

Collecting missingno
  Downloading https://files.pythonhosted.org/packages/57/eb/9d7d55ceec57e0e374e70e9ad8d16795ba91960a3c987f3b5ee71d3e8e4d/missingno-0.4.1-py3-none-any.whl
Installing collected packages: missingno
Successfully installed missingno-0.4.1


twisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [135]:
from operator import add
from collections import defaultdict
import csv
from pandas.plotting import scatter_matrix
from scipy.stats import chi2_contingency
import missingno as msno

In [136]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
# store path to notebook
PWD = !pwd
PWD = PWD[0]

In [4]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "proj_notebook"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

In [7]:
# shuf is a linux commands that gets a random permutation of the lines of a file
!shuf -n 100000 data/train.txt > data/sample.txt

/bin/sh: line 1: 27590 Killed                  shuf -n 100000 data/train.txt > data/sample.txt


In [8]:
rawRDD = spark.read.csv("data/sample.txt", header=False, sep="\t").rdd
dataRDD = rawRDD.map(lambda row: ([None if el is None else int(el) for el in row[1:14]] + row[14:]), int(row[0])))

In [9]:
sample = dataRDD.takeSample(withReplacement=False, num=10000)

numeric_columns = np.array([pair[0][:13] for pair in sample], dtype=np.float)
numeric_df = pd.DataFrame(numeric_columns)

category_columns = np.array([pair[0][13:] for pair in sample])
category_df = pd.DataFrame(category_columns)

labels = np.array([pair[1] for pair in sample])

# Testing for Hashing Trick

In [21]:
# very simple hashing trick...
# takes array of features, whose values are a hex string
# converts hex string to numerical representation and reduce to N bits
# returns a new np.array of reduced feature space
def featureHash(data, N):
    reducedFeatures = [int(x,16) % (2**N) for x in data]
    return np.array(reducedFeatures)

In [26]:
# hash to 16 bits to reduce feature space
#hash_mod = [x % (2**16) for x in hash_test]
hash_mod = featureHash(category_df[0], 16)
#print(hash_mod)


In [137]:
# take a row of features and the label
# leave features 0-13 the same (already integers), covert and hash features 
def hashTest(features, label, N):
    newFeatures = list(features[0:13])
    for i in range(14, len(features)):
        if features[i] == None:
            newFeatures += list([None])
        else:
            newFeatures += list([int(features[i], 16) % 2**N])
    yield (newFeatures, label)

In [138]:
# convert each categorical column to used hashed data
#hashRDD = dataRDD.map(lambda row: hashRDD(row[0], row[1], 16))

#hashRDD = dataRDD.map(lambda r: (r[0], r[1]))

hashRDD = dataRDD.flatMap(lambda r: hashTest(r[0], r[1], 16))

print(hashRDD.take(10))

#print(dataRDD.take(10))

[([1, 1, 5, 0, 1382, 4, 15, 2, 181, 1, 2, None, 2, 27803, 24886, 9156, 15512, 52431, 38328, 46434, 58640, 21764, 40088, 49508, 42486, 59119, 45978, 25319, 30322, 5817, 52681, 10909, 6476, None, 7883, 1156, 13319, 56598], 0), ([2, 0, 44, 1, 102, 8, 2, 2, 4, 1, 1, None, 4, 36, 63461, 19671, 15512, 37605, 64704, 14452, 58640, 58875, 18163, 18918, 1417, 31222, 46541, 15201, 16580, 18032, 52681, 44522, 8734, None, 7883, 16011, 13319, 13909], 0), ([2, 0, 1, 14, 767, 89, 4, 2, 245, 1, 3, 3, 45, 40028, 39030, 57729, 15512, 52431, 1185, 14452, 58640, 58507, 24721, 500, 23087, 14991, 4333, 13400, 56959, 4493, None, None, 50278, 25323, 7883, 15452, None, None], 0), ([None, 893, None, None, 4392, None, 0, 0, 0, None, 0, None, None, 43334, 32360, 55030, 15512, 37605, 26779, 14452, 58640, 17211, 57236, 26570, 63206, 14991, 4644, 26639, 51023, 13570, None, None, 23718, None, 7883, 41802, None, None], 0), ([3, -1, None, 0, 2, 0, 3, 0, 0, 1, 1, None, 0, 41629, 35003, 61357, 15512, 35773, 41557, 14452, 

# Testing for "Homegrown" Logistic Regression

In [None]:
def GDUpdate_wReg(dataRDD, W, learningRate = 0.1, regType = None, regParam = 0.1):
    """
    Perform one gradient descent step/update with ridge or lasso regularization.
    Args:
        dataRDD - tuple of (features_array, y)
        W       - (array) model coefficients with bias at index 0
        learningRate - (float) defaults to 0.1
        regType - (str) 'ridge' or 'lasso', defaults to None
        regParam - (float) regularization term coefficient
    Returns:
        model   - (array) updated coefficients, bias still at index 0
    """
    # augmented data
    augmentedData = dataRDD.map(lambda x: (np.append([1.0], x[0]), x[1]))
    
    new_model = None
    #################### YOUR CODE HERE ###################
    # get the size of data
    count = augmentedData.count()
    
    # gradent descent
    grad = augmentedData.map(lambda x: 2.0*x[0].dot((W.dot(x[0]) - x[1]))) \
                        .reduce(lambda a,b: (a + b))
    grad /= count
    
    # ridge
    if regType == "ridge":
        grad[1:] += 2.0*regParam*W[1:]
    # lasso
    else:
        grad[1:] += regParam*np.sign(W[1:])

    # update the coefficients
    new_model = W - learningRate*grad
    
    ################## (END) YOUR CODE ####################
    return new_model

In [None]:
# part d - ridge/lasso gradient descent function
def GradientDescent_wReg(trainRDD, testRDD, wInit, nSteps = 20, learningRate = 0.1,
                         regType = None, regParam = 0.1, verbose = False):
    """
    Perform nSteps iterations of regularized gradient descent and 
    track loss on a test and train set. Return lists of
    test/train loss and the models themselves.
    """
    # initialize lists to track model performance
    train_history, test_history, model_history = [], [], []
    
    # perform n updates & compute test and train loss after each
    model = wInit
    for idx in range(nSteps):  
        # update the model
        model = GDUpdate_wReg(trainRDD, model, learningRate, regType, regParam)
        
        # keep track of test/train loss for plotting
        train_history.append(OLSLoss(trainRDD, model))
        test_history.append(OLSLoss(testRDD, model))
        model_history.append(model)
        
        # console output if desired
        if verbose:
            print("----------")
            print(f"STEP: {idx+1}")
            print(f"training loss: {training_loss}")
            print(f"test loss: {test_loss}")
            print(f"Model: {[round(w,3) for w in model]}")
    return train_history, test_history, model_history