# Dividing up the data

In [2]:
import numpy as np 
import matplotlib.pyplot as plt
import scipy.io
ind_file = open('FilmMissingIndices.txt', 'r')
with open('FilmMissingIndicesTrueValueUnicode.txt', newline='', encoding='utf16') as f:
    t_val_file = list(map(lambda x: float(x.strip()), f.readlines()))
mat = scipy.io.loadmat('FilmPositiveNegative.mat')
missing_indices = np.array([[int(s.split()[0]), int(s.split()[1])] for s in ind_file.readlines()])
data = mat['FilmPositiveNegative']
len(t_val_file)

65536

In [3]:
missing_indices.shape

(1350000, 2)

In [4]:
print(data.shape)
print(missing_indices.shape)

(users, films) = data.shape
(height, width) = (users, films)

(3000, 50)
(1350000, 2)


In [5]:
from sklearn import preprocessing 
sdata = preprocessing.scale(data)

### Rearrange data

In [6]:
from numpy import ndarray as nd 
new_l = []
for i in range(1, height+1):
    new_li = []
    for j in range(1, width+1):
        first = data[i-1][j-1] if data[i-1][j-1] > 0 else -data[i-1][j-1]
        snd   = 1 if data[i-1][j-1] > 0 else 0
        new_li.append((i, first, snd))
    new_l.append(new_li)
new_arr = np.array(new_l)  
print(new_arr.shape)
nnew_arr = nd.flatten(new_arr)
# nnew_arr.shape
ndata = nnew_arr.reshape(150000,3)
ndata.shape
# ndata=nnew_arr 
# ndata.shape

(3000, 50, 3)


(150000, 3)

This means that we have 3000 users and 50 films. We need to divide up this into training set, development set and test set. 
      * training set (70%) -- (2100,50)
      * development set (20%) -- (600, 50)
      * testing set (10%) -- (300, 50)

In [7]:
m_train = ndata[:105000]
m_dev   = ndata[105000:135000]
m_test  = ndata[135000:150000]

ind_train  = missing_indices[:105000]
tval_train = t_val_file[:105000]

ind_dev   = missing_indices[105000:135000]
tval_dev  = t_val_file[105000:135000]


ind_test  = missing_indices[135000:150000]
tval_test = t_val_file[135000:150000]

ind_unseen   = missing_indices[150000:]
tval_unseen = t_val_file[150000:]

(h, w)= m_train.shape
m_train[:,2]

array([1, 1, 1, ..., 0, 0, 0])

### Perform normalization

# Models

We will consider three models
  * K-nearest neighbor
  * The perceptron
  * Gradient descent with squared loss
For each of these models, we will choose a list of settings of hyperparameters. 

For each setting in the list, we will train our model
using that setting. We will compute error rate on development data, and after all settings are done,

We will choose one with smallest error rate on development data. We will then evaluate this model on test data to estimate future performance

### K-nearest neighbor

The only hyperparameter to tune in this case is the K. 

In [8]:
from sklearn import neighbors
hyper_params = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# calculate all the models for each param setting
models = []
for k in hyper_params:
    clf = neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform', algorithm='ball_tree')
    models.append(clf)

# train each model on training data
for m in models:
    m.fit(m_train[:, :2], m_train[:, 2])

It seems that if we do predict_proba, we can choose the last element in the list of results, which contains [p, 1- p] for each example, and so, we can just choose the first or second element of the list. This can then be the probability I calculate the squared error for, for each model and choose one with the least error. Another alternative would be to use the score function from sklearn, and choose the model that scores highest.

In [9]:
from sklearn.metrics import mean_squared_error

In [10]:
scores = []
main_model = [models[1], 0]
for m in models:
    main_model[1] = main_model[0].score(m_dev[:, :2], m_dev[:, 2])
    y_pred = m.predict_proba(m_dev[:, :2])[:, 1]
    y_true = m_dev[:, 2]
    s = mean_squared_error(y_true, y_pred, squared=False)
    scores.append(s)
    if s > main_model[1]:
        main_model = [m, s]
    
scores 
main_model[1]

0.5421893888055477

In [11]:
mm_model = main_model[0]
y_pred = m.predict_proba(m_test[:, :2])[:, 1]
y_true = m_test[:, 2]
s = mean_squared_error(y_true, y_pred, squared=False)
s

0.5425673537297774

### Perceptron

The hyperparameter in this case is the number of iterations until convergence. Moreover, we should shuffle per iteration of the algorithm

Moreover, we should probably consider two different version: regularized vs non-regularized perceptron

Maybe also consider not shuffling and see how that impacts its behavior

Similarly, early stopping 

In [12]:
class PerceptronHParam:
    def __init__(self, iterations, regular, shuffled, estop):
        self.iter     = iterations
        self.reg      = regular 
        self.shuffled = shuffled
        self.estop    = estop 

In [13]:
hyperparams_1000 = [PerceptronHParam(1000, True, True, True)
                   , PerceptronHParam(1000, False, True, True)
                   , PerceptronHParam(1000, False, True, False)
                   , PerceptronHParam(1000, False, False, False)
                   , PerceptronHParam(1000, True, False, True)
                   , PerceptronHParam(1000, True, False, False)]

In [14]:
from sklearn.linear_model import Perceptron 
def make_model(p, reg=None, alpha=None):
    clf = None
    if p.reg:
        if p.shuffled:
            if p.estop:
                clf = Perceptron(max_iter=p.iter, penalty=reg, alpha=alpha, early_stopping=True)
            else:
                clf = Perceptron(max_iter=p.iter, penalty=reg, alpha=alpha)
        else:
            clf = Perceptron(max_iter=p.iter, penalty=reg, alpha=alpha, early_stopping=p.estop, shuffle=False)
    else:
        clf = Perceptron(max_iter=p.iter, early_stopping=p.estop, shuffle=p.shuffled)
    return clf    

That is, given a set of hyperparameters, make_model will construct a model

In [15]:
h_iters = [1000, 1500, 2000]
def make_hparams_per_iter(iters):
    return [PerceptronHParam(iters, True, True, True)
                   , PerceptronHParam(iters, False, True, True)
                   , PerceptronHParam(iters, False, True, False)
                   , PerceptronHParam(iters, False, False, False)
                   , PerceptronHParam(iters, True, False, True)
                   , PerceptronHParam(iters, True, False, False)]
hparam_objs = [make_hparams_per_iter(it) for it in h_iters]

In [16]:
models = []
regs   = [None, 'l2','l1','elasticnet']
alphas = [0.0001, 0.001, 0.01]
for ob_l in hparam_objs:
    for p in ob_l: 
        for r in regs:
            for a in alphas:
                classifier = make_model(p, r, a)
                models.append(classifier)

Now that we've set up our hyperparameters, we can proceed to train, develop and test

In [92]:
for m in models:
    m.fit(m_train[:, :2], m_train[:, 2])   

In [128]:
scores = []
main_model = [models[1], 0]
for m in models:
    main_model[1] = main_model[0].score(m_dev[:, :2], m_dev[:, 2])
    y_pred = m.predict_proba(m_dev[:, :2])[:, 1]
    y_true = m_dev[:, 2]
    s = mean_squared_error(y_true, y_pred, squared=False)
    scores.append(s)
    if s > main_model[1]:
        main_model = [m, s]
    
scores 
main_model[1]

0.5268636762326032

In [129]:
mm_model = main_model[0]
y_pred = m.predict_proba(m_test[:, :2])[:, 1]
y_true = m_test[:, 2]
s = mean_squared_error(y_true, y_pred, squared=False)
s

0.5279659079902792

## Gradient Descent

Parameters to tune include the regularizer weight, and gradient descent step

In [95]:
from sklearn.linear_model import LogisticRegression

In [96]:
class LogRegHParam:
    def __init__(self, f_interp=True, reg=None, reg_weight=0):
        self.f_int          = f_interp
        self.regularizer    = reg 
        self.regular_weight = reg_weight


In [97]:
penalties = ['l1', 'l2', 'elasticnet']
interp = [True, False]
cs = [0.0001, 0.001, 0.01, 1]
h_params = []
for p in penalties:
    for c in cs:
        for i in interp:
            ob = LogRegHParam(i, p, c)
            h_params.append(ob)

In [98]:
models = [LogisticRegression(penalty=p.regularizer, fit_intercept=p.f_int, C=p.regular_weight,  solver='saga', max_iter=1000) for p in h_params]

In [99]:
models = [LogisticRegression(penalty='l2', fit_intercept=p.f_int, C=p.regular_weight) for p in h_params]

In [100]:
for m in models:
    m.fit(m_train[:, :2], m_train[:, 2]) 

In [130]:
scores = []
main_model = [models[1], 0]
for m in models:
    main_model[1] = main_model[0].score(m_dev[:, :2], m_dev[:, 2])
    y_pred = m.predict_proba(m_dev[:, :2])[:, 1]
    y_true = m_dev[:, 2]
    s = mean_squared_error(y_true, y_pred, squared=False)
    scores.append(s)
    if s > main_model[1]:
        main_model = [m, s]
    
scores 
main_model[1]

0.5268636762326032

In [131]:
mm_model = main_model[0]
y_pred = m.predict_proba(m_test[:, :2])[:, 1]
y_true = m_test[:, 2]
s = mean_squared_error(y_true, y_pred, squared=False)
s

0.5279659079902792

## Decision Trees

In [103]:
from sklearn import tree


In [104]:
depths_split = [2, 4, 8]
models = []
for d in depths_split:
    clf = tree.DecisionTreeClassifier(min_samples_split=d)
    models.append(clf)

In [105]:
for m in models:
    m.fit(m_train[:, :2], m_train[:, 2])

In [132]:
scores = []
main_model = [models[1], 0]
for m in models:
    main_model[1] = main_model[0].score(m_dev[:, :2], m_dev[:, 2])
    y_pred = m.predict_proba(m_dev[:, :2])[:, 1]
    y_true = m_dev[:, 2]
    s = mean_squared_error(y_true, y_pred, squared=False)
    scores.append(s)
    if s > main_model[1]:
        main_model = [m, s]
    
scores 
main_model[1]

0.5268636762326032

In [133]:
mm_model = main_model[0]
y_pred = m.predict_proba(m_test[:, :2])[:, 1]
y_true = m_test[:, 2]
s = mean_squared_error(y_true, y_pred, squared=False)
s

0.5279659079902792

## Support Vector Machines

In [108]:
from sklearn import svm 
clf = svm.SVC()
clf.fit(m_train[:, :2], m_train[:, 2])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [109]:
clf.score(m_test[:, :2], m_test[:, 2])

0.5444

In [110]:
linear_clf = svm.LinearSVC()
clf.fit(m_train[:, :2], m_train[:, 2])

KeyboardInterrupt: 

In [None]:
clf.score(m_test[:, :2], m_test[:, 2])

## Neural networks

In [None]:
from sklearn.neural_network import MLPClassifier 

In [None]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

In [None]:
clf.fit(m_train[:, :2], m_train[:, 2])