In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
from sklearn import linear_model
import numpy as np

In [2]:
import homework3

In [3]:
def readGz(path):
    """Yield one evaluated record per line of a gzip-compressed text file.

    Args:
        path: location of the gzip file; it is opened in text mode.

    Yields:
        The Python object obtained by evaluating each line of the file.
    """
    # The context manager guarantees the file handle is released once the
    # generator is exhausted or closed; previously it was never closed.
    with gzip.open(path, 'rt') as f:
        for l in f:
            # SECURITY NOTE: eval() executes whatever expression the line
            # contains, so this must only be used on trusted data files.
            # If every line is a plain literal, ast.literal_eval would be a
            # safer drop-in replacement.
            yield eval(l)

In [4]:
def readCSV(path):
    """Yield (user, book, rating) tuples from a gzip-compressed CSV file.

    The first line is treated as a header and discarded. Every other
    non-blank line must hold exactly three comma-separated fields; the third
    one is converted to int.

    Args:
        path: location of the gzip file; it is opened in text mode.

    Yields:
        Tuples (u, b, r) where u and b are strings and r is an int.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields or
            its third field is not an integer.
    """
    # Use a context manager so the handle is closed when the generator
    # finishes or is closed; previously the file was left open.
    with gzip.open(path, 'rt') as f:
        f.readline()  # skip the header row
        for l in f:
            l = l.strip()
            if not l:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no record; skip them instead of failing to unpack.
                continue
            u, b, r = l.split(',')
            yield u, b, int(r)

In [5]:
def countRight(a,b,epsilon):
    """Return the fraction of entries of `a` that lie within `epsilon` of `b`.

    Both inputs are converted to NumPy arrays and flattened before being
    compared element by element with a strict `<` test.

    Args:
        a: submitted values (sequence, possibly nested).
        b: reference values (sequence, possibly nested).
        epsilon: tolerance for the absolute difference.

    Returns:
        A float in [0, 1], or 0 (after printing a message) when the two
        inputs have different lengths.
    """
    n_got, n_expected = len(a), len(b)
    if n_got != n_expected:
        message = "It looks like your solution has the wrong length (got " + str(n_got) + ", expected " + str(n_expected) + ")"
        print(message)
        return 0

    diffs = np.abs(np.ravel(np.array(a)) - np.ravel(np.array(b)))
    hits = diffs < epsilon
    return float(np.count_nonzero(hits) / hits.size)

In [6]:
# Some data structures that will be useful

In [7]:
# Materialize every (user, book, rating) tuple from the training interactions file.
allRatings = list(readCSV("./../datasets/assignment1/train_Interactions.csv.gz"))

In [8]:
# Sanity check: number of interactions loaded from the training file.
len(allRatings)

200000

In [9]:
# The first 190,000 interactions are used for training; the rest are held out
# for validation.
ratingsTrain, ratingsValid = allRatings[:190000], allRatings[190000:]

# Index the training interactions both ways:
#   user -> [(book, rating), ...] and book -> [(user, rating), ...]
ratingsPerUser, ratingsPerItem = defaultdict(list), defaultdict(list)
for u, b, r in ratingsTrain:
    ratingsPerItem[b].append((u, r))
    ratingsPerUser[u].append((b, r))

In [10]:
##################################################
# Rating prediction                              #
##################################################

In [11]:
# Ratings only (third field of each training tuple), passed to
# homework3.getGlobalAverage to obtain the global average.
trainRatings = [rating for _, _, rating in ratingsTrain]
globalAverage = homework3.getGlobalAverage(trainRatings)

In [12]:
def testQ1():
    ga = homework3.getGlobalAverage(trainRatings)

    trivialValidMSE = homework3.trivialValidMSE(ratingsValid, ga)
    
    print("average = " + str(ga))
    print("validation MSE = " + str(trivialValidMSE))

In [13]:
# Run the Q1 check; it prints the global average and the trivial validation MSE.
testQ1()

average = 3.6868052631578947
validation MSE = 1.680211317922438


In [14]:
def iterateN(which, alpha, betaU, betaI, lamb, N):
    """Run N rounds of alternating parameter updates and print progress.

    Each round calls which.alphaUpdate, which.betaUUpdate and
    which.betaIUpdate in that order, always passing the most recent parameter
    values, and then prints the two numbers returned by which.msePlusReg.
    The training data is taken from the notebook-level ratingsTrain,
    ratingsPerUser and ratingsPerItem.

    Args:
        which: object (e.g. the homework3 module) providing alphaUpdate,
            betaUUpdate, betaIUpdate and msePlusReg.
        alpha: starting value passed to the first alphaUpdate call.
        betaU: starting per-user parameters.
        betaI: starting per-item parameters.
        lamb: value forwarded unchanged to every call (presumably the
            regularization strength; confirm against homework3).
        N: number of rounds to run.

    Returns:
        (alpha, betaU, betaI, mse, mseReg) after the last round. When N is 0
        or negative no round runs: the parameters come back untouched and
        mse / mseReg are None.
    """
    # Defined up front so that N <= 0 returns cleanly instead of raising
    # UnboundLocalError at the return statement.
    mse = mseReg = None
    for i in range(N):
        alpha = which.alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb)
        betaU = which.betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb)
        betaI = which.betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb)
        mse, mseReg = which.msePlusReg(ratingsTrain, alpha, betaU, betaI, lamb)
        print("Iteration " + str(i + 1))
        print("  MSE = " + str(mse))
        print("  regularized objective = " + str(mseReg))
    return alpha, betaU, betaI, mse, mseReg

In [15]:
def testModel(which):
    betaU = {}
    betaI = {}
    for u in ratingsPerUser:
        betaU[u] = 0

    for b in ratingsPerItem:
        betaI[b] = 0

    alpha = globalAverage # Could initialize anywhere, this is a guess
    
    alpha, betaU, betaI, mse, mseReg = iterateN(which, alpha, betaU, betaI, 1.0, 1)
    validMSE = which.validMSE(ratingsValid, alpha, betaU, betaI)
    
    return alpha, betaU, betaI, mse, mseReg, validMSE

In [16]:
def testQ2():
    alpha, betaU, betaI, mse, mseReg, validMSE = testModel(homework3)
    print("validMSE = " + str(validMSE))

In [17]:
# Run the Q2 check; it prints the per-iteration training numbers and the validation MSE.
testQ2()

Iteration 1
  MSE = 1.0727154704480888
  regularized objective = 13249.82842908561
validMSE = 1.440670105511255


In [18]:
def testQ3():
    betaU = {}
    betaI = {}
    for u in ratingsPerUser:
        betaU[u] = 0

    for b in ratingsPerItem:
        betaI[b] = 0

    alpha = globalAverage # Could initialize anywhere, this is a guess
    
    alpha, betaU, betaI = homework3.goodModel(ratingsTrain, ratingsPerUser, ratingsPerItem, alpha, betaU, betaI)
    validMSE = homework3.validMSE(ratingsValid, alpha, betaU, betaI)
    
    print("validMSE = " + str(validMSE))

In [19]:
# Run the Q3 check; it prints the validation MSE of homework3.goodModel.
testQ3()

validMSE = 1.43487779932601


In [20]:
##################################################
# Read prediction                                #
##################################################

In [21]:
# From baseline code
# Count how often each book appears in the training interactions, and how
# many interactions there are in total.
bookCount = defaultdict(int)
totalRead = 0

for user, book, _ in readCSV("./../datasets/assignment1/train_Interactions.csv.gz"):
    totalRead += 1
    bookCount[book] += 1

# (count, book) pairs ordered from most to least frequent.
mostPopular = sorted(((n, book_id) for book_id, n in bookCount.items()), reverse=True)

In [22]:
def testQ4():
    readValid, notRead = homework3.generateValidation(allRatings, ratingsValid)
    print("Should be equal: " + str((len(readValid), len(notRead), len(ratingsValid))))

In [23]:
# Run the Q4 check; it prints the sizes of readValid, notRead and ratingsValid.
testQ4()

Should be equal: (10000, 10000, 10000)


In [24]:
def testQ5():
    return1 = homework3.baseLineStrategy(mostPopular, totalRead)
    better = homework3.improvedStrategy(mostPopular, totalRead)
    
    readValid, notRead = homework3.generateValidation(allRatings, ratingsValid)
    
    correctA = homework3.evaluateStrategy(return1, readValid, notRead)
    correctB = homework3.evaluateStrategy(better, readValid, notRead)
    
    print("Accuracy (simple strategy) = " + str(correctA))
    print("Accuracy (better strategy) = " + str(correctB))

In [25]:
# Run the Q5 check; it prints the accuracy of the simple and the better strategy.
testQ5()

Accuracy (simple strategy) = 0.7115
Accuracy (better strategy) = 0.7375


In [26]:
def testQ6():
    readValid, notRead = homework3.generateValidation(allRatings, ratingsValid)
    
    for (u,b) in list(readValid)[:20] + list(notRead)[:20]:
        a = homework3.jaccardThresh(u,b,ratingsPerItem,ratingsPerUser)
        print("Jaccard-based predictor for " + str((u,b)) + " = " + str(a))

    # This is slow (so the autograder doesn't run it) but you should run it at home once you have a good solution
    # homework3.writePredictionsRead(ratingsPerItem, ratingsPerUser)

In [27]:
# Run the Q6 check; it prints the Jaccard-based prediction for a sample of validation pairs.
testQ6()

Jaccard-based predictor for ('u14474538', 'b26940296') = 1
Jaccard-based predictor for ('u58019057', 'b42920311') = 1
Jaccard-based predictor for ('u41520063', 'b10032964') = 1
Jaccard-based predictor for ('u42166306', 'b12204510') = 0
Jaccard-based predictor for ('u33364550', 'b88805343') = 1
Jaccard-based predictor for ('u64867256', 'b62292824') = 1
Jaccard-based predictor for ('u14921259', 'b05819688') = 1
Jaccard-based predictor for ('u87799363', 'b86214899') = 0
Jaccard-based predictor for ('u75351019', 'b66664015') = 1
Jaccard-based predictor for ('u85235376', 'b87202282') = 0
Jaccard-based predictor for ('u36945000', 'b54695184') = 1
Jaccard-based predictor for ('u45431809', 'b08545824') = 1
Jaccard-based predictor for ('u59074406', 'b82853645') = 1
Jaccard-based predictor for ('u17931540', 'b00781753') = 1
Jaccard-based predictor for ('u16619308', 'b55847701') = 1
Jaccard-based predictor for ('u36649549', 'b78008834') = 1
Jaccard-based predictor for ('u00120164', 'b59770276') =

In [28]:
##################################################
# Category prediction                            #
##################################################

In [29]:
# Load only a prefix of the category training file to make things faster.
data = []

for d in readGz("../datasets/assignment1/train_Category.json.gz"):
    data.append(d)
    # The loop ends right after the 10,001st record has been stored.
    if len(data) == 10001:
        break

In [30]:
# Count how often each word occurs across all loaded reviews, after
# lower-casing the text and dropping punctuation characters.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join(c for c in d['review_text'].lower() if c not in punctuation)
    for w in r.split():
        wordCount[w] += 1

# (count, word) pairs ordered from most to least frequent.
counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)

In [31]:
NW = 500 # dictionary size (number of most frequent words kept)

In [32]:
# Vocabulary: the NW most frequent words, most frequent first.
words = [word for _, word in counts[:NW]]

In [33]:
# Map each vocabulary word to its position in `words`, plus a set for fast
# membership tests.
wordId = {word: position for position, word in enumerate(words)}
wordSet = set(wordId)

In [34]:
def testQ7():
    f1 = homework3.featureCat(data[0], words, wordId, wordSet)
    
    print("Feature vector = " + str(f1))

In [35]:
# Run the Q7 check; it prints the feature vector built for data[0].
testQ7()

Feature vector = [0, 2, 2, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [36]:
def testQ8(max_iter=1000):
    """Compare homework3.betterFeatures with the baseline features and write test predictions.

    Two logistic regression models (C=1) are fitted on the first 90% of the
    loaded reviews, one on homework3.featureCat features and one on
    homework3.betterFeatures, and each is scored on the remaining 10%. The
    model trained on the better features is then used to predict the test
    file, and the predictions are handed to
    homework3.writePredictionsCategory. Finally a message reports whether the
    baseline accuracy is below 99% of the better-feature accuracy.

    Args:
        max_iter: iteration limit given to both LogisticRegression fits. The
            scikit-learn default stopped early on this data ("TOTAL NO. of
            ITERATIONS REACHED LIMIT" warnings), so a larger limit is used.
    """
    y = [d['genreID'] for d in data]
    split = 9 * len(y) // 10
    ytrain = y[:split]
    # As an array so the comparison with the predictions is element-wise.
    yvalid = np.asarray(y[split:])

    def fitAndScore(X):
        """Fit on the first 90% of X; return (model, accuracy on the last 10%)."""
        cut = 9 * len(X) // 10
        mod = linear_model.LogisticRegression(C=1, max_iter=max_iter)
        mod.fit(X[:cut], ytrain)
        pred = mod.predict(X[cut:])
        return mod, float(np.mean(pred == yvalid))

    baselineX = [homework3.featureCat(d, words, wordId, wordSet) for d in data]
    _, correctA = fitAndScore(baselineX)
    mod, correctB = fitAndScore(homework3.betterFeatures(data))

    # "Better" requires the baseline to be below 99% of the new accuracy.
    sc = correctA < (correctB * 0.99)

    data_test = list(readGz("../datasets/assignment1/test_Category.json.gz"))

    Xtest = homework3.betterFeatures(data_test)
    pred_test = mod.predict(Xtest)

    homework3.writePredictionsCategory(pred_test)

    if sc:
        print("Looks like your solution is better")
    else:
        print("Looks like your solution is not better")

In [37]:
# Run the Q8 check; it writes the category predictions and prints whether the better features win.
testQ8()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Looks like your solution is better
