In [126]:
import json
import gzip
import math
from collections import defaultdict
import numpy as np
from sklearn import linear_model
import random
import statistics
from math import log2
from sklearn.metrics import confusion_matrix
from numpy.linalg import norm
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

In [2]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [3]:
answers = {}

In [4]:
z = gzip.open("train.json.gz")

In [5]:
dataset = []
for l in z:
    d = eval(l)
    dataset.append(d)

In [6]:
z.close()

In [7]:
### Question 1

In [8]:
def MSE(y, ypred):
    diff = [ (x-y)**2 for x,y in zip(y,ypred)]
    return sum(diff)/len(diff)

In [9]:
def MAE(y, ypred):
    return np.mean(np.abs(ypred - y))

In [10]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in dataset:
    u,i = d['userID'],d['gameID']
    reviewsPerUser[u].append(d)
    reviewsPerItem[i].append(d)
    
for u in reviewsPerUser:
    reviewsPerUser[u].sort(key=lambda x: x['date'])
    
for i in reviewsPerItem:
    reviewsPerItem[i].sort(key=lambda x: x['date'])

In [11]:
dataset[1]

{'userID': 'u70666506',
 'early_access': False,
 'hours': 63.5,
 'hours_transformed': 6.011227255423254,
 'found_funny': 1,
 'text': 'If you want to sit in queue for 10-20min and have 140 ping then this game is perfect for you :)',
 'gameID': 'g49368897',
 'user_id': '76561198030408772',
 'date': '2017-05-20'}

In [12]:
def feat1(d):
    hours = [d['hours']]
    feat = [1] + hours
    return feat 

In [13]:
X = [feat1(d) for d in dataset]
y = [len(d['text']) for d in dataset]

In [14]:
theta_1,residuals_1,rank_1,s_1 = np.linalg.lstsq(X,y)

  theta_1,residuals_1,rank_1,s_1 = np.linalg.lstsq(X,y)


In [15]:
theta_1 = theta_1[1]

In [16]:
mse_q1 = residuals_1[0]/ len(y)
print(mse_q1)

570936.2842458996


In [17]:
answers['Q1'] = [theta_1, mse_q1]

In [18]:
assertFloatList(answers['Q1'], 2)

In [19]:
### Question 2

In [20]:
def feat2(d):
    hours = [d['hours']]
    hoursmedian = [ 1 if i > np.median(hours) else 0 for i in hours ]
    feat = hours + [log2(i+1) for i in hours] + [i**(1/2) for i in hours] + hoursmedian
    return [1]+feat

In [21]:
X = [feat2(d) for d in dataset]

In [22]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [23]:
mse_q2 = MSE(y, predictions)

In [24]:
answers['Q2'] = mse_q2

In [25]:
assertFloat(answers['Q2'])

In [26]:
### Question 3

In [27]:
def feat3(d):
    hours = [d['hours']]
    h1    = [ 1 if i > 1 else 0 for i in hours ]
    h5    = [ 1 if i > 5 else 0 for i in hours ]
    h10   = [ 1 if i > 10 else 0 for i in hours ]
    h100  = [ 1 if i > 100 else 0 for i in hours ]
    h1000 = [ 1 if i > 1000 else 0 for i in hours ]
    
    
    feat = hours + h1 + h5 + h10 + h100 + h1000
    return [1] + feat 

In [28]:
X = [feat3(d) for d in dataset]

In [29]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [30]:
mse_q3 = MSE(y, predictions)

In [31]:
answers['Q3'] = mse_q3

In [32]:
assertFloat(answers['Q3'])

In [33]:
### Question 4

In [34]:
def feat4(d):
    length = [len(d['text'])]
    return [1]+ length

In [35]:
X = [feat4(d) for d in dataset]
y = [d['hours'] for d in dataset]

In [36]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [37]:
mse = MSE(y,predictions)
mae = MAE(y, predictions)

In [38]:
print(mse, mae)

75735.70018272995 90.35613031984947


In [39]:
answers['Q4'] = [mse, mae, "mae is more suitable since mse gets more affected with outliers and mae is more robust"]

In [40]:
assertFloatList(answers['Q4'][:2], 2)

In [41]:
### Question 5

In [42]:
y_trans = [math.log2(d['hours'] + 1) for d in dataset]

In [43]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)

In [44]:
mse_trans = MSE(y_trans, predictions_trans)# MSE using the transformed variable

In [45]:
print(mse_trans)

5.255254235328321


In [46]:
y = [(2**(y)- 1 ) for y in y_trans]

In [47]:
mod.fit(X,y)
predictions_untrans = mod.predict(X) # Undoing the transformation

In [48]:
mse_untrans = MSE(predictions_untrans, y)
print(mse_untrans)

75735.70018272997


In [49]:
answers['Q5'] = [mse_trans, mse_untrans]

In [50]:
assertFloatList(answers['Q5'], 2)

In [51]:
### Question 6

In [52]:
def feat6(d):
    hours = int(d['hours'])
    feat = [0] * 100
    if hours < 1:
        feat[0] = 1
    elif hours >= 99:
        feat[99] = 1
    else:
        feat[hours] = 1
    return [1] + feat   

In [53]:
X = [feat6(d) for d in dataset]
y = [len(d['text']) for d in dataset]

In [54]:
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [55]:
models = {}
mses = {}
bestC = None


for c in [1, 10, 100, 1000, 10000]:
    mod = linear_model.Ridge(alpha=c)
    mod.fit(Xtrain,ytrain)
    predictions = mod.predict(Xvalid)
    models[c] = mod
    mse = MSE(predictions, yvalid)
    mses[c]= mse
    print(mse)
    

581950.237335945
581883.0733828805
581546.2387398417
581432.8208480775
583607.563450728


In [56]:
bestC = min(mses, key=mses.get)
print(bestC)

1000


In [57]:
predictions = models[bestC].predict(Xvalid)
predictions_test = models[bestC].predict(Xtest)

In [58]:
mse_valid = MSE(yvalid, predictions)
print(mse_valid)

581432.8208480775


In [59]:
mse_test = MSE(ytest , predictions_test)
print(mse_test)

560786.7645482322


In [60]:
answers['Q6'] = [bestC, mse_valid, mse_test]

In [61]:
assertFloatList(answers['Q6'], 3)

In [62]:
### Question 7

In [63]:
times = [d['hours_transformed'] for d in dataset]
median = statistics.median(times)

In [64]:
notPlayed = [ 1 if d['hours'] < 1 else 0 for d in dataset ]
nNotPlayed = sum(notPlayed)

In [65]:
answers['Q7'] = [median, nNotPlayed]

In [66]:
assertFloatList(answers['Q7'], 2)

In [67]:
### Question 8

In [68]:
def feat8(d):
    length = [len(d['text'])]
    return [1] + length

In [69]:
X = [feat8(d) for d in dataset]
y = [d['hours_transformed'] > median for d in dataset]

In [70]:
mod = linear_model.LogisticRegression(class_weight='balanced')
mod.fit(X,y)
predictions = mod.predict(X) # Binary vector of predictions

In [71]:
def rates(predictions, y):
    TP = [a and b for (a,b) in zip(predictions,y)]
    TN = [not a and not b for (a,b) in zip(predictions,y)]
    FP = [a and not b for (a,b) in zip(predictions,y)]
    FN = [not a and b for (a,b) in zip(predictions,y)]

    TP = sum(TP)
    TN = sum(TN)
    FP = sum(FP)
    FN = sum(FN)
    
    return TP, TN, FP, FN

In [72]:
TP, TN, FP, FN = rates(predictions, y)

In [73]:
BER = 0.5 * (FP / (TN + FP) + FN / (FN + TP))
print(BER)

0.472506390561468


In [74]:
BER8 = BER

In [75]:
answers['Q8'] = [TP, TN, FP, FN, BER]

In [76]:
assertFloatList(answers['Q8'], 5)

In [77]:
### Question 9

In [78]:
prob = mod.predict_proba(X)
sortedprob = sorted([(p[1], y_val) for p, y_val in zip(prob, y)], key=lambda x: x[0], reverse=True)

In [79]:
#precision = 
#recall = 

In [80]:
precs = []
recs = []

for i in [5, 10, 100, 1000]:
    threshold = sortedprob[i - 1][0]
    last_tied = i
    while last_tied < len(sortedprob) and sortedprob[last_tied][0] == threshold:
        last_tied += 1
    
    k = last_tied
    top_pred = [1 if prob >= threshold else 0 for prob, _ in sortedprob[:k]]
    TP = sum(y for _,y in sortedprob[:k] if y == 1)
        
        
    if TP + (k - TP) ==0:
        precision = 0
    else :
        precision = (TP) / (TP + (k - TP))
    precs.append(precision)
precs

[0.5454545454545454, 0.5454545454545454, 0.67, 0.685]

In [81]:
answers['Q9'] = precs

In [82]:
assertFloatList(answers['Q9'], 4)

In [83]:
### Question 10

In [84]:
y_trans = [d['hours_transformed'] for d in dataset]

In [85]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)

In [86]:
threshold_min = min(y_trans)  
threshold_max = max(y_trans)  

num_thresholds = 100

In [87]:
best_threshold = None
min_BER = BER8  # Initialize with a large value

for threshold in np.linspace(threshold_min, threshold_max, num_thresholds):
    predictions_thresh = [y >= threshold for y in y_trans]
    TP, TN, FP, FN = rates(predictions_thresh, y)
    BER = 0.5 * (FP / (TN + FP) + FN / (FN + TP))


    if BER < min_BER:
        min_BER = BER
        best_threshold = threshold

In [88]:
print(best_threshold, min_BER)

3.538825786381682 0.011556284554151086


In [89]:
predictions_thresh =[d['hours_transformed'] > best_threshold for d in dataset] # Using a fixed threshold to make predictions

In [90]:
TP, TN, FP, FN = rates(predictions_thresh, y)

In [91]:
BER = 0.5 * (FP / (TN + FP) + FN / (FN + TP))
print(BER)

0.011556284554151086


In [93]:
your_threshold = best_threshold

In [94]:
answers['Q10'] = [your_threshold, BER]

In [95]:
assertFloatList(answers['Q10'], 2)

In [96]:
### Question 11

In [97]:
dataTrain = dataset[:int(len(dataset)*0.9)]
dataTest = dataset[int(len(dataset)*0.9):]

In [98]:
dataTrain[0]

{'hours': 0.3,
 'gameID': 'g35322304',
 'hours_transformed': 0.37851162325372983,
 'early_access': False,
 'date': '2015-04-08',
 'text': '+1',
 'userID': 'u55351001'}

In [99]:
userMedian = defaultdict(list)
itemMedian = defaultdict(list)

# Compute medians on training data
for d in dataTrain : 
    itemMedian[d['gameID']].append(d['hours'])
    userMedian[d['userID']].append(d['hours'])
    
for i in itemMedian : 
    itemMedian[i] = np.median(itemMedian[i])
    
for i in userMedian :
    userMedian[i] = np.median(userMedian[i])

In [100]:
itemMedian['g35322304']

0.5

In [101]:
userMedian['u55351001']

3.9

In [102]:
answers['Q11'] = [itemMedian['g35322304'], userMedian['u55351001']]

In [103]:
assertFloatList(answers['Q11'], 2)

In [104]:
### Question 12

In [105]:
globalMedian = [d['hours'] for d in dataset]
globalMedian = np.median(globalMedian)

In [106]:
def f12(u,i):
    if itemMedian[i] > globalMedian:
        r = 1
    elif itemMedian[i] == None and userMedian[i] > globalMedian:
        r = 1
    else:
        r = 0
    return [r]

In [107]:
preds = [f12(d['userID'], d['gameID']) for d in dataTest]
print(preds[:5])

[[0], [1], [0], [1], [0]]


In [108]:
y = [d['hours'] for d in dataTest]
y = [ 1 if i > globalMedian else 0 for i in y ]
print(y[:5])

[0, 0, 1, 0, 0]


In [127]:
#TP, TN, FP, FN = rates(preds, y)
accuracy = accuracy_score(preds,y)

In [128]:
print(accuracy)

0.7410857142857142


In [129]:
answers['Q12'] = accuracy

In [130]:
assertFloat(answers['Q12'])

In [113]:
### Question 13

In [114]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}

for d in dataset:
    user,item = d['userID'], d['gameID']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

In [115]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom
    

In [116]:
def mostSimilar(i, func, N):
    similarities = []
    users = usersPerItem[i]
    for i2 in usersPerItem:
        if i2 == i: continue
        sim = func(users, usersPerItem[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]
    

In [117]:
ms = mostSimilar(dataset[0]['gameID'], Jaccard, 10)

In [118]:
print(ms)


[(0.07988165680473373, 'g38617933'), (0.07547169811320754, 'g15217706'), (0.0657439446366782, 'g52947714'), (0.06451612903225806, 'g97650885'), (0.05588235294117647, 'g61264440'), (0.05405405405405406, 'g34481494'), (0.05121951219512195, 'g94300691'), (0.04966887417218543, 'g08001721'), (0.045346062052505964, 'g53996086'), (0.04390243902439024, 'g66220144')]


In [121]:
answers['Q13'] = [ms[0][0], ms[-1][0]]

In [122]:
assertFloatList(answers['Q13'], 2)

In [123]:
### Question 14

In [133]:
globalMedian = [d['hours'] for d in dataset]
globalMedian = np.median(globalMedian)

In [196]:
def mostSimilar14(i, func, N):
    similarities = []
    users = usersPerItem[i]
    for i2 in usersPerItem:
        if i2 == i: continue
        sim = func(users, usersPerItem[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

In [197]:
ratingDict

{('u55351001', 'g35322304'): [-1],
 ('u70666506', 'g49368897'): [1],
 ('u18612571', 'g73495588'): [-1],
 ('u34283088', 'g68047320'): [1],
 ('u16220374', 'g51234623'): [-1],
 ('u01499286', 'g25723374'): [-1],
 ('u73063505', 'g58025004'): [1],
 ('u29223775', 'g69033010'): [1],
 ('u44401308', 'g46446145'): [1],
 ('u45027672', 'g02903254'): [-1],
 ('u33908704', 'g66086214'): [-1],
 ('u27998358', 'g21544048'): [-1],
 ('u36214177', 'g86787099'): [-1],
 ('u73747744', 'g23131507'): [1],
 ('u97936673', 'g65055990'): [-1],
 ('u25365202', 'g45124396'): [1],
 ('u08631099', 'g74380807'): [1],
 ('u52584928', 'g46738138'): [1],
 ('u09520763', 'g65420497'): [-1],
 ('u04893836', 'g76381409'): [-1],
 ('u58381940', 'g03420056'): [-1],
 ('u85007552', 'g13492762'): [-1],
 ('u48369340', 'g35678007'): [1],
 ('u79530461', 'g13492762'): [-1],
 ('u44157494', 'g28405264'): [-1],
 ('u70118164', 'g60903743'): [-1],
 ('u21352780', 'g92017077'): [1],
 ('u74354158', 'g98792097'): [1],
 ('u67890036', 'g11225866'): [-1

In [202]:
ratingDict = {}

for d in dataset:
    u,i = d['userID'], d['gameID']
    lab = [1 if (d['hours'] > globalMedian) else -1] # Set the label based on a rule
    ratingDict[(u,i)] = lab
    usersPerItem[i].add(u)
    itemsPerUser[i].add(i)

In [205]:
def Cosine(i1, i2):
    u1 =usersPerItem[i1]
    u2 =usersPerItem[i2]
    
    inter = u1.intersection(u2)
    
    dot = sum(ratingDict[(u,i1)] * ratingDict[(u,i2)] for u in inter)
    
    norm1 = np.sqrt(sum(ratingDict[(u,i1)])**2 for u in u1)
    norm2 = np.sqrt(sum(ratingDict[(u,i1)])**2 for u in u2)
    if norm1 == 0 or norm2 == 0: 
        return 0
    else :
        return dot/ (norm1*norm2)

In [206]:
ms = mostSimilar14(dataset[0]['gameID'], Cosine, 10)
print(ms)

TypeError: unhashable type: 'set'

In [194]:
print(ms[-1][0])

0.0


In [192]:
answers['Q14'] = [ms[0][0], ms[-1][0]]

In [193]:
assertFloatList(answers['Q14'], 2)

In [None]:
### Question 15

In [None]:
ratingDict = {}

for d in dataset:
    u,i = d['userID'], d['gameID']
    lab = # Set the label based on a rule
    ratingDict[(u,i)] = lab

In [None]:
ms = mostSimilar14(dataset[0]['gameID'], Cosine, 10)

In [None]:
answers['Q15'] = [ms[0][0], ms[-1][0]]

In [None]:
assertFloatList(answers['Q15'], 2)

In [195]:
f = open("answers_midterm.txt", 'w')
f.write(str(answers) + '\n')
f.close()