In [2]:
import json
import gzip
import math
from collections import defaultdict
import numpy
from sklearn import linear_model
import random
import statistics
from math import inf

In [3]:
# Load every review record from the gzipped training file into `dataset`.
# The context manager closes the file even if parsing one of the lines raises.
# NOTE(review): eval() executes whatever expression each line contains, so this
# is only safe for a trusted file. If the lines are plain Python literals,
# ast.literal_eval would be a safer parser -- TODO confirm before switching.
dataset = []
with gzip.open("train.json.gz") as z:
    for l in z:
        d = eval(l)
        dataset.append(d)

In [4]:
def assertFloat(x):
    """Check that ``x`` can be converted with ``float()``.

    The conversion itself raises if ``x`` is not float-convertible; the
    assertion then checks the type of the converted value.
    """
    as_float = float(x)
    assert type(as_float) == float

def assertFloatList(items, N):
    """Check that ``items`` has exactly ``N`` entries, each convertible with ``float()``."""
    assert len(items) == N
    converted_types = []
    for item in items:
        converted_types.append(type(float(item)))
    assert converted_types == [float] * N

In [5]:
answers = {}

In [6]:
dataset[0]

{'hours': 0.3,
 'gameID': 'g35322304',
 'hours_transformed': 0.37851162325372983,
 'early_access': False,
 'date': '2015-04-08',
 'text': '+1',
 'userID': 'u55351001'}

# MSE, MAE 

In [7]:
def MSE(y, ypred):
    """Mean squared error between labels ``y`` and predictions ``ypred``.

    Pairs are formed with ``zip``, so surplus elements of the longer input are
    ignored. Raises ZeroDivisionError when there are no pairs.
    """
    squared_errors = []
    for actual, predicted in zip(y, ypred):
        squared_errors.append((actual - predicted) ** 2)
    return sum(squared_errors) / len(squared_errors)
    
def MAE(y, ypred):
    """Mean absolute error between labels ``y`` and predictions ``ypred``.

    Pairs are formed with ``zip``, so surplus elements of the longer input are
    ignored. Raises ZeroDivisionError when there are no pairs.
    """
    absolute_errors = []
    for actual, predicted in zip(y, ypred):
        absolute_errors.append(abs(actual - predicted))
    return sum(absolute_errors) / len(absolute_errors)

In [None]:
### Question 1

In [8]:
# Index the reviews by user ID and by game ID; every value is a list of the
# review dicts from `dataset` that share that key.
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in dataset:
    u,i = d['userID'],d['gameID']
    reviewsPerUser[u].append(d)
    reviewsPerItem[i].append(d)

# Order each user's reviews by their 'date' field, in place.
# NOTE(review): the sample record shows 'date' as a 'YYYY-MM-DD' string, so this
# would be a string comparison -- confirm all records use that format.
for u in reviewsPerUser:
    reviewsPerUser[u].sort(key=lambda x: x['date'])

# Same in-place ordering for each game's reviews.
for i in reviewsPerItem:
    reviewsPerItem[i].sort(key=lambda x: x['date'])

In [9]:
def feat1(datum):
    """Question 1 features: a constant offset followed by the raw hours played."""
    hours = datum['hours']
    return [1, hours]

In [10]:
X = [feat1(d) for d in dataset]
y = [len(d['text']) for d in dataset]

In [11]:
X[0]

[1, 0.3]

In [12]:
y[0]

2

In [13]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [14]:
predictions[0]

390.440655847015

In [15]:
theta = mod.coef_
theta_1 = theta[1]
theta_1

0.007857269704335923

In [16]:
mse_q1 = MSE(y, predictions)
mse_q1

570936.2842458936

In [17]:
answers['Q1'] = [theta_1, mse_q1]

In [47]:
assertFloatList(answers['Q1'], 2)

In [None]:
### Question 2

In [18]:
hours_list = [d['hours'] for d in dataset]
median_hours = numpy.median(hours_list)

In [19]:
median_hours

10.1

In [20]:
def feat2(datum):
    """Question 2 features.

    Layout: [offset, hours, hours_transformed, hours**0.5, above-median flag],
    where the flag is 1 when hours exceeds the module-level ``median_hours``.
    """
    hours = datum['hours']
    transformed = datum['hours_transformed']
    root = hours**0.5
    above_median = 1 if hours > median_hours else 0
    return [1, hours, transformed, root, above_median]

In [21]:
X = [feat2(d) for d in dataset]
y = [len(d['text']) for d in dataset]

In [22]:
X[0]

[1, 0.3, 0.37851162325372983, 0.5477225575051661, 0]

In [23]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [24]:
predictions[0]

240.83639117239971

In [25]:
mse_q2 = MSE(y, predictions)
mse_q2

565419.5340402235

In [26]:
answers['Q2'] = mse_q2

In [27]:
assertFloat(answers['Q2'])

In [28]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936], 'Q2': 565419.5340402235}

In [None]:
### Question 3

In [29]:
def feat3(datum):
    """Question 3 features: an offset plus one 0/1 indicator per playtime cutoff.

    Each indicator is 1 when hours is strictly greater than its cutoff
    (1, 5, 10, 100, 1000, in that order).
    """
    hours = datum['hours']
    indicators = [1 if hours > cutoff else 0 for cutoff in (1, 5, 10, 100, 1000)]
    return [1] + indicators

In [30]:
X = [feat3(d) for d in dataset]

In [31]:
X[0]

[1, 0, 0, 0, 0, 0]

In [32]:
# Fit on the Question 3 indicator features. The offset is already a column of X,
# hence fit_intercept=False.
# NOTE(review): `y` is not redefined in the Question 3 cells; this relies on the
# review-length labels built in the Question 2 cell still being in scope.
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [33]:
mse_q3 = MSE(y, predictions)
mse_q3

565405.4395885813

In [34]:
answers['Q3'] = mse_q3

In [35]:
assertFloat(answers['Q3'])

In [36]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813}

In [None]:
### Question 4

In [37]:
def feat4(datum):
    """Question 4 features: a constant offset followed by the review text length."""
    return [1, len(datum['text'])]

In [38]:
X = [feat4(d) for d in dataset]
y = [d['hours']for d in dataset]

In [39]:
print(X[0])
print(y[0])

[1, 2]
0.3


In [40]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [41]:
predictions[0]

66.00278301332149

In [42]:
mse = MSE(y, predictions)
mae = MAE(y, predictions)

In [43]:
print(mse)
print(mae)

75735.70018273004
90.35613031985152


In [44]:
answers['Q4'] = [mse, mae, "I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE."]

In [45]:
assertFloatList(answers['Q4'][:2], 2)

In [46]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.']}

In [None]:
### Question 5

In [47]:
def feat5(datum):
    """Question 5 features: a constant offset followed by the review text length."""
    text_length = len(datum['text'])
    features = [1, text_length]
    return features

In [48]:
X = [feat5(d) for d in dataset]
y_trans = [d['hours_transformed'] for d in dataset]

In [49]:
print(X[0])
print(y_trans[0])

[1, 2]
0.37851162325372983


In [50]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)

In [51]:
predictions_trans[0]

3.63648953426789

In [52]:
mse_trans = MSE(y_trans, predictions_trans)# MSE using the transformed variable
mse_trans

5.255254235328278

In [53]:
# Map each transformed-scale prediction p back to raw hours with 2**p - 1.
# NOTE(review): presumably hours_transformed = log2(hours + 1), which matches the
# sample record shown earlier -- confirm against the dataset description.
predictions_untrans = [2**p - 1 for p in predictions_trans] # Undoing the transformation

In [54]:
predictions_untrans[0]

11.436335473298154

In [55]:
y_untrans = [d['hours']for d in dataset]

In [56]:
y_untrans[0]

0.3

In [57]:
mse_untrans = MSE(y_untrans, predictions_untrans)

In [58]:
answers['Q5'] = [mse_trans, mse_untrans]

In [59]:
assertFloatList(answers['Q5'], 2)

In [60]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.'],
 'Q5': [5.255254235328278, 78668.56502956818]}

In [61]:
### Question 6

In [62]:
def feat6(datum):
    """Question 6 features: an offset plus a 100-slot one-hot of whole hours played.

    Hours are truncated with ``int``; slot ``h`` marks ``h`` whole hours, and
    every value of 99 or more shares the final slot.
    """
    one_hot = [0] * 100
    whole_hours = int(datum['hours'])
    # Clamp to the last slot so 99 and anything larger land in the same bucket.
    one_hot[min(whole_hours, 99)] = 1
    return [1] + one_hot

In [63]:
X = [feat6(d) for d in dataset]
y = [len(d['text']) for d in dataset]

In [64]:
# Positional split with no shuffling: first half for training, the next quarter
# for validation, the final quarter for testing. The label slices use the same
# len(X)-based boundaries so rows and labels stay aligned.
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [65]:
# Search over ridge regularization strengths, keeping every fitted model and
# its validation MSE, and remembering the alpha with the lowest validation MSE.
models = {}
mses = {}
bestC = None
best_mse_valid = inf

for c in [1, 10, 100, 1000, 10000]:
    # Create and fit the Ridge regression model on the training split.
    # NOTE(review): only alpha is passed, so every other Ridge option stays at
    # the library default; the features already carry an offset column.
    model = linear_model.Ridge(alpha=c)
    model.fit(Xtrain, ytrain)

    # Store the model, keyed by its alpha
    models[c] = model

    # Predict on the validation set
    predictions_valid = model.predict(Xvalid)

    # Compute the MSE on the validation set
    mse_valid = MSE(yvalid, predictions_valid)
    mses[c] = mse_valid
    # Strict '<' keeps the earliest alpha when two validation MSEs tie.
    if mse_valid < best_mse_valid:
        bestC = c
        best_mse_valid = mse_valid

In [66]:
print(best_mse_valid)
print(bestC)

581432.8208480775
1000


In [67]:
# Refit a Ridge model with the selected alpha on the training split.
# NOTE(review): models[bestC] from the search loop was fit with the same alpha
# on the same data, so this refit looks redundant -- confirm before reusing it.
best_model = linear_model.Ridge(alpha=bestC)
best_model.fit(Xtrain, ytrain)

In [68]:
predictions_test = best_model.predict(Xtest)

In [69]:
mse_valid = best_mse_valid
mse_valid

581432.8208480775

In [70]:
mse_test = MSE(ytest, predictions_test)
mse_test

560786.7645482323

In [71]:
answers['Q6'] = [bestC, mse_valid, mse_test]

In [72]:
assertFloatList(answers['Q6'], 3)

In [73]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.'],
 'Q5': [5.255254235328278, 78668.56502956818],
 'Q6': [1000, 581432.8208480775, 560786.7645482323]}

In [74]:
### Question 7

In [75]:
times = [d['hours_transformed'] for d in dataset]
median = statistics.median(times)

In [76]:
# Number of reviews whose raw playtime is strictly under one hour.
count_less_than_one_hour = sum(1 for d in dataset if d['hours'] < 1)
nNotPlayed = count_less_than_one_hour

In [77]:
answers['Q7'] = [median, nNotPlayed]

In [78]:
assertFloatList(answers['Q7'], 2)

In [79]:
answers['Q7']

[3.4724877714627436, 19913]

In [80]:
### Question 8

In [81]:
def feat8(datum):
    """Question 8 features: a constant offset followed by the review text length."""
    features = [1]
    features.append(len(datum['text']))
    return features

In [82]:
X = [feat8(d) for d in dataset]
y = [d['hours_transformed'] > median for d in dataset]

In [83]:
print(X[0])
print(y[0])

[1, 2]
False


In [84]:
# Fit a logistic regression on the current X and boolean labels y, then predict
# on the same rows it was trained on (no held-out split here).
# class_weight='balanced' is passed through to sklearn's class re-weighting option.
mod = linear_model.LogisticRegression(class_weight='balanced')
mod.fit(X,y)
predictions = mod.predict(X) # Binary vector of predictions

In [85]:
def rates(predictions, y):
    """Confusion-matrix counts for binary ``predictions`` against labels ``y``.

    Returns the tuple ``(TP, TN, FP, FN)``. Each count makes its own pass over
    ``zip(predictions, y)``.
    """
    def tally(outcome):
        # Sum the truth value of `outcome` over every (prediction, label) pair.
        return sum([outcome(p, l) for (p, l) in zip(predictions, y)])

    TP = tally(lambda p, l: p and l)
    TN = tally(lambda p, l: not p and not l)
    FP = tally(lambda p, l: p and not l)
    FN = tally(lambda p, l: not p and l)
    return TP, TN, FP, FN

In [86]:
TP, TN, FP, FN = rates(predictions, y)

In [87]:
TP, FN, TN, FP

(24656, 62526, 67811, 20007)

In [88]:
TPR = TP/ (TP+FN)
TNR = TN/ (TN+FP)

In [89]:
print(TPR)
print(TNR)

0.2828106719276915
0.7721765469493725


In [90]:
BER = 1 - 1/2 * (TPR + TNR)

In [91]:
answers['Q8'] = [TP, TN, FP, FN, BER]

In [92]:
assertFloatList(answers['Q8'], 5)

In [93]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.'],
 'Q5': [5.255254235328278, 78668.56502956818],
 'Q6': [1000, 581432.8208480775, 560786.7645482323],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.4725063905614679]}

In [None]:
### Question 9

In [94]:
print(X[0])
print(y[0])

[1, 2]
False


In [95]:
scores = mod.decision_function(X)

In [96]:
scoreslabels = list(zip(scores, y))

In [97]:
scoreslabels.sort(reverse=True)

In [98]:
scoreslabels[:15]

[(1.477131385394358, True),
 (1.477131385394358, True),
 (1.477131385394358, True),
 (1.477131385394358, True),
 (1.477131385394358, True),
 (1.477131385394358, True),
 (1.477131385394358, False),
 (1.477131385394358, False),
 (1.477131385394358, False),
 (1.477131385394358, False),
 (1.477131385394358, False),
 (1.4769373981573968, True),
 (1.4769373981573968, True),
 (1.4769373981573968, True),
 (1.4769373981573968, True)]

In [None]:
# Unfinished template placeholders, kept only as comments: as bare statements
# (an assignment target with no right-hand side) they are a SyntaxError and
# stop a fresh Restart & Run All. Precision@k is computed in the following cell.
# precision =
# recall =

In [100]:
# Precision@k for several cutoffs, with each cutoff widened to include every
# entry tied with the k-th score. Relies on `scoreslabels` being (score, label)
# pairs already sorted by descending score in an earlier cell.
precs = []
recs = []  # never filled in this cell; only precision is computed

for k in [5, 10, 100, 1000]:
    # Find the threshold, which is the score at position k-1 because of zero indexing
    # (falls back to the lowest score when k exceeds the list length).
    threshold = scoreslabels[k-1][0] if k <= len(scoreslabels) else scoreslabels[-1][0]

    # Extend k to include all scores equal to the threshold score
    k_extended = k
    while k_extended < len(scoreslabels) and scoreslabels[k_extended][0] == threshold:
        k_extended += 1

    # Count the true positives in the top k_extended scores
    true_positives_at_k = sum(1 for score, label in scoreslabels[:k_extended] if label)

    # Calculate precision at k_extended
    # NOTE(review): if k exceeded len(scoreslabels), this would divide by k
    # rather than by the number of entries actually sliced.
    precision_at_k = true_positives_at_k / k_extended
    precs.append(precision_at_k)

In [101]:
precs

[0.5454545454545454, 0.5454545454545454, 0.67, 0.685]

In [102]:
answers['Q9'] = precs

In [103]:
assertFloatList(answers['Q9'], 4)

In [104]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.'],
 'Q5': [5.255254235328278, 78668.56502956818],
 'Q6': [1000, 581432.8208480775, 560786.7645482323],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.4725063905614679],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685]}

In [105]:
### Question 10

In [106]:
def calculate_ber(TP, TN, FP, FN):
    """Balanced error rate: one minus the mean of the true-positive and true-negative rates.

    Raises ZeroDivisionError when there are no positive labels (TP + FN == 0)
    or no negative labels (TN + FP == 0).
    """
    true_positive_rate = TP / (TP + FN)
    true_negative_rate = TN / (TN + FP)
    mean_rate = 0.5 * (true_positive_rate + true_negative_rate)
    return 1 - mean_rate

In [107]:
def feat10(datum):
    """Question 10 features: a constant offset followed by the review text length."""
    review_text = datum['text']
    return [1, len(review_text)]

In [108]:
X = [feat10(d) for d in dataset]
y_trans = [d['hours_transformed'] for d in dataset]
y_compare_med = [d['hours_transformed'] > median for d in dataset]

In [109]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)

In [110]:
print(min(predictions_trans))
print(max(predictions_trans))

3.6360712121539693
5.3093596678374935


In [111]:
# Turn the regression output into a classifier: try 100 evenly spaced cut-offs
# between the smallest and largest prediction and keep the one with the lowest BER.
best_threshold = None
best_ber = inf

# Search for the best threshold
threshold_values = numpy.linspace(min(predictions_trans), max(predictions_trans), 100)
for threshold in threshold_values:
    # Predict "above median" when the regression output exceeds the cut-off.
    predictions_thresh = [p > threshold for p in predictions_trans]
    TP, TN, FP, FN = rates(predictions_thresh, y_compare_med)
    ber = calculate_ber(TP, TN, FP, FN)
    # Strict '<' keeps the lowest cut-off when several give the same BER.
    if ber < best_ber:
        best_ber = ber
        best_threshold = threshold

In [112]:
your_threshold = best_threshold
BER = best_ber

In [113]:
print(your_threshold)
print(BER)

3.703678826525021
0.47153822574126236


In [114]:
answers['Q10'] = [your_threshold, BER]

In [115]:
assertFloatList(answers['Q10'], 2)

In [116]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.'],
 'Q5': [5.255254235328278, 78668.56502956818],
 'Q6': [1000, 581432.8208480775, 560786.7645482323],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.4725063905614679],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [3.703678826525021, 0.47153822574126236]}

In [117]:
### Question 11

In [118]:
# Positional 90/10 split with no shuffling: the first 90% of records are the
# training set and the remainder the test set. Both slices use the same boundary.
dataTrain = dataset[:int(len(dataset)*0.9)]
dataTest = dataset[int(len(dataset)*0.9):]

In [119]:
# Per-user and per-game median of 'hours_transformed', from training data only.
# The two dicts first collect lists of values and are then overwritten in place
# with the median of each list, so their values change type from list to number.
userMedian = defaultdict(list)
itemMedian = defaultdict(list)

# Compute medians on training data
for d in dataTrain:
    user, item = d['userID'], d['gameID']
    userMedian[user].append(d['hours_transformed'])
    itemMedian[item].append(d['hours_transformed'])

# Calculate the median play times
# NOTE(review): both remain defaultdict(list), so indexing a missing key with
# [] later would insert an empty list rather than a median; `in` / .get() do not.
for user in userMedian:
    userMedian[user] = statistics.median(userMedian[user])
for item in itemMedian:
    itemMedian[item] = statistics.median(itemMedian[item])


In [120]:
answers['Q11'] = [itemMedian['g35322304'], userMedian['u55351001']]

In [121]:
assertFloatList(answers['Q11'], 2)

In [122]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.'],
 'Q5': [5.255254235328278, 78668.56502956818],
 'Q6': [1000, 581432.8208480775, 560786.7645482323],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.4725063905614679],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [3.703678826525021, 0.47153822574126236],
 'Q11': [0.58496250072

In [123]:
### Question 12

In [124]:
times = [d['hours_transformed'] for d in dataset]
globalMedian = statistics.median(times)

In [125]:
   def f12(u, i):
    # Function returns a single value (0 or 1)
    # Check if item's median playtime is above global median
    if i in itemMedian and itemMedian[i] > globalMedian:
        return 1
    # If the item hasn't been seen before, check if the user's median playtime is above the global median
    elif i not in itemMedian and userMedian.get(u, 0) > globalMedian:
        return 1
    else:
        return 0

In [126]:
preds = [f12(d['userID'], d['gameID']) for d in dataTest]

In [127]:
# Generate true labels based on whether the actual time played is above the global median or not
y = [1 if d['hours_transformed'] > globalMedian else 0 for d in dataTest]

In [128]:
accuracy = sum([pred == true for pred, true in zip(preds, y)]) / len(y)

In [129]:
answers['Q12'] = accuracy

In [130]:
assertFloat(answers['Q12'])

In [131]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.'],
 'Q5': [5.255254235328278, 78668.56502956818],
 'Q6': [1000, 581432.8208480775, 560786.7645482323],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.4725063905614679],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [3.703678826525021, 0.47153822574126236],
 'Q11': [0.58496250072

In [132]:
### Question 13

In [133]:
# Interaction sets built from the full dataset (not just the training split).
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}  # not populated in this cell

for d in dataset:
    user,item = d['userID'], d['gameID']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

In [134]:
def Jaccard(s1, s2):
    """Jaccard similarity of two sets: intersection size over union size.

    Returns 0 when the union is empty (both sets empty).
    """
    shared = len(s1.intersection(s2))
    total = len(s1.union(s2))
    return shared / total if total != 0 else 0

In [135]:
def mostSimilar(i, func, N):
    """Return the ``N`` items most similar to item ``i`` as ``(similarity, item)`` pairs.

    ``func`` receives two user sets: the users of ``i`` and the users of the
    candidate item, both taken from the module-level ``usersPerItem``.
    Results are sorted descending by the pair, so ties on similarity are
    ordered by item ID.
    """
    users = usersPerItem[i]
    similarities = [
        (func(users, usersPerItem[i2]), i2)
        for i2 in usersPerItem
        if i2 != i
    ]
    similarities.sort(reverse=True)
    return similarities[:N]

In [136]:
ms = mostSimilar(dataset[0]['gameID'], Jaccard, 10)

In [137]:
answers['Q13'] = [ms[0][0], ms[-1][0]]

In [138]:
assertFloatList(answers['Q13'], 2)

In [139]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.'],
 'Q5': [5.255254235328278, 78668.56502956818],
 'Q6': [1000, 581432.8208480775, 560786.7645482323],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.4725063905614679],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [3.703678826525021, 0.47153822574126236],
 'Q11': [0.58496250072

In [140]:
### Question 14

In [141]:
times = [d['hours_transformed'] for d in dataset]
globalMedian = statistics.median(times)

In [142]:
def mostSimilar14(i, func, N):
    """Return the ``N`` items most similar to item ``i`` as ``(similarity, item)`` pairs.

    Unlike ``mostSimilar``, ``func`` receives the two item IDs rather than
    their user sets. Results are sorted descending by the pair.
    """
    # Look the query item up before iterating. usersPerItem is a defaultdict,
    # so this creates the key now instead of during the loop below, should
    # func index usersPerItem with an unseen item.
    usersPerItem[i]
    similarities = [(func(i, i2), i2) for i2 in usersPerItem if i2 != i]
    similarities.sort(reverse=True)
    return similarities[:N]

In [143]:
# Binary "rating" per (user, game) pair for the Question 14 similarity:
# +1 when the transformed playtime is above the global median, otherwise -1.
# A later record with the same (user, game) key overwrites the earlier one.
ratingDict = {}

for d in dataset:
    u,i = d['userID'], d['gameID']
    lab = 1 if d['hours_transformed'] > globalMedian else -1 # Set the label based on a rule
    ratingDict[(u,i)] = lab

In [144]:
def Cosine(i1, i2):
    """Cosine similarity between two items, using the labels in ``ratingDict``.

    The numerator sums label products over users who interacted with both
    items; each norm sums squared labels over all users of that item.
    Returns 0 when either norm is zero.
    """
    users1 = usersPerItem[i1]
    users2 = usersPerItem[i2]
    shared_users = users1.intersection(users2)
    dot = sum(ratingDict[(u, i1)] * ratingDict[(u, i2)] for u in shared_users)
    squares1 = sum(ratingDict[(u, i1)]**2 for u in users1)
    squares2 = sum(ratingDict[(u, i2)]**2 for u in users2)
    norm_product = math.sqrt(squares1) * math.sqrt(squares2)
    if norm_product == 0:
        return 0
    return dot / norm_product

In [145]:
ms = mostSimilar14(dataset[0]['gameID'], Cosine, 10)

In [146]:
answers['Q14'] = [ms[0][0], ms[-1][0]]

In [147]:
assertFloatList(answers['Q14'], 2)

In [148]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.'],
 'Q5': [5.255254235328278, 78668.56502956818],
 'Q6': [1000, 581432.8208480775, 560786.7645482323],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.4725063905614679],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [3.703678826525021, 0.47153822574126236],
 'Q11': [0.58496250072

In [149]:
### Question 15

In [150]:
def mostSimilar15(i, func, N):
    """Return the ``N`` items most similar to item ``i`` as ``(similarity, item)`` pairs.

    ``func`` is called with the two item IDs. Results are sorted descending
    by the pair, so equal similarities are ordered by item ID.
    """
    # Look the query item up first: usersPerItem is a defaultdict, so the key
    # exists before the loop starts rather than being created during iteration.
    usersPerItem[i]
    scored = []
    for candidate in usersPerItem:
        if candidate != i:
            scored.append((func(i, candidate), candidate))
    scored.sort(reverse=True)
    return scored[:N]

In [151]:
# Rebuild ratingDict with the transformed playtime itself as the label.
# Cosine reads the module-level ratingDict when it is called, so rebinding the
# name here changes what the Question 15 similarity is computed from.
ratingDict = {}
for d in dataset:
    u,i = d['userID'], d['gameID']
    lab = d['hours_transformed']# Set the label based on a rule
    ratingDict[(u,i)] = lab

In [152]:
ms = mostSimilar15(dataset[0]['gameID'], Cosine, 10)

In [153]:
answers['Q15'] = [ms[0][0], ms[-1][0]]

In [154]:
assertFloatList(answers['Q15'], 2)

In [155]:
answers

{'Q1': [0.007857269704335923, 570936.2842458936],
 'Q2': 565419.5340402235,
 'Q3': 565405.4395885813,
 'Q4': [75735.70018273004,
  90.35613031985152,
  'I think MAE is the suitable indicator here, by taking the absolute value of errors, treats all deviations from the true values equally. It is thus more robust to outliers compared to MSE, which squares the errors, giving disproportionately high weight to large deviations (outliers). Given that we have widely varying lengths in dataset, the chances are that we might also have some outliers. In such a scenario, we would not want these outliers to distort your error metric excessively, which could happen with MSE.'],
 'Q5': [5.255254235328278, 78668.56502956818],
 'Q6': [1000, 581432.8208480775, 560786.7645482323],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.4725063905614679],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [3.703678826525021, 0.47153822574126236],
 'Q11': [0.58496250072