In [1]:
import gzip
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn import linear_model

def readJSON(path):
    for l in gzip.open(path, 'r+'):
        d = eval(l)
        u = d['userID']
        try:
            g = d['gameID']
        except Exception as e:
            g = None
        yield u,g,d


In [2]:
data = [[u, g] for (u,g,_) in readJSON("train.json.gz")]
data2 = [[g, d] for (_,g,d) in readJSON("train.json.gz")]
#random.shuffle(data)
data_train = data[:165000]
data_valid = data[165000:]

In [3]:
len(data_valid)

10000

In [4]:
### Would-play baseline: just rank which games are popular and which are not, and return '1' if a game is among the top-ranked
gameCount = defaultdict(int)
totalPlayed = 0

for user,game in data: # data_train
    gameCount[game] += 1
    totalPlayed += 1

# baseline predictor ---
mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()


In [5]:
game_time_played = defaultdict(float)
totalTimePlayed = 0
for game, d in data2:
    game_time_played[game] += d['hours_transformed']
    totalTimePlayed += d['hours_transformed']

mostPlayed = [(game_time_played[x], x) for x in game_time_played]
mostPlayed.sort()
mostPlayed.reverse()

In [6]:
print("Performance(accuracy) of the baseline model")

# --- Build the validation set
games_played = defaultdict(set)
users_played = defaultdict(set)
users = set()
games = set()
for user,game in data_train:  #data_train
    users.add(user)
    games.add(game)
    games_played[user].add(game)
    users_played[game].add(user)

valid_users_game_cnt = defaultdict(int)
for user, game in data_valid:
    valid_users_game_cnt[user] += 1

# Add entries users haven't played
for user in valid_users_game_cnt.keys():
    games_to_add = random.sample(games - games_played[user], valid_users_game_cnt[user])
    for game in games_to_add:
        data_valid.append([user, game])

y_valid = [1]*(len(data_valid)//2) + [0]*(len(data_valid)//2)
# ---
        
def find_accuracy(pred, y):
    correct = np.array(pred) == np.array(y)
    return sum(correct) / len(correct)


Performance(accuracy) of the baseline model


In [7]:
print("Finding a better threshold")

def find_predictor_set(percentile = 0.69):
    predictor_set = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        predictor_set.add(i)
        if count > totalPlayed*percentile: break
    return predictor_set


def find_most_played_set(percentile = 0.69):
    most_played_set = set()
    count = 0
    for ic, i in mostPlayed:
        count += ic
        most_played_set.add(i)
        if count > totalTimePlayed*percentile: break
    return most_played_set

Finding a better threshold


In [8]:
print("Compute the Jaccard similarities")

def my_similarity(s1, s2, alpha=1, beta=1):
    #Tversky index
    numer = len(s1.intersection(s2))
    denom = len(s1.intersection(s2)) + alpha*len(s1-s2) + beta*len(s2-s1) #Jaccard
    return numer / denom


# Add entries users haven't played
train_users_game_cnt = defaultdict(int)
for user, game in data_train:
    train_users_game_cnt[user] += 1
    
for user in train_users_game_cnt.keys():
    games_to_add = random.sample(games - games_played[user], train_users_game_cnt[user])
    for game in games_to_add:
        data_train.append([user, game])


y_train = [1]*(len(data_train)//2) + [0]*(len(data_train)//2)

# Shuffle two lists with same order 
# Using zip() + * operator + shuffle() 
temp = list(zip(data_train, y_train)) 
random.shuffle(temp) 
res1, res2 = zip(*temp)
data_train = list(res1)
y_train = list(res2)

Compute the Jaccard similarities


In [9]:
print("Incorporating both a Jaccard-based threshold and a popularity based threshold:")

def feature(datum, predictor_sets, most_played_sets):
    feat = [1]
    user, game = datum
    
    for ps in predictor_sets:
        popularity = 1 if game in ps else 0
        feat.append(popularity)
        
    for pls in most_played_sets:
        popularity = 1 if game in pls else 0
        feat.append(popularity)
    
    alphas = [0.5, 1]
    betas = [0.5, 1]
    for sim_a, sim_b in zip(alphas, betas):
        similarity_indeces = [0]
        for game_prime in games_played[user]:
            t = my_similarity(users_played[game], users_played[game_prime], sim_a, sim_b)
            if t < 1.0:
                similarity_indeces.append(t)
        similarity = max(similarity_indeces)
        feat.append(similarity)
    
    ###
    return [1, 1 if game in predictor_sets[0] else 0]
    
    return feat

predictor_set = [find_predictor_set(), find_predictor_set(.35)]
most_played_set = [find_most_played_set(), find_most_played_set(.35)]

Cs = [1000]
accs = []
print("Starting...")
for c in Cs:
    print("Finding train features...")
    X_train = [feature(d, predictor_set, most_played_set) for d in data_train]
    print("Train regressor...")
    mod = linear_model.LogisticRegressionCV(cv=5)  #C=1000
    print("Fitting regressor with train data...")
    mod.fit(X_train, y_train)

    print("Finding validation features...")
    X_valid = [feature(d, predictor_set, most_played_set) for d in data_valid]
    print("predicting...")
    pred = mod.predict(X_valid)
    accs.append(find_accuracy(pred, y_valid))
    
    print(accs)

max_index = accs.index(max(accs))
print("Best Accuracy: {}, with C={}".format(accs[max_index], Cs[max_index]))
print(accs)

Incorporating both a Jaccard-based threshold and a popularity based threshold:
Starting...
Finding train features...
Train regressor...
Fitting regressor with train data...
Finding validation features...
predicting...
[0.70365]
Best Accuracy: 0.70365, with C=1000
[0.70365]


In [10]:
X_valid

[[1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 1],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 0],
 [1, 1],
 [1, 0],
 [1, 0],
 [1, 1],
 [1, 0],
 

In [11]:
predictions = open("predictions_Played.txt", 'w')
for l in open("pairs_Played.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,g = l.strip().split('-')
    
    if mod.predict([feature([u, g], predictor_set, most_played_set)]) == [1]:
        predictions.write(u + '-' + g + ",1\n")
    else:
        predictions.write(u + '-' + g + ",0\n")

predictions.close()

In [12]:
data_valid[0]

['u49969792', 'b25961467']