In [1]:
import gzip
from collections import defaultdict
import random
from sklearn.svm import SVC  
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, precision_score
from multiprocessing import Process, Queue

from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader
from surprise import KNNWithMeans
import pandas as pd

def readGz(f):
    for l in gzip.open(f):
        yield eval(l)

  from numpy.core.umath_tests import inner1d


In [2]:
### Would-purchase baseline: just rank which businesses are popular and which are not, and return '1' if a business is among the top-ranked
#Question 1,2,3,4
businessCount = defaultdict(int)
totalPurchases = 0
purchases = defaultdict(set)
purchasesTest = defaultdict(set)
items = set()
count = 0
allData = defaultdict(int)
allPurchases = 0
categoryTrain = defaultdict(set)
itemCategories = defaultdict(str)
for l in readGz("train.json.gz"):
    user,business = l['reviewerID'],l['itemID']
    allData[business] += 1
    allPurchases += 1
    if count <= 100000:
        businessCount[business] += 1
        totalPurchases += 1
        count += 1
    else:
        purchasesTest[l['reviewerID']].add(l['itemID'])
    purchases[l['reviewerID']].add(l['itemID'])
    categoryTrain[l['reviewerID']].add(l['categoryID'])
    itemCategories[l['itemID']] = l['categoryID']
    items.add(l['itemID'])

#Non-purchase pairs
negatives = defaultdict(set)
itemsList = list(items)
users = list(purchases.keys())
count = 0
while count < 100000:
    cus = random.choice(users)
    item = random.choice(itemsList)
    if item not in purchases[cus] and item not in negatives[cus]:
        negatives[cus].add(item)
        count += 1
mostPopular = [(businessCount[x], x) for x in businessCount]
mostPopular.sort()
mostPopular.reverse()

mostPopAll = [(allData[x], x) for x in allData]
mostPopAll.sort()
mostPopAll.reverse()

validation = []
return1 = set()
for u in negatives.keys():
    for i in negatives[u]:
        validation.append((u,i,0))
for u in purchasesTest.keys():
    for i in purchasesTest[u]:
        validation.append((u,i,1))
factor = 0.5
count = 0
for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > (totalPurchases*factor): break
correct = 0
for p in validation:
    if (p[1] in return1):
        if (p[2] == 1):
            correct += 1
    elif (p[2] == 0):
        correct += 1
print('Original model validation accuracy is: {}'.format(correct/len(validation)))
popular = 1
            
factor = 0.48
maxAcc = 0
maxFactor = 0
for k in range(1000):
    factor += 0.0001
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > (totalPurchases*factor): break
    correct = 0
    for p in validation:
        if (p[1] in return1):
            if (p[2] == 1):
                correct += 1
        elif (p[2] == 0):
            correct += 1
    if ((correct/len(validation)) > maxAcc):
        maxAcc = (correct/len(validation))
        maxFactor = factor
print('Validation accuracy is: {} at factor {}'.format(maxAcc, maxFactor))


popular = 1 #Choose which model to use. (Category model doesn't output accuracy as not required by the question)
#Predicting with the popularity model
if (popular == 1):
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > (totalPurchases*maxFactor): break

    predictions = open("predictions_Purchase.txt", 'w')
    for l in open("pairs_Purchase.txt"):
        if l.startswith("reviewerID"):
        #header
            predictions.write(l)
            continue
        u,i = l.strip().split('-')
        if i in return1:
            predictions.write(u + '-' + i + ",1\n")
        else:
            predictions.write(u + '-' + i + ",0\n")
    predictions.close()
else:
    #Predicting with the category model
    predictions = open("predictions_Purchase.txt", 'w')
    for l in open("pairs_Purchase.txt"):
        if l.startswith("reviewerID"):
        #header
            predictions.write(l)
            continue
        u,i = l.strip().split('-')
        if itemCategories[i] in categoryTrain[u]:
            predictions.write(u + '-' + i + ",1\n")
        else:
            predictions.write(u + '-' + i + ",0\n")
    predictions.close()


Original model validation accuracy is: 0.628478142390712
Validation accuracy is: 0.6291681458407292 at factor 0.5151999999999961


In [None]:
#Alt solution for purchase prediction
userID = []
itemID = []
categoryID = []
ratingAll = []
purchased = []
for l in readGz("train.json.gz"):
    user,business,category,rating = l['reviewerID'],l['itemID'], l['categoryID'], l['rating']
    userID.append(user)
    itemID.append(business)
    categoryID.append(category)
    ratingAll.append(rating)
    purchased.append(1.0)
for user in negatives:
    for item in negatives[user]:
        userID.append(user)
        itemID.append(item)
        purchased.append(0.0)
allDataDict = {'reviewerID':userID,
              'itemID':itemID,
              'purchase':purchased}
df = pd.DataFrame(allDataDict)
reader = Reader(rating_scale=(0,1))
data = Dataset.load_from_df(df[['reviewerID','itemID','purchase']],reader)
trainset, testset = train_test_split(data, test_size=.15)
algo = KNNWithMeans(k=50, sim_options={'name':'pearson_baseline','user_based':True})
algo.fit(trainset)
test_pred = algo.test(testset)
accuracy.rmse(test_pred,verbose=True)

Estimating biases using als...


user: U490934656 item: I402344648 r_ui = 4.00   est = 4.00   {'actual_k': 1, 'was_impossible': False}


In [4]:
catDict = {
  "Women": 0,
  "Men": 1,
  "Girls": 2,
  "Boys": 3,
  "Baby": 4
}

In [7]:
### Category prediction baseline: Just consider some of the most common words from each category
#Question 5
def createList():
    return [0,0,0,0,0]

categoryCountsTrain = defaultdict(int)
userCategoryCountsTrain = defaultdict(createList)
categoryCountsVal = defaultdict(int)
userCategoryCountsVal = defaultdict(createList)
count = 0
allRatings = []
userRatings = defaultdict(list)
for l in readGz("train.json.gz"):
    if count < 100000:
        categoryCountsTrain[l['categoryID']] += 1
        userCategoryCountsTrain[l['reviewerID']][l['categoryID']] += 1
        count += 1
    else:
        categoryCountsVal[l['categoryID']] += 1
        userCategoryCountsVal[l['reviewerID']][l['categoryID']] += 1
    user,business = l['reviewerID'],l['itemID']
    allRatings.append(l['rating'])
    userRatings[user].append(l['rating'])

favCategoriesTrain = defaultdict(int)
favCategoriesVal = defaultdict(int)

for cus in userCategoryCountsTrain:
    userCategories = userCategoryCountsTrain[cus]
    favCatVal = max(userCategories)
    favCat = userCategories.index(favCatVal)
    if (userCategories.count(favCatVal) > 1):
        cats = []
        for vals in userCategories:
            if (vals == favCatVal):
                cats.append(userCategories.index(vals))
        for i in cats:
            maxCat = 0
            if (categoryCountsTrain[i] > categoryCountsTrain[maxCat]):
                maxCat = i
        favCat = maxCat
    favCategoriesTrain[cus] = favCat
    
for cus in userCategoryCountsVal:
    userCategories = userCategoryCountsVal[cus]
    favCatVal = max(userCategories)
    favCat = userCategories.index(favCatVal)
    if (userCategories.count(favCatVal) > 1):
        cats = []
        for vals in userCategories:
            if (vals == favCatVal):
                cats.append(userCategories.index(vals))
        for i in cats:
            maxCat = 0
            if (categoryCountsVal[i] > categoryCountsVal[maxCat]):
                maxCat = i
        favCat = maxCat
    favCategoriesVal[cus] = favCat
correct = 0
total = 0
for user in favCategoriesVal:
    total += 1
    if user in favCategoriesTrain.keys():
        predicted = favCategoriesTrain[user]
    else:
        predicted = 0
    real = favCategoriesVal[user]
    if (predicted == real):
        correct += 1
print('Accuracy of category prediction on validation set: {}'.format(correct/total))
    
predictions = open("predictions_Category.txt", 'w')
predictions.write("reviewerID-reviewHash,category\n")
for l in readGz("test_Category.json.gz"):
    cat = catDict['Women'] # If there's no evidence, just choose the most common category in the dataset
    words = l['reviewText'].lower()
    if 'wife' in words:
        cat = catDict['Women']
    if 'husband' in words:
        cat = catDict['Men']
    if 'daughter' in words:
        cat = catDict['Girls']
    if 'son' in words:
        cat = catDict['Boys']
    if 'baby' in words:
        cat = catDict['Baby']
    predictions.write(l['reviewerID'] + '-' + l['reviewHash'] + "," + str(cat) + "\n")

predictions.close()

Accuracy of category prediction on validation set: 0.8457227624855778


In [None]:
#Question 6
import operator
import string
translator = str.maketrans('', '', string.punctuation)
def createDict():
    return defaultdict(int)
wordCount = defaultdict(int)
categoryWordCount = defaultdict(createDict)
count = 0
testSet = []
trainSet = []
for l in readGz("train.json.gz"):
    if count < 100000:
        sentence = l['reviewText'].translate(translator).lower()
        words = sentence.split()
        if 'categoryID' in l.keys():
            for word in words:
                wordCount[word] += 1
                categoryWordCount[l['categoryID']][word] += 1
            count += 1
            trainSet.append((l['reviewText'], l['categoryID']))
    else:
        if 'categoryID' in l.keys():
            testSet.append((l['reviewText'], l['categoryID']))
topWords = sorted(wordCount.items(), key=operator.itemgetter(1))
topWords.reverse()
topWords = topWords[:500]


total = sum(pair[1] for pair in topWords)
totalCounts = defaultdict(int)
frequencies = dict()
for pair in topWords:
    frequencies[pair[0]] = pair[1]/total
    totalCounts[pair[0]] = pair[1]
categoryFrequencies = defaultdict(list)
categoryTotals = defaultdict(int)
for categories in categoryWordCount.keys():
#     categoryWords = categoryWordCount[categories]
#     topWords = sorted(categoryWords.items(), key=operator.itemgetter(1))
#     topWords.reverse()
#     topWords = topWords[:(500 if len(topWords) > 500 else len(topWords))]
#     catTotal = sum(pair[1] for pair in topWords)
#     categoryTotals[categories] = catTotal
    topFreqInCat = defaultdict(float)
    categoryTotal = 0
    for word in topWords:
        categoryTotal += categoryWordCount[categories][word[0]]
    for word in topWords:
        topFreqInCat[word[0]] = categoryWordCount[categories][word[0]]/categoryTotal
    wordFrequencies = [(x,topFreqInCat[x] - frequencies[x]) for x in topFreqInCat.keys()]
    wordFrequencies.sort(key=operator.itemgetter(1))
    wordFrequencies.reverse()
    categoryFrequencies[categories] = wordFrequencies
for cat in categoryFrequencies.keys():
    printThis = [(pair[0]) for pair in categoryFrequencies[cat]]
    print("Words that are more frequent in {} category: {}".format(list(catDict)[cat],printThis))

In [7]:
#Question 7,8
from random import shuffle
checkForWords = [pair[0] for pair in topWords]
X_train = []
y_train = []
X_test = []
y_test = []
shuffle(trainSet)
maxCount = 10000
for i in range(maxCount):
    datum = trainSet[i]
    review = datum[0].translate(translator).lower()
    words = review.split()
    feature = []
    for word in checkForWords:
        if word in words:
            feature.append(1)
        else:
            feature.append(0)
    X_train.append(feature)
    y_train.append(datum[1])
for datum in testSet:
    review = datum[0].translate(translator).lower()
    words = review.split()
    feature = []
    for word in checkForWords:
        if word in words:
            feature.append(1)
        else:
            feature.append(0)
    X_test.append(feature)
    y_test.append(datum[1])

In [8]:
#Question 7,8

def trainSvm(category,reg,X_train,y_train,X_test,y_test,Q=Queue()):
    y_train_cat = [(1 if i==category else 0) for i in y_train]
    y_test_cat = [(1 if i==category else 0) for i in y_test]
    clf = SVC(kernel='linear', C=reg)
    clf.fit(X_train, y_train_cat)
    y_pred = clf.predict(X_test)
    precision = precision_score(y_test_cat,y_pred, average='micro')
    Q.put((category,reg,precision))
    return clf

  from numpy.core.umath_tests import inner1d


In [9]:
#Question 7,8
C_performance = Queue()
cats = [0,1,2,3,4]
C = [0.01,0.1,1,10,100]
maxPrecision = 0
maxC = 0
processes = []

for cat in cats:
    for c in C:
        p = Process(target=trainSvm,args=(cat,c,X_train,y_train,X_test,y_test,C_performance))
        p.start()
        processes.append(p)
for p in processes:
    p.join()

results = []
while (not C_performance.empty()):
    results.append(C_performance.get())
    
result_Cat = defaultdict(list)
for result in results:
    result_Cat[result[0]].append((result[1],result[2]))

regularizers = dict()
for cat in result_Cat.keys():
    pairs = result_Cat[cat]
    pairs.sort(key=operator.itemgetter(0))
    pairs.reverse()
    best = max(pairs,key=operator.itemgetter(1))
    regularizers[cat] = best[0]
    print('Best regularization in category {} is {} with accuracy: {}'.format(list(catDict)[cat],best[0],best[1]))

Best regularization in category Boys is 0.1 with accuracy: 0.99045
Best regularization in category Baby is 0.1 with accuracy: 0.98543
Best regularization in category Girls is 0.1 with accuracy: 0.98849
Best regularization in category Men is 0.1 with accuracy: 0.79523
Best regularization in category Women is 0.1 with accuracy: 0.78057


In [11]:
#Question 7,8
#Best values obtained from the tests above
C_cat0 = 0.1 #Women
C_cat1 = 0.1 #Men
C_cat2 = 0.1 #Girls
C_cat3 = 0.1 #Boys
C_cat4 = 0.1 #Baby

clf_cat0 = trainSvm(0,C_cat0,X_train,y_train,X_test,y_test)
clf_cat1 = trainSvm(1,C_cat1,X_train,y_train,X_test,y_test)
clf_cat2 = trainSvm(2,C_cat2,X_train,y_train,X_test,y_test)
clf_cat3 = trainSvm(3,C_cat3,X_train,y_train,X_test,y_test)
clf_cat4 = trainSvm(4,C_cat4,X_train,y_train,X_test,y_test)

In [14]:
#Question 7,8
print('Precision of classifier in Women category with regularizer {}: {}'.format(C_cat0, precision_score(y_test,clf_cat0.predict(X_test),average='micro')))
      
def predictCombined(X, size):
    f0 = clf_cat0.decision_function(X[:size])
    f1 = clf_cat1.decision_function(X[:size])
    f2 = clf_cat2.decision_function(X[:size])
    f3 = clf_cat3.decision_function(X[:size])
    f4 = clf_cat4.decision_function(X[:size])

    confidence = []
    for i in range(len(X[:size])):
        confidence.append((f0[i],f1[i],f2[i],f3[i],f4[i]))
    y_pred = []
    for i in range(len(confidence)):
        y_pred.append(confidence[i].index(max(confidence[i])))
    return y_pred
testSize = 1000
print('Precision of the combined classifier: {}'.format(precision_score(y_test[:testSize],predictCombined(X_test,testSize), average='micro')))

finalFeatures = []
finalLabels = []
predictions = open("predictions_Category.txt", 'w')
predictions.write("reviewerID-reviewHash,category\n")
for l in readGz("test_Category.json.gz"):
    words = l['reviewText'].translate(translator)
    words.lower()
    feature = []
    for word in checkForWords:
        if word in words:
            feature.append(1)
        else:
            feature.append(0)
    finalFeatures.append(feature)
    finalLabels.append((l['reviewerID'],l['reviewHash']))
    
pred = predictCombined(finalFeatures, len(finalFeatures))
i = 0
for (revID,revHash) in finalLabels:
    predictions.write(revID + '-' + revHash + "," + str(pred[i]) + "\n")
    i += 1

predictions.close()

Precision of classifier in Women category with regularizer 0.1: 0.19616
Precision of the combined classifier: 0.771
