In [1]:
import sys,os
import numpy
import heapq
import time
import json
from sklearn.feature_extraction import DictVectorizer

In [2]:
class PIXNET_Recommendation:
    """Item-based collaborative-filtering recommender over reading logs.

    Pipeline (see ``main``):

    1. ``load_data`` reads JSON-lines training files and builds a sparse
       author x reader matrix (readers are keyed by ``cookie_pta``).
    2. ``calculate_item_similarity`` writes, for every author, its ``k`` most
       cosine-similar authors to ``similarity_file_path``.
    3. ``load_recommend_target`` reads the ``cookie_pta`` values to serve.
    4. ``recommend_from_item_similarity`` scores unseen authors per reader
       and writes one JSON line per reader to ``target_file_path``.
    """

    def __init__(self,training_data_folder="data/training",test_data_file_path="data/testing.json",target_file_path="tmp/recommendation",k=20,similarity_file_path="tmp/similarity_top10"):
        """Store configuration only; no I/O happens here.

        Args:
            training_data_folder: directory whose files hold one JSON object
                per line with ``cookie_pta`` and ``author_id`` keys.
            test_data_file_path: JSON-lines file with a ``cookie_pta`` key
                on every line.
            target_file_path: output file for the recommendations.
            k: number of neighbours kept per author in the similarity file.
            similarity_file_path: intermediate file written by
                ``calculate_item_similarity`` and read back by
                ``recommend_from_item_similarity``.
        """
        self.training_data_folder = training_data_folder
        self.test_data_file_path = test_data_file_path
        self.target_file_path = target_file_path
        self.k = k
        self.similarity_file_path = similarity_file_path

    def _ensure_parent_dir(self, path):
        """Create the directory that will contain ``path`` if it is missing."""
        parent = os.path.dirname(path)
        if parent and not os.path.isdir(parent):
            os.makedirs(parent)

    def load_data(self):
        """Read the training logs and build the author x reader matrix.

        Sets:
            self.data_set: ``{cookie_pta: {author_id: 1}}``.
            self.X: sparse matrix with one row per author and one column
                per ``cookie_pta``.
            self.author: author ids, in the row order of ``self.X``.
        """
        data_set = {}
        for filename in os.listdir(self.training_data_folder):
            path = os.path.join(self.training_data_folder, filename)
            # Skip sub-directories (e.g. checkpoint folders) instead of crashing.
            if not os.path.isfile(path):
                continue
            with open(path) as training_file:
                for line in training_file:
                    line = line.strip()
                    if not line:
                        continue
                    record = json.loads(line)
                    data_set.setdefault(record['cookie_pta'], {})[record['author_id']] = 1
        vectorizer = DictVectorizer(sparse=True)
        # Transpose so rows are authors; CSR keeps the row slicing done in
        # calculate_item_similarity cheap.
        X = vectorizer.fit_transform(list(data_set.values())).T.tocsr()
        # get_feature_names() no longer exists in recent scikit-learn releases;
        # prefer the replacement when it is available.
        if hasattr(vectorizer, "get_feature_names_out"):
            author = vectorizer.get_feature_names_out().tolist()
        else:
            author = vectorizer.get_feature_names()
        self.data_set = data_set
        self.X = X
        self.author = author

    def calculate_item_similarity(self, size_per_loop=1000):
        """Write each author's top-``k`` cosine neighbours to the similarity file.

        The author x author similarity matrix is computed in row chunks of
        ``size_per_loop`` so only one chunk is dense in memory at a time.
        Line ``i`` of the output corresponds to ``self.author[i]`` and has the
        form ``author,score;author,score;...`` with scores rounded to two
        decimals.

        Args:
            size_per_loop: number of author rows per chunk; must be positive.
        """
        X = self.X
        author = self.author
        n_authors = X.shape[0]
        # Row sums of the sparse matrix; with 0/1 entries this is the squared
        # L2 norm of each author row. Computed without densifying X.
        norms = numpy.sqrt(numpy.asarray(X.sum(axis=1), dtype=float).ravel())
        indices = range(n_authors)
        self._ensure_parent_dir(self.similarity_file_path)
        with open(self.similarity_file_path, "w") as similarity_file:
            for start in range(0, n_authors, size_per_loop):
                stop = min(start + size_per_loop, n_authors)
                similarity = X[start:stop].dot(X.T).toarray()
                # Cosine similarity: divide by the row norm, then by the
                # column norm, then round (same operation order as before).
                similarity = numpy.round(similarity / norms[start:stop, None] / norms[None, :], 2)
                # Scores above 0.99 (the author itself, and authors with an
                # identical reader set) are zeroed so they are not favoured.
                similarity[similarity > 0.99] = 0
                for row in similarity:
                    rank = row.tolist()
                    top_k_similarity = heapq.nlargest(self.k, indices, key=rank.__getitem__)
                    data = ["%s,%.2f" % (author[j], rank[j]) for j in top_k_similarity]
                    similarity_file.write(";".join(data) + "\n")

    def load_recommend_target(self):
        """Read the ``cookie_pta`` values to produce recommendations for.

        Sets:
            self.questions: ``{cookie_pta: 1}`` for every non-blank line of
                the test file.
        """
        questions = {}
        with open(self.test_data_file_path) as target_file:
            for line in target_file:
                line = line.strip()
                if not line:
                    continue
                questions[json.loads(line)['cookie_pta']] = 1
        self.questions = questions

    def recommend_from_item_similarity(self):
        """Score unseen authors for every target reader and write the result.

        For each reader, every author in their log contributes the scores of
        its stored neighbours; authors already in the log are skipped. One
        JSON object ``{"cookie_pta": ..., "author_id": [...]}`` is written per
        reader. Readers that do not appear in the training data get an empty
        ``author_id`` list.
        """
        data_set = self.data_set
        questions = self.questions
        author = self.author
        author_recommend = {}
        with open(self.similarity_file_path) as similarity_file:
            for index, line in enumerate(similarity_file):
                neighbours = {}
                for entry in line.strip().split(";"):
                    if not entry:
                        # Empty line: the file was written with k == 0.
                        continue
                    # Split on the last comma so an author id containing a
                    # comma is still parsed correctly.
                    name, score = entry.rsplit(",", 1)
                    neighbours[name] = float(score)
                author_recommend[author[index]] = neighbours
        self._ensure_parent_dir(self.target_file_path)
        with open(self.target_file_path, "w") as f_out:
            for cookie in questions:
                # Cold start: a reader absent from the training data has no log.
                log = data_set.get(cookie, {})
                recommend_author = {}
                for author_id in log:
                    for r_author, score in author_recommend[author_id].items():
                        if r_author in log:
                            continue
                        recommend_author[r_author] = recommend_author.get(r_author, 0) + score
                # NOTE(review): only len(candidates) // 20 authors are kept, so a
                # reader with fewer than 20 candidates gets none -- confirm this
                # cut-off is intended.
                top_authors = heapq.nlargest(len(recommend_author) // 20, recommend_author, key=recommend_author.__getitem__)
                js = {"cookie_pta": cookie, "author_id": top_authors}
                f_out.write(json.dumps(js) + "\n")

    def main(self):
        """Run the full pipeline: load, compute similarity, load targets, recommend."""
        self.load_data()
        self.calculate_item_similarity()
        self.load_recommend_target()
        self.recommend_from_item_similarity()

In [3]:
# Build the recommender with its default paths and k, then run the whole
# pipeline: load training data, compute item similarity, load the target
# readers and write their recommendations.
recommendation = PIXNET_Recommendation()
recommendation.main()