In [None]:
from __future__ import division


import itertools
import pickle
import datetime
import hashlib
import locale
import numpy as np
import pycountry
import scipy.io as sio
import scipy.sparse as ss
import scipy.spatial.distance as ssd

from collections import defaultdict
from sklearn.preprocessing import normalize

#Try a few new libs
testIdMap = defaultdict(int)
testIdMap["patric"] = 82
testIdMap["john"] = 91
testIdMap

In [None]:
class DataCleaner:
  """
  Common utilities for converting strings to equivalent numbers
  or number buckets.
  """
  def __init__(self):
    # 载入 locales
    self.localeIdMap = defaultdict(int)
    for i, l in enumerate(locale.locale_alias.keys()):
      self.localeIdMap[l] = i + 1
    # 载入 countries
    self.countryIdMap = defaultdict(int)
    ctryIdx = defaultdict(int)
    for i, c in enumerate(pycountry.countries):
      self.countryIdMap[c.name.lower()] = i + 1
      if c.name.lower() == "usa":
        ctryIdx["US"] = i
      if c.name.lower() == "canada":
        ctryIdx["CA"] = i
    for cc in ctryIdx.keys():
      for s in pycountry.subdivisions.get(country_code=cc):
        self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
    # 载入 gender id 字典
    self.genderIdMap = defaultdict(int, {"male":1, "female":2})

  def getLocaleId(self, locstr):
    return self.localeIdMap[locstr.lower()]

  def getGenderId(self, genderStr):
    return self.ProgramEntities([genderStr])

  def getJoinedYearMonth(self, dateString):
    dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
    return "".join([str(dttm.year), str(dttm.month)])

  def getCountryId(self, location):
    if (isinstance(location, str)
        and len(location.strip()) > 0
        and location.rfind("  ") > -1):
      return self.countryIdMap[location[location.rindex("  ") + 2:].lower()]
    else:
      return 0

  def getBirthYearInt(self, birthYear):
    try:
      return 0 if birthYear == "None" else int(birthYear)
    except:
      return 0

  def getTimezoneInt(self, timezone):
    try:
      return int(timezone)
    except:
      return 0

  def getFeatureHash(self, value):
    if len(value.strip()) == 0:
      return -1
    else:
      return int(hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)

  def getFloatValue(self, value):
    if len(value.strip()) == 0:
      return 0.0
    else:
      return float(value)

In [None]:
cleaner = DataCleaner()
print("pyCountries " + str(pycountry.countries))
print("countryIdMap " + str(cleaner.countryIdMap))
print("genderIdMap " + str(cleaner.genderIdMap))
print("hashValue " + str(cleaner.getFeatureHash('PatricWan')))

In [4]:
class ProgramEntities:
  """
  我们只关心train和test中出现的user和event，因此重点处理这部分关联数据
  # 统计训练集中有多少独立的用户的events
  """
  def __init__(self):
    
    uniqueUsers = set()
    uniqueEvents = set()
    eventsForUser = defaultdict(set)
    usersForEvent = defaultdict(set)
    
    count = 0
    for filename in ["../../data/recommend/train.csv", "../../data/recommend/test.csv"]:
      f = open(filename, 'rb')
      f.readline().strip().decode().split(",")
      for line in f:
        cols = line.strip().decode().split(",")
        uniqueUsers.add(cols[0])
        uniqueEvents.add(cols[1])
        eventsForUser[cols[0]].add(cols[1])
        usersForEvent[cols[1]].add(cols[0])
        count = count + 1
        #if (count>300):
        #  break
      f.close()
    #print("uniqueUsers " +str(uniqueUsers))
    #print("uniqueEvents " +str(uniqueEvents))
    #print("eventsForUser " +str(eventsForUser))
    #print("usersForEvent " +str(usersForEvent))
    
    self.userEventScores = ss.dok_matrix((len(uniqueUsers), len(uniqueEvents)))
    
    self.userIndex = dict()
    self.eventIndex = dict()
   
    for i, u in enumerate(uniqueUsers):
      self.userIndex[u] = i
    for i, e in enumerate(uniqueEvents):
      self.eventIndex[e] = i
   
    ftrain = open("../../data/recommend/train.csv", 'rb')
    ftrain.readline()
    count = 0
    for line in ftrain:
      cols = line.strip().decode().split(",")
      i = self.userIndex[cols[0]]
      j = self.eventIndex[cols[1]]
      self.userEventScores[i, j] = int(cols[4]) - int(cols[5])
      count = count + 1
      #if (count>300):
      #   break
    ftrain.close()
    #print(self.userEventScores)
    
    sio.mmwrite("PE_userEventScores", self.userEventScores)
    # 为了防止不必要的计算，我们找出来所有关联的用户 或者 关联的event
    # 所谓的关联用户，指的是至少在同一个event上有行为的用户pair
    # 关联的event指的是至少同一个user有行为的event pair
    # combinations('ABCD', 2)    AB AC AD BC BD CD
    self.uniqueUserPairs = set()
    self.uniqueEventPairs = set()
    for event in uniqueEvents:
      users = usersForEvent[event]
      if len(users) > 2:
        #print("users " + str(users))
        self.uniqueUserPairs.update(itertools.combinations(users, 2))
    
    #print("uniqueUserPairs " + str(self.uniqueUserPairs))
    for user in uniqueUsers:
      events = eventsForUser[user]
      if len(events) > 2:
        self.uniqueEventPairs.update(itertools.combinations(events, 2))
    
    #sio.mmwrite("PE_uniqueUserPairspw", self.uniqueUserPairs)
    #sio.mmwrite("PE_uniqueEventPairs", self.uniqueEventPairs)
    
    pickle.dump(self.userIndex, open("PE_userIndexpw.pkl", 'wb'))
    pickle.dump(self.eventIndex, open("PE_eventIndexpw.pkl", 'wb'))

In [5]:
programEntities = ProgramEntities()

In [6]:
class Users:
  """
  构建 user/user 相似度矩阵
  """
  def __init__(self, programEntities, sim=ssd.correlation):
    cleaner = DataCleaner()
    
    nusers = len(programEntities.userIndex.keys())
    fin = open("../../data/recommend/users.csv", 'rb')
    
    colnames = fin.readline().strip().decode().split(",")
    self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
    
    for line in fin:
      cols = line.strip().decode().split(",")
      # 只考虑train.csv中出现的用户
      if cols[0] in programEntities.userIndex:
        i = programEntities.userIndex[cols[0]]
        self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
        self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
        #self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])
        self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
        self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])
        self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])
    fin.close()
    
    # 归一化用户矩阵
    print(self.userMatrix)
    self.userMatrix = normalize(self.userMatrix, norm="l1", axis=0, copy=False)
    
    sio.mmwrite("US_userMatrix", self.userMatrix)
    
    # 计算用户相似度矩阵，之后会用到
    self.userSimMatrix = ss.dok_matrix((nusers, nusers))
    for i in range(0, nusers):
      self.userSimMatrix[i, i] = 1.0
    
    for u1, u2 in programEntities.uniqueUserPairs:
      i = programEntities.userIndex[u1]
      j = programEntities.userIndex[u2]
      if not (i, j) in self.userSimMatrix:
        usim = sim(self.userMatrix.getrow(i).todense(), self.userMatrix.getrow(j).todense())
        self.userSimMatrix[i, j] = usim
        self.userSimMatrix[j, i] = usim
    print(self.userSimMatrix)
    sio.mmwrite("US_userSimMatrix", self.userSimMatrix)

In [7]:
users = Users(programEntities)

  (1970, 0)	246.0
  (1970, 1)	1993.0
  (1970, 3)	201211.0
  (1970, 5)	-300.0
  (1358, 0)	136.0
  (1358, 1)	1993.0
  (1358, 3)	20127.0
  (1358, 5)	-240.0
  (793, 0)	246.0
  (793, 1)	1994.0
  (793, 3)	201211.0
  (793, 5)	420.0
  (1269, 0)	162.0
  (1269, 1)	1994.0
  (1269, 3)	201211.0
  (1269, 4)	64.0
  (1269, 5)	60.0
  (517, 0)	246.0
  (517, 1)	1994.0
  (517, 3)	201210.0
  (517, 4)	103.0
  (517, 5)	420.0
  (833, 0)	136.0
  (833, 1)	1981.0
  (833, 3)	20127.0
  :	:
  (184, 3)	20128.0
  (184, 5)	540.0
  (201, 0)	246.0
  (201, 1)	1952.0
  (201, 3)	20129.0
  (201, 4)	103.0
  (201, 5)	420.0
  (390, 0)	246.0
  (390, 1)	1994.0
  (390, 3)	201210.0
  (390, 4)	103.0
  (390, 5)	420.0
  (3331, 0)	136.0
  (3331, 1)	1996.0
  (3331, 3)	201210.0
  (3331, 5)	420.0
  (2546, 0)	136.0
  (2546, 1)	1996.0
  (2546, 3)	201210.0
  (2546, 5)	420.0
  (1802, 0)	246.0
  (1802, 1)	1994.0
  (1802, 3)	201210.0
  (1802, 4)	103.0
  (1802, 5)	420.0
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1

In [8]:
class UserFriends:
  """
  找出某用户的那些朋友，想法非常简单
  1)如果你有更多的朋友，可能你性格外向，更容易参加各种活动
  2)如果你朋友会参加某个活动，可能你也会跟随去参加一下
  """
  def __init__(self, programEntities):
    nusers = len(programEntities.userIndex.keys())
    self.numFriends = np.zeros((nusers))
    self.userFriends = ss.dok_matrix((nusers, nusers))
    fin = open("../../data/recommend/user_friends.csv", 'rb')
    fin.readline()                # skip header
    ln = 0
    for line in fin:
      if ln % 200 == 0:
        print("Loading line: ", ln)
      cols = line.strip().decode().split(",")
      user = cols[0]
      if user in programEntities.userIndex:
        friends = cols[1].split(" ")
        i = programEntities.userIndex[user]
        self.numFriends[i] = len(friends)
        for friend in friends:
          if friend in programEntities.userIndex:
            j = programEntities.userIndex[friend]
            # the objective of this score is to infer the degree to
            # and direction in which this friend will influence the
            # user's decision, so we sum the user/event score for
            # this user across all training events.
            eventsForUser = programEntities.userEventScores.getrow(j).todense()
            score = eventsForUser.sum() / np.shape(eventsForUser)[1]
            self.userFriends[i, j] += score
            self.userFriends[j, i] += score
      ln += 1
    fin.close()
    # 归一化数组
    sumNumFriends = self.numFriends.sum(axis=0)
    self.numFriends = self.numFriends / sumNumFriends
    sio.mmwrite("UF_numFriends", np.matrix(self.numFriends))
    self.userFriends = normalize(self.userFriends, norm="l1", axis=0, copy=False)
    sio.mmwrite("UF_userFriends", self.userFriends)

In [9]:
userFriends = UserFriends(programEntities)
print(userFriends.numFriends)
print(userFriends.userFriends)

Loading line:  0
Loading line:  200
Loading line:  400
Loading line:  600
Loading line:  800
Loading line:  1000
Loading line:  1200
Loading line:  1400
Loading line:  1600
Loading line:  1800
Loading line:  2000
Loading line:  2200
Loading line:  2400
Loading line:  2600
Loading line:  2800
Loading line:  3000
Loading line:  3200
Loading line:  3400
Loading line:  3600
Loading line:  3800
Loading line:  4000
Loading line:  4200
Loading line:  4400
Loading line:  4600
Loading line:  4800
Loading line:  5000
Loading line:  5200
Loading line:  5400
Loading line:  5600
Loading line:  5800
Loading line:  6000
Loading line:  6200
Loading line:  6400
Loading line:  6600
Loading line:  6800
Loading line:  7000
Loading line:  7200
Loading line:  7400
Loading line:  7600
Loading line:  7800
Loading line:  8000
Loading line:  8200
Loading line:  8400
Loading line:  8600
Loading line:  8800
Loading line:  9000
Loading line:  9200
Loading line:  9400
Loading line:  9600
Loading line:  9800
Loading

In [10]:
class Events:
  """
  构建event-event相似度，注意这里有2种相似度：
  1）由用户-event行为，类似协同过滤算出的相似度
  2）由event本身的内容(event信息)计算出的event-event相似度
  """
  def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
    cleaner = DataCleaner()
    fin = open("../../data/recommend/events.csv", 'rb')
    fin.readline() # skip header
    nevents = len(programEntities.eventIndex.keys())
    self.eventPropMatrix = ss.dok_matrix((nevents, 7))
    self.eventContMatrix = ss.dok_matrix((nevents, 100))
    ln = 0
    for line in fin.readlines():
#      if ln > 10:
#        break
      cols = line.strip().decode().split(",")
      eventId = cols[0]
      if eventId in programEntities.eventIndex:
        i = programEntities.eventIndex[eventId]
        self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2]) # start_time
        self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3]) # city
        self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4]) # state
        self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5]) # zip
        self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6]) # country
        self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7]) # lat
        self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8]) # lon
        for j in range(9, 109):
          self.eventContMatrix[i, j-9] = cols[j]
        ln += 1
    fin.close()
    self.eventPropMatrix = normalize(self.eventPropMatrix,
        norm="l1", axis=0, copy=False)
    sio.mmwrite("EV_eventPropMatrix", self.eventPropMatrix)
    
    self.eventContMatrix = normalize(self.eventContMatrix,
        norm="l1", axis=0, copy=False)
    sio.mmwrite("EV_eventContMatrix", self.eventContMatrix)
    
    # calculate similarity between event pairs based on the two matrices    
    self.eventPropSim = ss.dok_matrix((nevents, nevents))
    self.eventContSim = ss.dok_matrix((nevents, nevents))
    
    for e1, e2 in programEntities.uniqueEventPairs:
      i = programEntities.eventIndex[e1]
      j = programEntities.eventIndex[e2]
      if not (i,j) in self.eventPropSim:
        epsim = psim(self.eventPropMatrix.getrow(i).todense(),  self.eventPropMatrix.getrow(j).todense())
        self.eventPropSim[i, j] = epsim
        self.eventPropSim[j, i] = epsim
      if not (i,j) in self.eventContSim:
        ecsim = csim(self.eventContMatrix.getrow(i).todense(), self.eventContMatrix.getrow(j).todense())
        self.eventContSim[i, j] = epsim
        self.eventContSim[j, i] = epsim
    sio.mmwrite("EV_eventPropSim", self.eventPropSim)
    sio.mmwrite("EV_eventContSim", self.eventContSim)

In [11]:
events = Events(programEntities)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [12]:
class EventAttendees():
  """
  统计某个活动，参加和不参加的人数，从而为活动活跃度做准备
  """
  def __init__(self, programEvents):
    nevents = len(programEvents.eventIndex.keys())
    self.eventPopularity = ss.dok_matrix((nevents, 1))
    f = open("../../data/recommend/event_attendees.csv", 'rb')
    f.readline() # skip header
    for line in f:
      cols = line.strip().decode().split(",")
      eventId = cols[0]
      if eventId in programEvents.eventIndex:
        i = programEvents.eventIndex[eventId]
        self.eventPopularity[i, 0] = len(cols[1].split(" ")) - len(cols[4].split(" "))
    f.close()
    self.eventPopularity = normalize(self.eventPopularity, norm="l1", axis=0, copy=False)
    sio.mmwrite("EA_eventPopularity", self.eventPopularity)

In [13]:
 eventAttendees = EventAttendees(programEntities)

In [14]:
# 这是构建特征部分
from __future__ import division

import pickle
import numpy as np
import scipy.io as sio

class DataRewriter:
  def __init__(self):
    # 读入数据做初始化
    self.userIndex = pickle.load(open("PE_userIndex.pkl", 'rb'))
    self.eventIndex = pickle.load(open("PE_eventIndex.pkl", 'rb'))
    self.userEventScores = sio.mmread("PE_userEventScores").todense()
    self.userSimMatrix = sio.mmread("US_userSimMatrix").todense()
    self.eventPropSim = sio.mmread("EV_eventPropSim").todense()
    self.eventContSim = sio.mmread("EV_eventContSim").todense()
    self.numFriends = sio.mmread("UF_numFriends")
    self.userFriends = sio.mmread("UF_userFriends").todense()
    self.eventPopularity = sio.mmread("EA_eventPopularity").todense()
    
  def userReco(self, userId, eventId):
    """
    根据User-based协同过滤，得到event的推荐度
    基本的伪代码思路如下：
    for item i
      for every other user v that has a preference for i
        compute similarity s between u and v
        incorporate v's preference for i weighted by s into running aversge
    return top items ranked by weighted average
    """
    i = self.userIndex[userId]
    j = self.eventIndex[eventId]
    #got j+1 col
    vs = self.userEventScores[:, j]
    
    #got row i+1
    sims = self.userSimMatrix[i, :]
    prod = sims * vs
    try:
      return prod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      return 0

  def eventReco(self, userId, eventId):
    """
    根据基于物品的协同过滤，得到Event的推荐度
    基本的伪代码思路如下：
    for item i 
      for every item j tht u has a preference for
        compute similarity s between i and j
        add u's preference for j weighted by s to a running average
    return top items, ranked by weighted average
    """
    i = self.userIndex[userId]
    j = self.eventIndex[eventId]
    js = self.userEventScores[i, :]
    
    psim = self.eventPropSim[:, j]
    csim = self.eventContSim[:, j]
    
    pprod = js * psim
    cprod = js * csim
    pscore = 0
    cscore = 0
    try:
      pscore = pprod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      pass
    try:
      cscore = cprod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      pass
    return pscore, cscore

  def userPop(self, userId):
    """
    基于用户的朋友个数来推断用户的社交程度
    主要的考量是如果用户的朋友非常多，可能会更倾向于参加各种社交活动
    """
    if userId in self.userIndex:
      i = self.userIndex[userId]
      try:
        return self.numFriends[0, i]
      except IndexError:
        return 0
    else:
      return 0

  def friendInfluence(self, userId):
    """
    朋友对用户的影响
    主要考虑用户所有的朋友中，有多少是非常喜欢参加各种社交活动/event的
    用户的朋友圈如果都积极参与各种event，可能会对当前用户有一定的影响
    """
    nusers = np.shape(self.userFriends)[1]
    i = self.userIndex[userId]
    return (self.userFriends[i, :].sum(axis=0) / nusers)[0,0]

  def eventPop(self, eventId):
    """
    本活动本身的热度
    主要是通过参与的人数来界定的
    """
    i = self.eventIndex[eventId]
    return self.eventPopularity[i, 0]

  def rewriteData(self, start=1, train=True, header=True):
    """
    把前面user-based协同过滤 和 item-based协同过滤，以及各种热度和影响度作为特征组合在一起
    生成新的训练数据，用于分类器分类使用
    """
    fn = "../../data/recommend/train.csv" if train else "../../data/recommend/test.csv"
    fin = open(fn, 'r')
    fout = open(fn + "_data.csv", 'w')
    # write output header
    if header:
      ocolnames = ["invited", "user_reco", "evt_p_reco",
        "evt_c_reco", "user_pop", "frnd_infl", "evt_pop"]
      if train:
        ocolnames.append("interested")
        ocolnames.append("not_interested")
      fout.write(",".join(ocolnames) + "\n")
    ln = 0
    for line in fin:
      ln += 1
      if ln < start:
        continue
      cols = line.strip().split(",")  #.decode()
      userId = cols[0]
      eventId = cols[1]
      invited = cols[2]
      if ln%500 == 0: dvscax 
          print("%s:%d (userId, eventId)=(%s, %s)" % (fn, ln, userId, eventId))
      user_reco = self.userReco(userId, eventId)
      evt_p_reco, evt_c_reco = self.eventReco(userId, eventId)
      user_pop = self.userPop(userId)
      frnd_infl = self.friendInfluence(userId)
      evt_pop = self.eventPop(eventId)
      ocols = [invited, user_reco, evt_p_reco,
        evt_c_reco, user_pop, frnd_infl, evt_pop]
      if train:
        ocols.append(cols[4]) # interested
        ocols.append(cols[5]) # not_interested
      fout.write(",".join(map(lambda x: str(x), ocols)) + "\n")
    fin.close()
    fout.close()

  def rewriteTrainingSet(self):
    self.rewriteData(True)

  def rewriteTestSet(self):
    self.rewriteData(False)

# When running with cython, the actual class will be converted to a .so
# file, and the following code (along with the commented out import below)
# will need to be put into another .py and this should be run.

#import CRegressionData as rd

dr = DataRewriter()
print("Transform training data...\n")
dr.rewriteData(train=True, start=2, header=True)
print("Transform test data...\n")
dr.rewriteData(train=False, start=2, header=True)
print(dr.userIndex)


IndentationError: unexpected indent (<ipython-input-14-eaadf0b8198b>, line 133)

In [None]:
print(dr.eventIndex)

In [None]:
print(dr.userEventScores)

In [None]:
print(dr.userSimMatrix)

In [None]:
print(dr.eventPropSim)

In [None]:
print(dr.eventContSim)

In [None]:
print(dr.numFriends)

In [None]:
print(dr.userFriends)

In [None]:
print(dr.eventPopularity)