In [28]:
import pymongo, os, bson, json, glob, pickle, gridfs, sys, argparse, pickle
from collections import Counter
from sklearn.preprocessing import normalize
from random import sample
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.sparse import csr_matrix, random
from bson.binary import Binary
from os.path import join as opj
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
from sklearn import metrics
from collections import OrderedDict
from bson.son import SON
from pprint import pprint

#Import utilities
sys.path.append("..")
import dbfind #db search tool
import utils
db = dbfind.db 


In [20]:
print(db.list_collection_names())

['tfidf_matrix.files', 'maplogs', 'item_embeddings', 'tech_tree', 'lifelogs', 'item_links_demo', 'objects', 'randomized_job_matrix.chunks', 'random_avatar_embeddings', 'expanded_transitions', 'avatar_embeddings', 'tfidf_matrix.chunks', 'transitions', 'activity_matrix.files', 'activity_matrix.chunks', 'cleaned_job_matrix.chunks', 'cleaned_job_matrix.files', 'item_interactions', 'nmf_validation', 'random_item_embeddings', 'activity_labels', 'categories', 'randomized_job_matrix.files']


# Goal: create a DF with the following columns

- Family
- Time (hour intervals)
- Target tech
- Proximity to repertoire
- Diversity
- Developed? (y/n)


# Construct family demographics

In [119]:
#create a list of all families (within range of avatar ids available in the jobmatrix)
familyList = [x for x in db.lifelogs.find({'avatar':{'$gt':2276905, '$lt':4107654}}).distinct('family')] #slow
len(familyList)

55339

In [120]:
#DEBUG: subsample to 1000 families for testing
familyList = sample(familyList,100)

In [121]:
#now aggregate into an ordered list 
famAgg = db.lifelogs.aggregate([{'$match':{'family':{"$in":familyList}}}, #This line can be removed if not subsampling
                              {'$group':{'_id':'$family', 'count':{'$sum':1}}}, #Group by family and count the number of avatars
                              {"$sort": SON([("count", -1), ("_id", -1)])}]) #sort

In [122]:
famDict ={}
for q in famAgg:
    print(q)
    famDict[q['_id']] = q['count']
famDict

{'_id': 'time-1596110958_eve-3215639_name-(missing)', 'count': 1175}
{'_id': 'time-1597353672_eve-3251667_name-PERSON', 'count': 561}
{'_id': 'time-1602432145_eve-3439317_name-BURNS', 'count': 488}
{'_id': 'time-1610485043_eve-3871285_name-KAWSKI', 'count': 378}
{'_id': 'time-1605199499_eve-3528895_name-LOVE', 'count': 198}
{'_id': 'time-1596149475_eve-3216925_name-US', 'count': 104}
{'_id': 'time-1594958274_eve-3181492_name-PRIMA', 'count': 75}
{'_id': 'time-1592495380_eve-3085648_name-SHARPES', 'count': 64}
{'_id': 'time-1600739279_eve-3363147_name-INFIELD', 'count': 11}
{'_id': 'time-1593291645_eve-3114126_name-(missing)', 'count': 9}
{'_id': 'time-1593282327_eve-3112938_name-(missing)', 'count': 4}
{'_id': 'time-1611221137_eve-3900056_name-TREE', 'count': 3}
{'_id': 'time-1600646023_eve-3352706_name-(missing)', 'count': 3}
{'_id': 'time-1593282237_eve-3112922_name-(missing)', 'count': 3}
{'_id': 'time-1593257982_eve-3110542_name-(missing)', 'count': 3}
{'_id': 'time-1587149601_eve-

{'time-1596110958_eve-3215639_name-(missing)': 1175,
 'time-1597353672_eve-3251667_name-PERSON': 561,
 'time-1602432145_eve-3439317_name-BURNS': 488,
 'time-1610485043_eve-3871285_name-KAWSKI': 378,
 'time-1605199499_eve-3528895_name-LOVE': 198,
 'time-1596149475_eve-3216925_name-US': 104,
 'time-1594958274_eve-3181492_name-PRIMA': 75,
 'time-1592495380_eve-3085648_name-SHARPES': 64,
 'time-1600739279_eve-3363147_name-INFIELD': 11,
 'time-1593291645_eve-3114126_name-(missing)': 9,
 'time-1593282327_eve-3112938_name-(missing)': 4,
 'time-1611221137_eve-3900056_name-TREE': 3,
 'time-1600646023_eve-3352706_name-(missing)': 3,
 'time-1593282237_eve-3112922_name-(missing)': 3,
 'time-1593257982_eve-3110542_name-(missing)': 3,
 'time-1587149601_eve-2903025_name-(missing)': 3,
 'time-1600571684_eve-3342173_name-(missing)': 2,
 'time-1593278738_eve-3112454_name-(missing)': 2,
 'time-1593275553_eve-3111966_name-(missing)': 2,
 'time-1591016606_eve-3040493_name-(missing)': 2,
 'time-1615689315_e

# Diversity

In [137]:
#test using a single family
fam =list(famDict.keys())[5]
avatarIds = [x[0] for x in dbfind.avatar(fam, target='family', fields = ['avatar'])]
dbfind.avatarVec(avatarIds)

[]

In [138]:
avatarIds

[3217743,
 3217723,
 3217717,
 3217713,
 3217689,
 3217686,
 3217671,
 3217667,
 3217656,
 3217646,
 3217630,
 3217608,
 3217598,
 3217596,
 3217578,
 3217572,
 3217570,
 3217566,
 3217565,
 3217559,
 3217557,
 3217553,
 3217545,
 3217542,
 3217533,
 3217527,
 3217522,
 3217514,
 3217506,
 3217493,
 3217486,
 3217483,
 3217478,
 3217462,
 3217440,
 3217393,
 3217383,
 3217360,
 3217346,
 3217345,
 3217338,
 3217335,
 3217327,
 3217322,
 3217318,
 3217317,
 3217305,
 3217302,
 3217299,
 3217295,
 3217294,
 3217293,
 3217291,
 3217278,
 3217273,
 3217268,
 3217264,
 3217260,
 3217259,
 3217256,
 3217253,
 3217245,
 3217244,
 3217205,
 3217202,
 3217171,
 3217165,
 3217150,
 3217147,
 3217146,
 3217142,
 3217140,
 3217132,
 3217131,
 3217127,
 3217126,
 3217118,
 3217113,
 3217110,
 3217102,
 3217097,
 3217087,
 3217085,
 3217084,
 3217083,
 3217080,
 3217073,
 3217072,
 3217068,
 3217058,
 3217043,
 3217040,
 3217039,
 3217035,
 3217033,
 3217013,
 3216997,
 3216968,
 3216962,
 3216960,


In [139]:
#Compute diversity for each family
diversityDict = {}
for fam in famDict.keys():
    avatarIds = [x[0] for x in dbfind.avatar(fam, target='family', fields = ['avatar'])]
    avatarVec = dbfind.avatarVec(avatarIds) #TODO: convert to array
    #TODO normalize each avatar Vec
    meanVec = np.mean(avatarVec, axis=0) #Check that the axis direction is correct
    diversity = []
    for (avatar in 1:len(avatarIds)):
        diversity.append(np.abs(avatarVec[avatar] - meanVec)) #Krugman specialization
    diversityDict[fam] = np.mean(diversity)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
