In [2]:
%matplotlib inline
import networkx as nx
import numpy as np

In [None]:
def readfeaturelist(filename):
    """
    Load the feature names stored one per line in `filename`.

    Leading/trailing whitespace (including the newline) is stripped from every
    line; blank lines are kept as empty strings.  Returns the names as a list
    in file order.
    """
    with open(filename) as handle:
        return [raw.strip() for raw in handle]

In [None]:
features = readfeaturelist('featureList.txt')

In [None]:
features

In [None]:
def readfeatures(featurefile):
    """
    Parse a feature file whose lines look like
        userid feature;value feature;value ...

    Returns a list indexed by user id.  Each element is a dictionary mapping a
    feature name to the list of integer values seen for it on that line (in
    order of appearance).  The first whitespace-separated token of a line is
    skipped; the feature name is everything before the last ';' of a token.

    Every profile must carry an 'id' feature whose first value equals the
    profile's position in the returned list; an AssertionError is raised
    otherwise.
    """
    profiles = []
    with open(featurefile) as handle:
        for record in handle:
            user_profile = {}  # feature name -> list of values for this user
            for field in record.split()[1:]:
                name, raw_value = field.rsplit(';', 1)
                value = int(raw_value)
                user_profile.setdefault(name, []).append(value)
            profiles.append(user_profile)
    # sanity check: line k of the file must describe user k
    for position, user_profile in enumerate(profiles):
        assert user_profile['id'][0] == position
    return profiles

In [None]:
profile = readfeatures('features.txt')

In [None]:
profile[3]

In [None]:
def featurematch(profile1,profile2,feature):
    """
    Score how well profile1 and profile2 agree on a single feature.

    The score is the number of distinct values the two profiles have in common
    for `feature`; it is 0 when either profile lacks the feature.
    """
    if feature not in profile1 or feature not in profile2:
        return 0
    shared = set(profile1[feature]) & set(profile2[feature])
    return len(shared)

In [None]:
featurematch(profile[0],profile[1],'education;school;id')

In [None]:
def matchvector(profile1,profile2,featurelist):
    """
    Build the similarity vector of two profiles over `featurelist`.

    Entry i is featurematch(profile1, profile2, featurelist[i]), i.e. the
    number of values the two profiles share for that feature (for example 2
    if they list the same two school ids).  Returns a list as long as
    `featurelist`.
    """
    return [featurematch(profile1,profile2,name) for name in featurelist]

In [None]:
mvec = matchvector(profile[0],profile[1],features)
mvec

In [None]:
a=[1,2,4,5]
b=[4,3,2,1]
np.inner(a,b)

In [None]:
def weighteddotproduct(vector1,vector2,weight=None):
    """
    Weighted dot product of vector1 and vector2, normalized by the mean weight:

        sum_i vector1[i] * weight[i] * vector2[i] / mean(weight)

    vector1, vector2 -- equal-length sequences of numbers
    weight           -- optional sequence of per-entry weights (list or numpy
                        array); when omitted (None) or empty, every entry gets
                        weight 1, which reduces to the plain dot product.
    """
    # `if not weight` is ambiguous (ValueError) for numpy arrays, so test
    # explicitly for "no weights given" instead of relying on truthiness.
    if weight is None or len(weight) == 0:
        weight = np.ones(len(vector1))
    return np.inner(vector1,np.multiply(weight,vector2))/np.mean(weight)

In [None]:
weighteddotproduct(a,a,b)

In [None]:
np.inner(b,a)

In [None]:
sqrt(sum(b))

In [None]:
weighteddotproduct(b,b,a)

In [None]:
weighteddotproduct(b,b)

In [None]:
np.inner(b,b)

In [None]:
weighteddotproduct(mvec,mvec)

In [None]:
def userfeatures(profile):
    """  List the feature names (the keys) present in a user profile """
    return list(profile)

In [None]:
userfeatures(profile[3])

In [None]:
def usermatch(profile1,profile2):
    """ Match vector of profile2 against profile1, restricted to the features profile1 itself contains """
    reference_features = userfeatures(profile1)
    return matchvector(profile1,profile2,reference_features)

In [None]:
usermatch(profile[2],profile[3])

In [None]:
def readcircle(userID):
    """
    Read the circles file of one user, ./Training/<userID>.circles, whose
    lines look like
        circleDD: user1 user2 user3 ...

    Returns a dictionary mapping the integer circle number DD to the list of
    integer member ids on that line, in file order.
    """
    path = './Training/'+str(userID)+'.circles'
    result = {}
    with open(path) as handle:
        for record in handle:
            fields = record.split()
            # first token is 'circle<number>:'; keep only the number
            number = int(fields[0].split('circle')[1].split(':')[0])
            result[number] = [int(member) for member in fields[1:]]
    return result

**TODO:** The entire analysis below, which determines the characteristic profile of a circle, should be redone and cleaned up.

In [None]:
circles239 = readcircle(239)

In [None]:
circles239[16]

In [None]:
matchmatrix_u239_c16 = [matchvector(profile[239],profile[user],features) for user in circles239[16] ]

In [None]:
zeros(len(matchmatrix_u239_c16[0]))

In [None]:
def charprofile(profilemat):
    """
    Sum a matrix of match vectors column by column.

    profilemat -- sequence of equal-length rows (one match vector per user)

    Returns a float numpy array whose entry i is the sum of entry i over all
    rows.  An empty `profilemat` yields an empty array instead of raising.
    """
    if len(profilemat) == 0:
        return np.zeros(0)
    # column sums; dtype=float keeps the float result the accumulator loop produced
    return np.asarray(profilemat, dtype=float).sum(axis=0)
    

In [None]:
char_u239_c16 = charprofile(matchmatrix_u239_c16)
char_u239_c16

In [None]:
print features[20],char_u239_c16[20]
print features[27],char_u239_c16[27]

gender 30.0
locale 43.0


In [None]:
def display_char_profile(charprofile,featurelist):
    """
    Print every feature whose entry in the characteristic profile is non-zero.

    charprofile -- sequence of totals, indexed like `featurelist`
    featurelist -- list of feature names

    One line "<feature name> <total>" is printed per non-zero entry.
    """
    for i, name in enumerate(featurelist):
        if charprofile[i] != 0:
            # single pre-formatted argument: prints identically under Python 2 and 3
            print('%s %s' % (name, charprofile[i]))

In [None]:
display_char_profile(char_u239_c16,features)

gender 30.0
last_name 1.0
locale 43.0


In [None]:
# Build a "characteristic profile" for every circle of one ego user: for each
# circle, sum the match vectors between the ego and each circle member, then
# print the features whose total is non-zero.
ref_user = 345
print 'User:', ref_user
ref_profile = profile[ref_user]
circles = readcircle(ref_user)
ch_profile={}  # circle id -> summed match vector for that circle
for circle in circles:
    print 'Circle:', circle
    # one match vector (ego vs. member) per user in this circle
    matchmatrix = [matchvector(ref_profile,profile[user],features) for user in circles[circle] ]
    ch_profile[circle]=charprofile(matchmatrix)
    display_char_profile(ch_profile[circle],features)
    print ''
    

User: 345
Circle: 20
education;type 37.0
gender 18.0
hometown;id 4.0
hometown;name 4.0
languages;id 3.0
languages;name 4.0
last_name 2.0
locale 10.0
location;id 1.0
location;name 1.0

Circle: 22
education;school;id 70.0
education;school;name 70.0
education;type 143.0
first_name 3.0
gender 106.0


languages;id 145.0
languages;name 161.0
locale 109.0
location;id 45.0
location;name 45.0
religion 1.0

Circle: 24
education;type 5.0
gender 5.0
locale 4.0

Circle: 26
education;school;id 1.0
education;school;name 1.0
education;type 21.0
gender 17.0
hometown;id 3.0
hometown;name 3.0
languages;id 11.0
languages;name 11.0
locale 22.0
location;id 2.0
location;name 2.0

Circle: 27
education;type 14.0
gender 9.0
languages;id 2.0
languages;name 2.0
locale 15.0
location;id 2.0
location;name 2.0

Circle: 28
education;school;id 2.0
education;school;name 2.0
education;type 10.0
gender 5.0
languages;id 3.0
languages;name 4.0
locale 6.0

Circle: 29
education;school;id

 2.0
education;school;name 2.0
education;type 40.0
gender 29.0
hometown;id 10.0
hometown;name 10.0
languages;id 11.0
languages;name 12.0
locale 45.0
location;id 2.0
location;name 2.0



In [None]:
for circle in circles:
    print len(circles[circle])

30
145
6
27
16
7
48


The analysis below examines the distribution of circle sizes across the entire training set.

In [None]:
import os
# Load the circles of every ego user that has a file in ./Training/.
# Only '<ego id>.circles' entries are used, so stray directory entries
# (hidden files, checkpoint folders, ...) no longer make int() fail.
trainingfiles = [item for item in os.listdir('./Training/') if item.endswith('.circles')]
alltraining ={}  # ego id -> {circle id: [member ids]}
for item in trainingfiles:
    ego = int((item.split('.')[0]))
    alltraining[ego]=readcircle(ego)

In [None]:
# Flatten the per-ego circle dictionaries into one list holding the size of every circle.
circlesizes = [len(members)
               for egocircles in alltraining.values()
               for members in egocircles.values()]

In [None]:
print 'Total Circles:',len(circlesizes),'Largest circle:',max(circlesizes)

Total Circles: 592 Largest circle: 335


In [None]:
# Normalized histogram of circle sizes using 50 bins.
# NOTE(review): `normed` was replaced by `density` in newer matplotlib releases;
# confirm which matplotlib version this notebook targets.
# NOTE(review): `plt` is not imported in the visible cells; presumably it comes from a pylab-style namespace.
n,bins,patches = plt.hist(circlesizes,50,normed=1)

The cells below require the `powerlaw` package (install from https://pypi.python.org/pypi/powerlaw) and `mpmath` (http://mpmath.org/).

In [None]:
import powerlaw

# Fit the circle sizes with xmin fixed at 1.0, so the whole size range is used for the fit.
fit = powerlaw.Fit(circlesizes,xmin=1.0)
print 'Power law parameters'
print 'xmin:', fit.xmin
print 'alpha:', fit.power_law.alpha
print 'sigma:', fit.power_law.sigma
print 'Kolmorgorov-Smirnov Distance:', fit.power_law.D
print 'Comparison of different fit distributions'
# Compare the power law against every distribution flagged in supported_distributions
# and print the tuple distribution_compare returns.
# NOTE(review): per the powerlaw docs this should be (loglikelihood ratio, p-value) -- confirm.
for fitname in fit.supported_distributions:
    if fit.supported_distributions[fitname]: print fitname, fit.distribution_compare('power_law', fitname ,normalized_ratio=True)
# Empirical PDF (solid blue) with the fitted power law (dashed blue) and the
# truncated power law (dashed red) drawn on the same axes.
fig = fit.plot_pdf(color='b', linewidth=2)
fit.power_law.plot_pdf(color='b', linestyle='--', ax=fig)
fit.truncated_power_law.plot_pdf(color='r', linestyle='--', ax=fig)
# NOTE(review): bare show() is not imported in the visible cells; presumably it comes from a pylab-style namespace.
show()

Power law parameters
xmin: 1.0
alpha: 1.34867031724
sigma: 0.0143302617645
Kolmorgorov-Smirnov Distance: 0.429454995946
Comparison of different fit distributions
lognormal 

(-31.161598650616888, 3.5324953511545982e-213)
exponential (-19.995760572020231, 5.9957783056119614e-89)
truncated_power_law Assuming nested distributions
(-52.163822814729087, 0.0)


stretched_exponential (-25.240484420591411, 1.4406321340792655e-140)
power_law Assuming nested distributions
(nan, 1.0)


In [None]:
# Same report as the xmin=1.0 fit, but without fixing xmin: the library then searches
# for the minimal value itself (the captured output below reports xmin: 59.0).
fit = powerlaw.Fit(circlesizes)
print 'Power law parameters'
print 'xmin:', fit.xmin
print 'alpha:', fit.power_law.alpha
print 'sigma:', fit.power_law.sigma
print 'Kolmorgorov-Smirnov Distance:', fit.power_law.D
print 'Comparison of different fit distributions'
# Compare the power law against every distribution flagged in supported_distributions
# and print the tuple distribution_compare returns.
# NOTE(review): per the powerlaw docs this should be (loglikelihood ratio, p-value) -- confirm.
for fitname in fit.supported_distributions:
    if fit.supported_distributions[fitname]: print fitname, fit.distribution_compare('power_law', fitname ,normalized_ratio=True)
# Empirical PDF (solid blue) with the fitted power law (dashed blue) and the
# truncated power law (dashed red) drawn on the same axes.
fig = fit.plot_pdf(color='b', linewidth=2)
fit.power_law.plot_pdf(color='b', linestyle='--', ax=fig)
fit.truncated_power_law.plot_pdf(color='r', linestyle='--', ax=fig)
# NOTE(review): bare show() is not imported in the visible cells; presumably it comes from a pylab-style namespace.
show()

Calculating best minimal value for power law fit
Power law parameters
xmin: 59.0
alpha: 3.11161185683
sigma: 0.237574895157
Kolmorgorov-Smirnov Distance: 0.0683951528023
Comparison of different fit distributions
lognormal 

(-0.95854230434610788, 0.33778936996245745)
exponential (0.16468001936175994, 0.86919584448399489)
truncated_power_law Assuming nested distributions
(-1.1621434875610928, 0.10156380981369018)


stretched_exponential (-1.0051563440518123, 0.31482157271181099)
power_law Assuming nested distributions
(nan, 1.0)


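# $P(\text{circle size} = N) \sim N^{-\alpha}$, with $\alpha \approx 1.35$ (fit with $x_{min} = 1$) to $\alpha \approx 3.11$ (fit with $x_{min} = 59$)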

In [None]:
def read_nodeadjlist(filename):
    """
    Build an undirected graph from a node adjacency list file whose lines look like
        node: neighbor1 neighbor2 ...

    Every node left of the ':' is added to the graph even when it has no
    neighbors.  Self-references (a neighbor equal to the node itself) are
    skipped, as are blank lines.  Returns the networkx Graph with integer
    node ids.
    """
    G = nx.Graph()
    # `with` closes the file; the original left the handle open
    with open(filename) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            e1, es = line.split(':', 1)
            node = int(e1)
            # Add a node for the user.
            G.add_node(node)
            for e in es.split():
                neighbor = int(e)
                # compare as integers so stray whitespace cannot hide a self-loop
                if neighbor == node:
                    continue
                G.add_edge(node, neighbor)
    return G

def drawGraph(G):
    """ Draw graph G using node positions computed by networkx's spring layout """
    nx.draw(G, nx.spring_layout(G))

In [None]:
import networkx as nx

In [None]:
ref_user=239
ref_circle = alltraining[ref_user]
ref_circle_nums = [c for c in ref_circle]
# ref_circle_nums = ref_circle.keys()

print 'User:',ref_user,'\nCircle #\'s:', ref_circle_nums
G = read_nodeadjlist('./egonets/'+str(ref_user)+'.egonet')
pos = nx.spring_layout(G)
figure(figsize(15, 15))
for (n, c) in enumerate(ref_circle_nums):
    subplot(2,2,n+1)
    print 'User:',ref_user,'Circle:',c,'Size:',len(ref_circle[c])
    print 'Members:',ref_circle[c]
    nx.draw(G,pos,node_color='r')  # draw the background graph as red
    nx.draw(G.subgraph(ref_circle[c]),pos,node_color='b')  # draw each subgraph as blue
    #show()  # plot each as a separate plot


User: 239 
Circle #'s: [16, 17, 18, 19]
User:

 239 Circle: 16 Size: 46
Members: [335, 253, 325, 320, 283, 258, 284, 318, 330, 241, 276, 312, 340, 301, 334, 291, 295, 242, 337, 338, 273, 277, 293, 251, 279, 285, 267, 243, 240, 298, 302, 286, 305, 282, 294, 256, 313, 263, 274, 339, 309, 336, 310, 264, 266, 269]
User:

 239 Circle: 17 Size: 27
Members: [306, 281, 311, 247, 342, 315, 297, 303, 304, 343, 333, 296, 250, 255, 331, 248, 340, 265, 322, 289, 344, 287, 326, 249, 324, 307, 290]
User:

 239 Circle: 18 Size: 19
Members: [300, 252, 332, 328, 268, 272, 323, 288, 257, 292, 245, 319, 275, 341, 282, 321, 339, 244, 316]
User:

 239 Circle: 19 Size: 13
Members: [280, 308, 254, 246, 270, 314, 299, 329, 261, 271, 327, 259, 317]


In [None]:
# Clustering coefficient of every node in the ego network G, shown as a 20-bin histogram.
# `clust` is used as a mapping (its .values() here, .items() in the next cell).
clust = nx.clustering(G)
figure(figsize(8,3))
hist(clust.values(), bins=20)

In [None]:
# Highlight the users whose clustering coefficient is exactly 1.
friendly_users = [user_id for user_id, coeff in clust.items() if coeff == 1]
figure(figsize(5,5))
nx.draw(G,pos,node_color='r')  # draw the background graph as red
nx.draw(G.subgraph(friendly_users),pos,node_color='b')  # draw each subgraph as blue

In [None]:
# Draw the ego network of one user with the members of each circle in their own colour.
ref_user=345
ref_circle = alltraining[ref_user]
ref_circle_nums = [c for c in ref_circle]

print 'User:',ref_user
print 'Circle #\'s:', ref_circle_nums
print 'Circle sizes:', [len(ref_circle[c]) for c in ref_circle]
G = read_nodeadjlist('./egonets/'+str(ref_user)+'.egonet')
pos=nx.spring_layout(G) # positions for all nodes

ncolormap = {}  # circle id -> (r, g, b) colour tuple
for c in ref_circle:
    # No seed is set, so the colours change on every run.
    # NOTE(review): `random` is not imported in the visible cells; presumably it comes from a pylab-style namespace.
    ncolormap[c] = (random.random(),random.random(),random.random())
    # nodes
    nx.draw_networkx_nodes(G,pos,
                       nodelist=ref_circle[c],
                       node_color=ncolormap[c],
                       node_size=50,
                   alpha=0.8)
#edges
nx.draw_networkx_edges(G,pos,width=1.0,alpha=0.5)
show()

User: 345
Circle #'s: [20, 22, 24, 26, 27, 28, 29]
Circle sizes: [30, 145, 6, 27, 16, 7, 48]


In [None]:
G.nodes()

In [None]:
# check for nodes in a circle but not in G, determined bug in reading node adjacency list
for c in ref_circle:
    print 'Circle', c, 'has nodes', [n for n in ref_circle[c] if n not in set(G.nodes())], 'not in G'

Circle 20 has nodes [] not in G
Circle 22 has nodes [] not in G
Circle 24 has nodes [] not in G
Circle 26 has nodes [] not in G
Circle 27 has nodes [] not in G
Circle 28 has nodes [] not in G
Circle 29 has nodes [] not in G


In [None]:
# NOTE(review): this cell repeats the earlier per-circle characteristic-profile cell
# for user 345 verbatim; it (re)binds ref_profile, circles and ch_profile, which the
# cells below read.
ref_user = 345
print 'User:', ref_user
ref_profile = profile[ref_user]
circles = readcircle(ref_user)
ch_profile={}  # circle id -> summed match vector for that circle
for circle in circles:
    print 'Circle:', circle
    # one match vector (ego vs. member) per user in this circle
    matchmatrix = [matchvector(ref_profile,profile[user],features) for user in circles[circle] ]
    ch_profile[circle]=charprofile(matchmatrix)
    display_char_profile(ch_profile[circle],features)
    print ''

User: 345
Circle: 20
education;type 37.0
gender 18.0
hometown;id 4.0
hometown;name 4.0
languages;id 3.0
languages;name 4.0
last_name 2.0
locale 10.0
location;id 1.0
location;name 1.0

Circle: 22
education;school;id 70.0
education;school;name 70.0
education;type 143.0
first_name 3.0
gender 106.0
languages;id 145.0
languages;name 

161.0
locale 109.0
location;id 45.0
location;name 45.0
religion 1.0

Circle: 24
education;type 5.0
gender 5.0
locale 4.0

Circle: 26
education;school;id 1.0
education;school;name 1.0
education;type 21.0
gender 17.0
hometown;id 3.0
hometown;name 3.0
languages;id 11.0
languages;name 11.0
locale 22.0
location;id 2.0
location;name 2.0

Circle: 27
education;type 14.0
gender 9.0
languages;id 2.0
languages;name 2.0
locale 15.0
location;id 2.0
location;name 2.0

Circle: 28
education;school;id 2.0
education;school;name 2.0
education;type 10.0
gender 5.0
languages;id 3.0
languages;name 4.0
locale 6.0

Circle: 29
education;school;id 2.0
education;school;name 2.0
education;type 40.0
gender 29.0
hometown;id 10.0
hometown;name 10.0
languages;id 11.0
languages;name 12.0
locale 45.0
location;id 2.0
location;name 2.0



In [None]:
ref_profile

In [None]:
c = 29 # the circle to construct a common profile, really need to clean up str/int inconsistency, all should be int
char_profile = {}  # feature -> list of (ego value, fraction of circle members sharing that value)
Ncircle = len(circles[c])  # the size of the circle
for key in ref_profile:
    for val in ref_profile[key]:
        # number of circle members whose profile lists this value for this feature
        count = sum(1 for user in circles[c]
                    if key in profile[user] and val in profile[user][key])
        count = count*1.0/Ncircle  # normalize by the size of the circle to determine percentage of profiles with common value
        # store the val and percentage matched to ego as tuples
        char_profile.setdefault(key, []).append( (val,count) )
char_profile
        

In [None]:
1 in [2,3,4,1]

In [None]:
# Find the users of the ego network that belong to no circle: copy G's nodes and
# edges into H, then remove every user that appears in at least one circle.
usersincircles = set.union(*[set(ref_circle[c]) for c in ref_circle])
H = nx.Graph()
H.add_nodes_from(G)
H.add_edges_from(G.edges())
H.remove_nodes_from(usersincircles)
H.nodes()

In [None]:
# Same question as the previous cell, answered circle by circle: start from G's
# nodes only (no edges are copied) and drop the members of each circle in turn.
H = nx.Graph()
H.add_nodes_from(G)
for c in ref_circle:
    H.remove_nodes_from(ref_circle[c])
print 'Users not in a circle:',H.nodes()

Users not in a circle: [260, 262, 278]
