# Cluster Composition

In [8]:
import os, sys
import pandas as pd
import numpy as np
import glob
import pickle
import pylab as plt
import matplotlib
from matplotlib.pyplot import get_cmap
%matplotlib inline  
from scipy.spatial.distance import cdist

# Absolute path of the sibling directory 'personality-types-orig'. It is
# resolved through os.pardir, i.e. relative to the current working directory,
# so the result depends on where the kernel was started.
src_dir = os.path.abspath(os.path.join(os.pardir,'personality-types-orig'))
# Give src_dir top import priority without overwriting whatever entry was
# first in sys.path before. Removing a previous occurrence first keeps a
# re-run of this cell from stacking duplicates.
if src_dir in sys.path:
    sys.path.remove(src_dir)
sys.path.insert(0, src_dir)

In [12]:
#########################################################
## Demographics: gender and age of every respondent
path_read = 'data_filter/'
fname_read = 'df_demo_ipip300-no0.csv'
filename = os.path.join(src_dir,path_read,fname_read)

## rows containing any missing value are discarded
## NOTE(review): the trait scores are loaded from a separate file further
## down; presumably both files list the same people in the same order --
## confirm that dropna() does not remove rows here.
df = pd.read_csv(filename,index_col=0).dropna()

arr_g = np.array(df['gender'].values).astype('float')
arr_a = df['age'].values
N = len(arr_a)

## gender codes that are tallied, and the edges of the age bins
## (edges 10,15,...,100; each bin includes its lower edge, excludes its upper)
list_g = np.array([0,1])
list_a = np.arange(10,105,5)
delta_a = list_a[1]-list_a[0]

## Global joint distribution of gender and age bin:
##   list_arr_n_ga[i_g] -> list of head counts per age bin for list_g[i_g]
##   list_arr_p_ga[i_g] -> the same counts divided by N (all kept rows)
list_arr_p_ga = []
list_arr_n_ga = []

for gender_code in list_g:
    ages_of_gender = arr_a[arr_g==gender_code]
    counts_per_bin = [
        np.sum( (ages_of_gender>=lo)*(ages_of_gender<hi) )
        for lo,hi in zip(list_a[:-1],list_a[1:])
    ]
    list_arr_n_ga.append(counts_per_bin)
    list_arr_p_ga.append(np.array(counts_per_bin)/N)

#########################################################
### Load position of people in trait space
path_read = 'data_filter/'
fname_read = 'ipip300-no0_arr_pd_neoac_score-1.npy'
filename = os.path.join(src_dir,path_read,fname_read)

arr_pd = np.load(filename)
## The z-scores below are written back into arr_pd column by column. If the
## file holds integers, that write would silently truncate every z-score to
## an integer, so make sure the array is floating point first. An array that
## is already floating point is left untouched.
if not np.issubdtype(arr_pd.dtype, np.floating):
    arr_pd = arr_pd.astype(float)
N_,D_ = np.shape(arr_pd)

## z-score data: every column is shifted to zero mean and scaled to unit
## standard deviation (np.std default, i.e. population std with ddof=0)
for d in range(D_):
    x_tmp = arr_pd[:,d]
    x_mu = np.mean(x_tmp)
    x_std = np.std(x_tmp)
    arr_pd[:,d] = (x_tmp - x_mu)/x_std

    
    
    
#####################################################
## Cluster centres as described in the paper: one row per cluster, one
## column per trait dimension.
## NOTE(review): rows are presumably in the order of c_names, and columns in
## the column order of the z-scored score array -- confirm.
c_names = ['``Average"','``Self-centered"','``Reserved"',' ``Role Model" ']
arr_cd_dom = np.array([
    [  0.55,  0.51, -0.60,  0.25,  0.25 ],
    [ -0.23,  0.70, -0.75, -0.56, -0.44 ],
    [ -0.54, -0.07, -0.77,  0.12,  0.19 ],
    [ -0.70,  0.52,  0.23,  0.62,  0.78 ],
])


## Euclidean distance (cdist's default metric, spelled out here) between
## every person and every cluster centre: S[p,c] is the distance of row p of
## arr_pd to row c of arr_cd_dom
S = cdist(arr_pd, arr_cd_dom, metric='euclidean')


###################################################
## plot
## Radius around a cluster centre: a person counts towards a cluster when
## their distance in S is <= D_crit.
## (np.sqrt(5) was also tried; 1.5 is the value actually in effect)
D_crit = 1.5




## For every cluster centre (one column of S), tally the people lying within
## D_crit of it by gender and age bin, and compare with a null model in which
## the same number of people is drawn from the global gender/age distribution.
## NOTE(review): the three result lists are rebuilt on every pass, so after
## the loop they only describe the last cluster; whatever is meant to consume
## them per cluster (e.g. plotting) is not part of this cell.
for i_n in range(S.shape[1]):

    list_arr_n_ga_sel = []
    list_arr_n_ga_sel_null_mu = []
    list_arr_n_ga_sel_null_sigma = []

    ## people around the cluster center with distance <= D_crit
    ## (the mask is computed once and reused for every gender)
    arr_D_pc = S[:,i_n]
    in_cluster = arr_D_pc <= D_crit
    ind_sel = np.where(in_cluster)[0]
    N_sel = len(ind_sel)
    for i_g,g in enumerate(list_g):
        inds_g_sel = np.where( (arr_g==g)&in_cluster )[0]
        arr_a_sel = arr_a[inds_g_sel]

        ## observed number of people in cluster with g and a
        ## (age bins include their lower edge and exclude their upper edge)
        list_n_ga = [
            np.sum( (arr_a_sel>=a1)*(arr_a_sel<a2) )
            for a1,a2 in zip(list_a[:-1],list_a[1:])
        ]
        arr_n_ga = np.array(list_n_ga)
        list_arr_n_ga_sel += [arr_n_ga]

        ## expected number of people in cluster with g and a
        ## drawing N_sel people with global probability of g and a
        arr_p_ga_null = list_arr_p_ga[i_g]
        arr_n_ga_null_mu = N_sel*arr_p_ga_null
        ## spread of that count under the null: binomial standard deviation
        ## sqrt(n*p*(1-p)) with n = N_sel
        arr_n_ga_null_sigma = np.sqrt(N_sel*arr_p_ga_null*(1.0-arr_p_ga_null) )
        list_arr_n_ga_sel_null_mu += [arr_n_ga_null_mu]
        list_arr_n_ga_sel_null_sigma += [arr_n_ga_null_sigma]



In [14]:
## Person-to-cluster distances as a labelled table, one column per cluster.
## NOTE(review): the original line was left unfinished (`names=''`); using
## c_names as column labels is the presumed intent -- confirm.
s = pd.DataFrame(S,columns=c_names)

array([[3.19736164, 3.45960754, 2.73486238, 1.66667317],
       [2.07514526, 0.82570678, 1.54068235, 2.53000353],
       [3.17380689, 2.91820421, 2.6059461 , 2.40622851],
       ...,
       [2.36423164, 2.23252673, 1.89632943, 1.58573907],
       [3.04025596, 2.04007592, 2.74186619, 3.10708287],
       [3.3627608 , 3.48195311, 3.54831288, 2.51810295]])