In [33]:
import numpy as np
import pandas as pd
from scipy.linalg import fractional_matrix_power
from sklearn.linear_model import LinearRegression


In [34]:

cov_df=pd.read_csv('../../data/cov_mat_2023-04-28.csv', index_col=0)
cov_df_V= cov_df.filter(regex='^V')

user_id2V=cov_df_V.T.to_dict()
vocab2count={}
for user_id,cov in user_id2V.items():
    for k,v in cov.items():
        if 'V_' in (k):
            if k not in vocab2count:
                    vocab2count[k]=0
            if v!=0: 
                vocab2count[k]+=1
                
rare_vocabs=list(({k:v for k,v in vocab2count.items() if v<2}).keys()) #2//40 2 datasets
cov_df_V=cov_df_V.loc[:, ~ cov_df_V.columns.isin(rare_vocabs)]

user_id2V=cov_df_V.T.to_dict()
id2V_count={}
for user_id,cov in user_id2V.items():
    id2V_count[user_id]=0
    for k,v in cov.items():
        if 'V_' in (k):
            if v!=0:
                id2V_count[user_id]+=1 
                
ids= list(({k:v for k,v in id2V_count.items() if v>=2}).keys())
cov_df_V_original= cov_df_V[cov_df_V.index.isin(ids)]

index2vocab={i:cov_df_V_original.columns[i] for i in range(len(cov_df_V_original.columns))}
vocab2index={cov_df_V_original.columns[i]:i for i in range(len(cov_df_V_original.columns))}


In [35]:
cov_df = pd.read_csv('../../data/cov_mat_2023-04-28.csv', index_col=0)
cov_df_A_D = cov_df.filter(regex='^[D|A]')
user_id2cov = cov_df_A_D .T.to_dict()

actor2count = {}
for user_id, cov in user_id2cov.items():
    for k, v in cov.items():
        if 'A_' in (k):
            if k not in actor2count:
                actor2count[k] = 0
            if v != 0:
                actor2count[k] += 1

sorted_actor2count = dict(
    sorted(actor2count.items(), key=lambda item: -item[1]))

director2count = {}
for user_id, cov in user_id2cov.items():
    for k, v in cov.items():
        if 'D_' in (k):
            if k not in director2count:
                director2count[k] = 0
            if v != 0:
                director2count[k] += 1
sorted_director2count = dict(
    sorted(director2count.items(), key=lambda item: -item[1]))
directors = list(
    ({k: v for k, v in sorted_director2count.items() if v >= 2}).keys())
actors = list(({k: v for k, v in sorted_actor2count.items() if v >= 2}).keys())
cov_df_A_D = cov_df_A_D.loc[:, cov_df_A_D.columns.isin(directors+actors)]


user_id2cov = cov_df_A_D.T.to_dict()
id2director_count = {}
id2actor_count = {}
for user_id, cov in user_id2cov.items():
    id2director_count[user_id] = 0
    for k, v in cov.items():
        if 'D_' in (k):
            if v != 0:
                id2director_count[user_id] += 1

for user_id, cov in user_id2cov.items():
    id2actor_count[user_id] = 0
    for k, v in cov.items():
        if 'A_' in (k):
            if v != 0:
                id2actor_count[user_id] += 1

id2count = {k: id2director_count[k] +
            id2actor_count[k] for k in id2actor_count}
ids = list(({k: v for k, v in id2count.items() if v >= 2}).keys())
cov_df_A_D = cov_df_A_D[cov_df_A_D.index.isin(ids)]


index2actor={i:cov_df_A_D.columns[i] for i in range(len(cov_df_A_D.columns))}
actor2index={cov_df_A_D.columns[i]:i for i in range(len(cov_df_A_D.columns))}
user2index={cov_df_A_D.index[i]:i for i in range(len(cov_df_A_D.index))}
index2user= {i:cov_df_A_D.index[i] for i in range(len(cov_df_A_D.index))}

# Vertex hunting algo

In [36]:


def Projection_Find(M_orig, r,candidates):
    n = M_orig[:, 0].size
    dim = M_orig[0, :].size
    M = M_orig.copy()
    
    anchor_words = np.zeros((r, dim))
    anchor_indices = np.zeros(r, dtype=int)
    basis = np.zeros((r-1, dim))

    max_dist = 0
    for i in candidates:
    #for i in range(0, n):
        dist = np.dot(M[i], M[i])
        if dist > max_dist:
            max_dist = dist
            anchor_words[0] = M_orig[i]
            anchor_indices[0] = i

    for i in range(0, n):
        M[i] = M[i] - anchor_words[0]
        
    max_dist = 0
    
    for i in candidates:
    #for i in range(0, n):
        dist = np.dot(M[i], M[i])
        if dist > max_dist:
            max_dist = dist
            anchor_words[1] = M_orig[i]
            anchor_indices[1] = i
            basis[0] = M[i]/np.sqrt(np.dot(M[i], M[i]))
            
    for j in range(1, r - 1):
        max_dist = 0
        for i in candidates:
        #for i in range(0, n):
            M[i] = M[i] - np.dot(M[i], basis[j-1])*basis[j-1]
            dist = np.dot(M[i], M[i])
            if dist > max_dist:
                max_dist = dist
                anchor_words[j + 1] = M_orig[i]
                anchor_indices[j + 1] = i
                basis[j] = M[i]/np.sqrt(np.dot(M[i], M[i]))       
    anchor_indices_list = []
    for i in range(r):
        anchor_indices_list.append(anchor_indices[i])
    
    return (anchor_words, anchor_indices_list)

# topic SCORE

## estimate topic meaning 

In [37]:
K2A={}

def f(s):
    return s/sum(s)

for K in range(3,15):
    if K!=10:
        continue

    cov_df_V_= cov_df_A_D.T
    cov_df_V_= cov_df_V_* 5
    
    p= cov_df_V_.shape[0]
    n = cov_df_V_.shape[1]
    doc_len = cov_df_V_.sum(axis=0)
    
    cov_df_V_ = cov_df_V_.apply(f, axis=0)
    
    word_len = cov_df_V_.sum(axis=1)

    
    M_right= np.zeros((cov_df_V_.shape[1], cov_df_V_.shape[1]))
    for i in range(cov_df_V_.shape[1]):
        M_right[i,i] = 1 /doc_len[i]
    
    M_left= np.zeros((cov_df_V_.shape[0], cov_df_V_.shape[0]))
    for i in range(cov_df_V_.shape[0]):
        M_left[i,i] =  word_len[i] / n
    
    Ms = np.array([M_left[i,i] for i in range(cov_df_V_.shape[0])])
    
    tau=0.2
    quant1 = np.quantile(Ms, tau)
    quant2 = np.quantile(Ms, 1-tau)
    for i in range(cov_df_V_.shape[0]):
        if M_left[i,i]<quant1:
            M_left[i,i]=quant1
        if M_left[i,i]>quant2:
            M_left[i,i]=quant2
    
  
    

    #D = fractional_matrix_power(M_left, -0.5) @  cov_df_V_.values
    D =  fractional_matrix_power(M_left, -0.5) @  cov_df_V_.values @ fractional_matrix_power(M_right, -0.5)
    

    U, S, Vh = np.linalg.svd(D, full_matrices=False)  

    U = U[:,:K]

    R=[]
    for k in range(1,K):
        e_vec=list(U[:,k])
        e_vec=[e_vec[i]/U[:,0][i] for i in range(len(U[:,0]))]
        r  =[]
        for x in e_vec:
            r.append(x)
        R.append(r)     
    R=np.array(R).T

 
    cands = [i for i in range(cov_df_V_.shape[0])]  
    
    anchor_indices = Projection_Find(R, K,candidates=cands)
    
    V=[]
    for idx in anchor_indices[1]:
        V.append(R[idx])
    V= np.array(V)
    V = np.concatenate((np.ones(K).reshape(1,K), V.T), axis=0) 

    Pi=[]
    for j in range(cov_df_V_.shape[0]):
        pi = np.linalg.solve(V, np.array([1]+ list(R[j])))
        new_pi=[]
        for x in pi:
            if x>0:
                new_pi.append(x)
            else:
                new_pi.append(0)
        new_pi = new_pi/sum(new_pi)
        Pi.append(list(new_pi))
        
    Pi= np.array(Pi)
    D_1 = np.zeros((cov_df_V_.shape[0],cov_df_V_.shape[0]))
    for i in range(cov_df_V_.shape[0]):
        D_1[i,i] = U[:,0][i]

    A =  fractional_matrix_power( M_left, 0.5) @ D_1 @ Pi  

    A_N = A / A.sum(axis=0)
    K2A[K] = A_N.T
    print('finish '+str(K))
    

finish 10


## estimate user interest

In [38]:

D = cov_df_V_.values
user2pi={}
for i,user in index2user.items():
    d = D[:,i]
    regr = LinearRegression()
    sample_weight=[1/M_left[i,i] for i in range(len(M_left))]
    regr.fit(K2A[10].T, d, sample_weight)
    pi = list(regr.coef_)
    new_pi=[]
    for x in pi:
        if x>0:
            new_pi.append(x)
        else:
            new_pi.append(0)
    if sum(new_pi)>0:
        new_pi = new_pi/sum(new_pi)
    else:
        print(user)
    user2pi[user] = new_pi

planoto
ADmoviemtime


In [41]:
T=20
K2topic2topwords={}
for K in range(3,15):
    if K!=10:
        continue    
    K2topic2topwords[K]={} 
    topic2prob1={}
    for j in range(K):
        topic2prob1[j]={}
        K2topic2topwords[K][j]=[]
        for w in range((cov_df_V_.shape[0])):
            topic2prob1[j][index2actor[w]] = K2A[K][j,w]
       
        word2prob = {k: v for k, v in sorted(topic2prob1[j].items(), key=lambda it: -it[1])}
        c = 0
        for w1 in list(word2prob.keys()):
            K2topic2topwords[K][j].append(w1)
            c += 1
            if c == T:
                break

In [42]:
pd.DataFrame(K2topic2topwords[10])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,D_路易斯·卢米埃尔,A_神谷浩史,A_染谷将太,D_铃木清顺,A_史蒂文·元,A_艾德里安·布洛迪,A_秦昊,A_刘德华,A_佩德罗·帕斯卡,D_陈正道
1,A_马丁·斯科塞斯,D_大森贵弘,D_新海诚,A_梁朝伟,A_塔伦·埃哲顿,A_瑞安·雷诺兹,A_范伟,A_吴京,D_罗伯特·罗德里格兹,A_倪虹洁
2,A_天海祐希,D_荒木哲郎,A_深津绘里,A_张国荣,A_裴斗娜,A_安娜·德·阿玛斯,D_辛爽,A_李雪健,D_塔伊加·维迪提,A_白宇
3,A_余文乐,D_山田尚子,A_米歇尔·罗德里格兹,D_罗曼·波兰斯基,D_汤浅政明,A_克里斯·埃文斯,A_左小青,D_郭帆,D_布莱丝·达拉斯·霍华德,A_范伟
4,A_易烊千玺,A_花泽香菜,A_克里斯·派恩,A_张曼玉,A_夏帆,A_塞巴斯蒂安·斯坦,A_宋芸桦,A_米歇尔·菲佛,A_史蒂文·元,A_秦昊
5,A_雷佳音,A_樱井孝宏,A_梁朝伟,D_保罗·索伦蒂诺,A_安雅·泰勒-乔伊,D_德克斯特·弗莱彻,A_杨谨华,A_迈克尔·道格拉斯,A_艾德里安·布洛迪,D_辛爽
6,A_周一围,A_梁朝伟,A_史蒂文·元,D_洪常秀,A_水川麻美,A_米歇尔·菲佛,A_白宇,A_保罗·路德,A_安娜·德·阿玛斯,A_张婧仪
7,A_马丁·弗瑞曼,D_埃里克·侯麦,A_安雅·泰勒-乔伊,D_李沧东,A_安藤樱,A_迈克尔·道格拉斯,A_刘德华,A_伊万杰琳·莉莉,A_瑞安·雷诺兹,D_易小星
8,D_乔·赖特,A_刘德华,A_克里斯·帕拉特,D_侯孝贤,A_松坂桃李,A_保罗·路德,A_裴斗娜,A_梁朝伟,A_塞巴斯蒂安·斯坦,A_范丞丞
9,A_玛吉·史密斯,D_孔大山,A_裴斗娜,A_大卫·鲍伊,D_乔丹·皮尔,A_伊万杰琳·莉莉,A_王一博,A_艾德里安·布洛迪,A_克里斯·埃文斯,A_王鹤棣


# topic quality evaluation

In [21]:
n_dic=pd.read_csv("../n_dic_A.csv",index_col=False,header=None)
n_dic= n_dic.values.astype(int)

n_sum = n_dic.sum()
p_ij=np.zeros((len(index2actor), len(index2actor)))

for i,v1 in (index2actor).items():
    for j,v2 in index2actor.items():
        p_ij[i,j]=n_dic[i,j]/ n_sum
        
p_i= np.zeros(len(index2actor))
  
for i,v1 in (index2actor).items():
    for j,v2 in index2actor.items():
        p_i[i] += p_ij[i,j]
        
K2coh={}
for K in range(3,15):

    K2coh[K]=0
    j2t_words=K2topic2topwords[K]
    for j in range(K):
        t_words=j2t_words[j]
        coh=0
        for i1 in range(1, len(t_words)):
            for i2 in range(0, i1):
            
                val= -np.log((p_ij[actor2index[t_words[i1]],  actor2index[t_words[i2]]]+ 10**(-7))/ p_i[actor2index[t_words[i2]]] / p_i[actor2index[t_words[i1]]]) / np.log( (p_ij[actor2index[t_words[i1]],  actor2index[t_words[i2]]]+ 10**(-7)))
                coh+=val
                
        coh = coh / T /(T-1) *2
        K2coh[K]+=coh 
    K2coh[K] = K2coh[K]/K
    
def jaccard_similarity(topic_1, topic_2):
    intersection = set(topic_1).intersection(set(topic_2))
    union = set(topic_1).union(set(topic_2))           
    return float(len(intersection))/float(len(union))


K2div={}
for K in range(3,15):
    div=0
    j2t_words=K2topic2topwords[K]
    for j1 in range(1,K):
        for j2 in range(0,j1):
            js= jaccard_similarity(j2t_words[j1],j2t_words[j2])
            div+=1-js
    K2div[K]=2*div/K/(K-1)
   
def flatten(l):
    return [item for sublist in l for item in sublist] 
  
K2var={}
for K in range(3,15):
    l=flatten(list(K2topic2topwords[K].values()))
    K2var[K]= len(set(l)) / (len(l)) 
   
K2Q={K:K2div[K] *K2coh[K] for K in K2coh}
K2Q

{3: 0.01941647218272936,
 4: 0.023718761889130704,
 5: 0.04156514016269489,
 6: 0.036572872988269334,
 7: 0.03737642215874303,
 8: 0.04310694362710287,
 9: 0.046625511361167056,
 10: 0.06195508811843483,
 11: 0.0772969448417718,
 12: 0.0798843016117437,
 13: 0.04910861759583992,
 14: 0.054219481652629554}