# INPUT & OUTPUT

In [56]:
##########
# INPUT  #
##########
import os

# CSV with the suggested documents; later cells read its 'id', 'new_score'
# and 'topic relevance' columns.
final_suggestions='/home/ec2-user/SageMaker/mariano/datasets/multiculturalism/files/final_suggestions.csv'
# CSV with the manually labelled documents; later cells read its 'id' and
# 'label' columns.
labeled_data='/home/ec2-user/SageMaker/mariano/datasets/multiculturalism/files/labeled_data.csv'

# Fail early and name the missing file, instead of a bare AssertionError that
# does not say which of the two inputs is absent.
for input_file in (final_suggestions, labeled_data):
    assert os.path.isfile(input_file), 'input file not found: "'+input_file+'"'

##########
# OUTPUT #
##########
# Disabled output, kept for reference:
# suggestions_db_file = '/home/ec2-user/SageMaker/mariano/datasets/multiculturalism/top2vec/suggestions_with_topics_only_rel_topics.csv'

# Per-topic summary table (written with DataFrame.to_csv).
output_df_file = '/home/ec2-user/SageMaker/mariano/datasets/multiculturalism/top2vec/deep_learn_top2vec_topics_only_rel_topics.csv'

# Topic words and per-document topic assignment (written as JSON).
output_topic_file = '/home/ec2-user/SageMaker/mariano/datasets/multiculturalism/top2vec/deep_learn_top2vec_topics_only_rel_topics.json'

##########
# CHECKS #
##########
# Only a warning: existing outputs are overwritten by the cells that write them.
for output_file in (output_topic_file, output_df_file):
    if os.path.isfile(output_file):
        print('[WARNING] output file exists, will be overwritten: "'+os.path.basename(output_file)+'"')



# IMPORTS

In [57]:
from top2vec import Top2Vec
import pandas as pd
import json

import sys
sys.path.append('/home/ec2-user/SageMaker/mariano/repositories/tdmstudio-high-recall-information-retrieval-system/')
from utils import tdmstudio

# CODE

In [58]:
# Load all suggestions, keep the ones whose topic was marked 'R' or
# 'PRONE TO BE RELEVANT', and retain only the document id and its new score.
suggestions = pd.read_csv(final_suggestions)

relevant_topic_labels = ['R', 'PRONE TO BE RELEVANT']
from_relevant_topic = suggestions['topic relevance'].isin(relevant_topic_labels)

# .copy() keeps the result independent of the full `suggestions` frame.
suggesions_from_relevant_topics = suggestions.loc[from_relevant_topic, ['id', 'new_score']].copy()
suggesions_from_relevant_topics.head()

Unnamed: 0,id,new_score
1275,2459693839,0.082959
1276,1411691058,0.081473
1277,2606138951,0.098472
1278,1411691065,0.083998
1279,1151729523,0.102678


In [80]:
# Remove the suggestions whose id is already labelled as relevant ('R'), so a
# document is not counted both as labelled and as suggested.
#
# The labelled ids are derived here from the labelled-data file. Previously this
# cell read a `labeled_ids` variable that is only assigned by a cell further
# down, so it raised NameError on a fresh "Restart Kernel -> Run All".
labeled_raw = pd.read_csv(labeled_data)
labeled_ids = set(labeled_raw.loc[labeled_raw['label'] == 'R', 'id'])

# Vectorised membership test (one boolean per suggestion row).
already_labeled = suggesions_from_relevant_topics['id'].isin(labeled_ids)
n_to_remove = int(already_labeled.sum())

# The row count printed is the one *before* removal.
print(f'To remove {n_to_remove} ({len(suggesions_from_relevant_topics)}-({n_to_remove}))')
suggesions_from_relevant_topics = suggesions_from_relevant_topics[~already_labeled]
suggesions_from_relevant_topics

To remove 69 (2779-(69))


Unnamed: 0,id,new_score
1275,2459693839,0.082959
1276,1411691058,0.081473
1277,2606138951,0.098472
1278,1411691065,0.083998
1279,1151729523,0.102678
...,...,...
7856,1282764263,0.075748
7857,1140847674,0.223604
7858,1270176318,0.124174
7859,1142429171,0.079380


In [81]:
# Read the labelled data, keep only the rows labelled relevant ('R'), give them
# new_score = 1 and drop the now-constant 'label' column.
#
# Built as one chain with .assign() so the new column is added to a fresh
# frame; assigning a column onto a boolean-filtered slice is the pattern that
# triggers pandas' SettingWithCopyWarning.
labeled = (
    pd.read_csv(labeled_data)
    .loc[lambda frame: frame['label'] == 'R']
    .assign(new_score=1)
    .drop(columns=['label'])
)

# Ids of the labelled-relevant documents, as a set for fast membership tests.
labeled_ids = set(labeled['id'])
labeled.head()

Unnamed: 0,id,new_score
0,1151636504,1
5,1151316407,1
26,1366203438,1
31,1126309571,1
33,1125764078,1


In [82]:
# Stack the labelled-relevant rows first and the remaining suggestions after
# them. The order matters: a later cell fills 'relevant_or_suggested' by
# position. Row labels from the two source frames are kept as they are.
frames_in_order = [labeled, suggesions_from_relevant_topics]
all_data = pd.concat(frames_in_order, axis=0)
all_data

Unnamed: 0,id,new_score
0,1151636504,1.000000
5,1151316407,1.000000
26,1366203438,1.000000
31,1126309571,1.000000
33,1125764078,1.000000
...,...,...
7856,1282764263,0.075748
7857,1140847674,0.223604
7858,1270176318,0.124174
7859,1142429171,0.079380


In [83]:
# Attach the text that Top2Vec is trained on: for every document id, resolve
# its file name and read title + text through the tdmstudio helpers.
all_data['text'] = [
    tdmstudio.get_title_and_text(tdmstudio.get_filename(str(doc_id)))
    for doc_id in all_data['id']
]

# Mark where each row came from (used when describing the topics in the
# output). This relies on all_data being `labeled` followed by the suggestions.
n_labeled = len(labeled)
n_suggested = len(suggesions_from_relevant_topics)
all_data['relevant_or_suggested'] = ['rel'] * n_labeled + ['sugg'] * n_suggested
all_data.head()

Unnamed: 0,id,new_score,text,relevant_or_suggested
0,1151636504,1.0,Not just folklore--a tool for trade.\n ...,rel
5,1151316407,1.0,Display Ad 41 -- No Title.\n \n ...,rel
26,1366203438,1.0,Sounds and the city.\n \n ...,rel
31,1126309571,1.0,Unfamiliar liberty delights Iranians.\n ...,rel
33,1125764078,1.0,Private-school debate grows.\n \n ...,rel


In [84]:
print('Training model')
# Train on the document texts with the 'deep-learn' speed setting.
# `workers` is left at the library default (an explicit workers=1 was tried
# here and is disabled).
training_documents = all_data['text'].tolist()
deep_model = Top2Vec(documents=training_documents, speed='deep-learn')


2023-03-02 20:51:42,919 - top2vec - INFO - Pre-processing documents for training


Training model


2023-03-02 20:51:48,446 - top2vec - INFO - Creating joint document/word embedding
2023-03-02 21:47:34,352 - top2vec - INFO - Creating lower dimension embedding of documents
2023-03-02 21:47:45,844 - top2vec - INFO - Finding dense areas of documents
2023-03-02 21:47:45,920 - top2vec - INFO - Finding topics


In [85]:
# Fetch topic sizes and topic words from the trained model. `topic_nums` is
# assigned by both calls; the value from get_topics() is the one that remains.
topic_sizes, topic_nums = deep_model.get_topic_sizes()
topic_words, word_scores, topic_nums = deep_model.get_topics()

# Show the size of every topic, then the number of topics.
print(topic_sizes, len(topic_sizes), sep='\n')

[400 370 237 200 196 192 180 159 157 155 150 105  92  77  68  65]
16


In [86]:
#################
# CREATING JSON #
#################
# One entry per (topic, word position) pair, topic by topic; the three lists
# are parallel.
n_topics, n_words = topic_words.shape[0], topic_words.shape[1]
topic_word_pairs = [(topic, position) for topic in range(n_topics) for position in range(n_words)]

words_data = {
    'word': [topic_words[topic, position] for topic, position in topic_word_pairs],
    # float() turns the score into a plain Python float for JSON.
    'score': [float(word_scores[topic, position]) for topic, position in topic_word_pairs],
    'topic': [topic for topic, _ in topic_word_pairs],
}

# Append top2vec results to dataframe.
# NOTE(review): doc_top / doc_dist are presumably the topic assigned to each
# document and its score for that topic; confirm against the Top2Vec docs.
all_data['top2vec'] = deep_model.doc_top
all_data['doc_dist'] = deep_model.doc_dist

# Per-document columns exported as plain lists.
document_columns = ('id', 'new_score', 'top2vec', 'doc_dist')
document_data = {column: list(all_data[column]) for column in document_columns}

combined_data = {
    'word data': words_data,
    'document data': document_data,
}

################
# DUMPING JSON #
################
json_object = json.dumps(combined_data, indent=4)
with open(output_topic_file, "w") as topic_file:
    topic_file.write(json_object)

In [87]:
# data = {'topic no.': [], 
#         'size': [], 
#         'example': [], 
#         'suggestions':[], 
#         'suggestions (by relevance)': [],
#        }

# topic_sizes, topic_nums = deep_model.get_topic_sizes()
# topic_words, word_scores, topic_nums = deep_model.get_topics()

# for word_ix,_ in enumerate(topic_words[0]):
#     data[f'word_{word_ix:03}']=[]

# relevant_df['top2vec']=deep_model.doc_top
# relevant_df['doc_dist']=deep_model.doc_dist

# for ix in range(len(topic_nums)):
#     # Topic number & size 
#     data['topic no.'].append(topic_nums[ix])
#     data['size'].append(topic_sizes[ix])
    
#     # auxdf
#     auxdf=relevant_df[relevant_df['top2vec']==topic_nums[ix]]
    
#     # Example highly relevant to topic
#     data['example'].append('https://proquest.com/docview/'+str(auxdf.sort_values(by='doc_dist',ascending=False)['id'].iloc[0]))
    
#     # Suggestions IDs (sorted by topic)
#     suggestions_ids=auxdf.sort_values(by='doc_dist',ascending=False)['id']
#     data['suggestions'].append(';'.join([str(elem) for elem in suggestions_ids]))
    
#     # Suggestions IDs (by relevance)
#     #NOT USED BECAUSE WE CANNOT RE RUN THIS SCRIPT, gives different topics 
#     # and Serperi is already working with the other topics:
    
#     suggestions_ids=auxdf.sort_values(by='new_score',ascending=False)['id']
#     data['suggestions (by relevance)'].append(';'.join([str(elem) for elem in suggestions_ids]))
    
#     # Labelled as relevant IDs
# #     relevant_ids=auxdf[auxdf['relevant_or_suggested']=='rel'].sort_values(by='doc_dist',ascending=False)['id']
# #     data['relevants'].append(';'.join([str(elem) for elem in relevant_ids]))
    
    
#     # Filter auxdf to have only relevant (not suggestions)
# #     auxdf=auxdf[auxdf['relevant_or_suggested']=='rel']
    
#     # Relevant example highly relevant to topic.
# #     if auxdf.shape[0]>0:
# #         data['example(rel)'].append('https://proquest.com/docview/'+str(auxdf.sort_values(by='doc_dist',ascending=False)['id'].iloc[0]))
# #     else:
# #         data['example(rel)'].append('')
        
#     # No. of relevant in topic
# #     data['relevant count'].appendt(auxdf.shape[0])
    
#     # Words in topic
#     for word_ix,word in enumerate(topic_words[ix]):
#         data[f'word_{word_ix:03}'].append(word)



## EVERYTHING ABOVE IS OLD
def _ids_by(frame, sort_column):
    """Join the ``id`` values of ``frame`` with ';', ordered by ``sort_column`` from largest to smallest."""
    ranked = frame.sort_values(by=sort_column, ascending=False)
    return ';'.join(str(doc_id) for doc_id in ranked['id'])


def _top_document_url(frame):
    """Return the ProQuest docview URL of the row of ``frame`` with the largest ``doc_dist``.

    ``frame`` must have at least one row.
    """
    best_id = frame.sort_values(by='doc_dist', ascending=False)['id'].iloc[0]
    return 'https://proquest.com/docview/' + str(best_id)


# One list per output column; every topic appends exactly one value to each.
# The key order is the column order of the resulting table.
data = {
    'topic no.': [],
    'size': [],
    'relevant count': [],
    'example': [],
    'example(rel)': [],
    'suggestions': [],
    'relevants': [],
    'all ids': [],
    'suggestions (by relevance)': [],
}

topic_sizes, topic_nums = deep_model.get_topic_sizes()
topic_words, word_scores, topic_nums = deep_model.get_topics()

# One column per word position (word_000, word_001, ...), sized from the first topic.
word_columns = [f'word_{position:03}' for position in range(len(topic_words[0]))]
for column in word_columns:
    data[column] = []

all_data['top2vec'] = deep_model.doc_top
all_data['doc_dist'] = deep_model.doc_dist

for topic_no, topic_size, words in zip(topic_nums, topic_sizes, topic_words):
    # Rows assigned to this topic, and their split by origin.
    in_topic = all_data[all_data['top2vec'] == topic_no]
    suggested_rows = in_topic[in_topic['relevant_or_suggested'] == 'sugg']
    relevant_rows = in_topic[in_topic['relevant_or_suggested'] == 'rel']

    # Topic number & size
    data['topic no.'].append(topic_no)
    data['size'].append(topic_size)

    # Document with the largest doc_dist in the topic, whatever its origin.
    data['example'].append(_top_document_url(in_topic))

    # Id lists ordered by doc_dist: suggestions only, then every document.
    data['suggestions'].append(_ids_by(suggested_rows, 'doc_dist'))
    data['all ids'].append(_ids_by(in_topic, 'doc_dist'))

    # Suggestions ordered by relevance score. The notebook this cell was copied
    # from could not be re-run, so it skipped this column; for this experiment
    # re-running is acceptable, so the column is produced.
    data['suggestions (by relevance)'].append(_ids_by(suggested_rows, 'new_score'))

    # Labelled-relevant ids, ordered by doc_dist.
    data['relevants'].append(_ids_by(relevant_rows, 'doc_dist'))

    # Best labelled-relevant example; empty string when the topic has none.
    if len(relevant_rows) > 0:
        data['example(rel)'].append(_top_document_url(relevant_rows))
    else:
        data['example(rel)'].append('')

    # No. of relevant in topic
    data['relevant count'].append(len(relevant_rows))

    # Words in topic
    for column, word in zip(word_columns, words):
        data[column].append(word)

output_df = pd.DataFrame(data)
output_df.to_csv(output_df_file)
output_df

Unnamed: 0,topic no.,size,relevant count,example,example(rel),suggestions,relevants,all ids,suggestions (by relevance),word_000,...,word_040,word_041,word_042,word_043,word_044,word_045,word_046,word_047,word_048,word_049
0,0,400,6,https://proquest.com/docview/1143385056,https://proquest.com/docview/1357031988,1143385056;1125055611;1151428436;1125566253;11...,1357031988;1125706241;1147042690;1151358658;12...,1143385056;1125055611;1151428436;1125566253;11...,1444824582;1151481475;1351093244;1222288596;13...,tv,...,audi,sta,station,shows,licence,hollywood,actress,documentaries,drama,comic
1,1,370,6,https://proquest.com/docview/1142439313,https://proquest.com/docview/1371108106,1142439313;1143881816;1148136086;1237203459;12...,1371108106;1143947290;1411791122;1143898495;26...,1142439313;1143881816;1148136086;1237203459;12...,1411867739;1356699776;1282786298;1124990937;11...,investor,...,temporary,appli,arrivals,refugees,skills,provinces,newcomers,applications,grants,wage
2,2,237,16,https://proquest.com/docview/1323537660,https://proquest.com/docview/1145627214,1323537660;1148423270;1144117951;1141272360;11...,1145627214;1148596840;1143664837;1143312937;11...,1323537660;1148423270;1144117951;1141272360;11...,1147616449;1434945595;1143806798;1143594520;11...,bissoondath,...,editor,mosaic,preservation,relative,equality,societies,identity,bay,strange,charter
3,3,200,4,https://proquest.com/docview/1151120020,https://proquest.com/docview/2122116827,1151120020;1444842268;1316674285;1145780127;14...,2122116827;1141113423;1125527397;1237268511,1151120020;1444842268;1316674285;1145780127;14...,1237671607;1238248945;1356704961;1284751484;11...,my,...,mother,instruction,knowing,acceptable,learning,children,ask,lunch,find,ita
4,4,196,3,https://proquest.com/docview/1143707847,https://proquest.com/docview/1125764078,1143707847;1138432056;1434978441;1400745146;14...,1125764078;1125547053;1125621517,1143707847;1138432056;1434978441;1400745146;14...,1238681375;1412271450;1151314133;1400799649;14...,schools,...,trustees,mcguinty,extending,lic,ruling,charter,elementary,progressive,arguments,christian
5,5,192,0,https://proquest.com/docview/2273309585,,2273309585;1688094473;1411691058;1151729523;24...,,2273309585;1688094473;1411691058;1151729523;24...,1371259091;1144920213;1238744788;1140816413;13...,museum,...,photographs,shape,histo,tells,glass,paintings,evolution,artist,humour,showcase
6,6,180,3,https://proquest.com/docview/1270520819,https://proquest.com/docview/1237243181,1270520819;1239854764;1240533247;1270403957;12...,1237243181;1142543273;1151515259,1270520819;1239854764;1240533247;1270403957;12...,1412732285;1222592501;1239614145;1435670971;11...,classes,...,broadcasting,guage,report,tv,crtc,educators,grade,guages,lan,minutes
7,7,159,19,https://proquest.com/docview/1371166946,https://proquest.com/docview/1237388871,1371166946;1412619043;1151491863;1237519914;14...,1237388871;1291411855;1143909344;1143812467;12...,1371166946;1237388871;1412619043;1151491863;12...,1238402850;1237591900;1291413041;1237192311;11...,conservatives,...,coalition,ethnic,brian,campaign,lism,museum,candidates,seats,ukrainian,voting
8,8,157,2,https://proquest.com/docview/1136767364,https://proquest.com/docview/2459966061,1136767364;1444885140;1688087794;1444719298;14...,2459966061;1366203438,1136767364;1444885140;1688087794;1444719298;14...,1151435666;1237716062;1140749850;1140863514;21...,architecture,...,dependent,pm,pilot,subjects,shape,bay,join,istanbul,arts,creative
9,9,155,4,https://proquest.com/docview/1270363090,https://proquest.com/docview/1126309571,1270363090;1356790323;1126306428;1366149359;16...,1126309571;2121464503;1138386786;1143705076,1270363090;1356790323;1126306428;1366149359;11...,1366179544;1366106590;1237557545;1356807524;13...,daughter,...,teacher,lunch,woman,enrolled,pupils,shy,younger,boy,mothers,ents


In [None]:
de