# INPUT & OUTPUT

In [148]:
# ---------------------------------------------------------------------------
# Notebook configuration: every file path used below is declared here.
# All of them live under a single dataset directory.
# ---------------------------------------------------------------------------
_DATASET_DIR = '/home/ec2-user/SageMaker/mariano/datasets/multiculturalism'

# --- Inputs (read with pd.read_csv further down) ---
topic_relevance_file = _DATASET_DIR + '/top2vec/topic_relevance.csv'
topics_file = _DATASET_DIR + '/top2vec/deep_learn_top2vec_topics.csv'
scores_file = _DATASET_DIR + '/files/new_scores.csv'

# --- Output (CSV written by the last cell) ---
final_suggestions = _DATASET_DIR + '/files/final_suggestions.csv'

# IMPORTS

In [150]:
import pandas as pd
import numpy as np

import sys
sys.path.append('/home/ec2-user/SageMaker/mariano/repositories/tdmstudio-high-recall-information-retrieval-system/')

from utils import tdmstudio

# CODE

In [101]:
###################
# RETRIEVE TOPICS #
###################
# The first CSV column is used as the row index.
topics_df = pd.read_csv(topics_file, index_col=0)
print('Retrieving  topics ...')
print(f'Number of topics:  {topics_df.shape[0]}')
# The joined column list is long, so only its first 90 characters are printed.
print(f"Columns found:     {', '.join(topics_df.columns)[:90]} ...")

############################
# RETRIEVE TOPIC RELEVANCE #
############################
# This file is ';'-separated, unlike the topics file.
topic_relevance_df = pd.read_csv(topic_relevance_file, sep=';')
# topics_df already carries a 'topic no.' column; drop this copy so the concat
# below does not end up with two columns of the same name.
topic_relevance_df = topic_relevance_df.drop(columns='topic no.')

# The flag below is assigned by position, so both tables must describe the
# same topics in the same order.
assert len(topics_df) == len(topic_relevance_df), (
    f'topics ({len(topics_df)}) and topic relevance ({len(topic_relevance_df)}) '
    'have a different number of rows'
)
# A topic "has labeled as relevant" when its 'relevants' cell is non-empty
# (empty CSV cells are read as NaN).
# NOTE: this must read from topics_df. The combined frame `df` is only created
# at the end of this cell, so referencing it here fails on a fresh kernel (or
# silently uses a stale frame from a previous run).
topic_relevance_df['has_labeled_as_relevant'] = topics_df['relevants'].notna().to_numpy()

print()
print('Retrieving  topic relevance ...')
print(f'Number of topics:  {topic_relevance_df.shape[0]}')
print(f"Columns found:     {', '.join(topic_relevance_df.columns)}")

##########
# CONCAT #
##########
# Side-by-side join: topic description columns followed by relevance columns.
df = pd.concat([topics_df, topic_relevance_df], axis=1)
df.head()

Retrieving  topics ...
Number of topics:  55
Columns found:     topic no., size, relevant count, example, example(rel), suggestions, relevants, word_000,  ...

Retrieving  topic relevance ...
Number of topics:  55
Columns found:     topic relevance, with_relevant, has_labeled_as_relevant


Unnamed: 0,topic no.,size,relevant count,example,example(rel),suggestions,relevants,word_000,word_001,word_002,...,word_043,word_044,word_045,word_046,word_047,word_048,word_049,topic relevance,with_relevant,has_labeled_as_relevant
0,0,601,2,https://proquest.com/docview/1366203412,https://proquest.com/docview/2459966061,1366203412;1412658808;1371240574;1139223026;26...,2459966061;2121464503,architecture,architectural,design,...,condo,alsop,developers,houses,downtown,interior,manhattan,IR,False,True
1,1,346,1,https://proquest.com/docview/1688066431,https://proquest.com/docview/1151128247,1688066431;1811378946;1238603351;1151514955;11...,1151128247,inuit,cree,aboriginal,...,history,claims,missionaries,mohawk,nunavut,labrador,settlements,IR,False,True
2,2,328,1,https://proquest.com/docview/1151354973,https://proquest.com/docview/1400724651,1151354973;1147063763;1238483526;1237286806;11...,1400724651,meech,accord,constitutional,...,text,consensus,unanimity,inces,federalism,char,entrenched,IR,False,True
3,3,285,2,https://proquest.com/docview/2459693839,https://proquest.com/docview/1366203438,2459693839;1411691058;2606138951;1411691065;11...,1366203438;1237575431,museum,artifacts,gallery,...,crafts,celebration,chest,chalmers,ancient,lery,archives,PRONE TO BE RELEVANT,True,True
4,4,279,1,https://proquest.com/docview/1368277382,https://proquest.com/docview/1237728590,1368277382;1371142585;2606154591;1434946157;11...,1237728590,film,films,movie,...,kunuk,tale,shot,produc,festivals,series,comic,PRONE TO BE RELEVANT,True,True


In [102]:
# A topic counts as relevant when it is labelled either 'R' or
# 'PRONE TO BE RELEVANT'; every other label is treated as irrelevant.
relevant_topic = df['topic relevance'].isin(['R', 'PRONE TO BE RELEVANT'])
# A topic is "with relevant" when either of the two boolean flags is set.
with_relevant = df['with_relevant'] | df['has_labeled_as_relevant']

not_relevant = ~relevant_topic
n_relevant = relevant_topic.sum()
n_irrelevant = not_relevant.sum()
n_irrelevant_with = (not_relevant & with_relevant).sum()
# De Morgan: (~relevant & ~with_relevant) == ~(relevant | with_relevant)
n_irrelevant_without = (~(relevant_topic | with_relevant)).sum()

print(f'Number of relevant topics                     {n_relevant}')
print(f'Number of irrelevant topics                   {n_irrelevant}')

print(f'Number of irrelevant topics with relevant:    {n_irrelevant_with}')
print(f'Number of irrelevant topics without relevant: {n_irrelevant_without}')
# Expected output for the current dataset:
#   Number of relevant topics                     18
#   Number of irrelevant topics                   37
#   Number of irrelevant topics with relevant:    20
#   Number of irrelevant topics without relevant: 17

Number of relevant topics                     18
Number of irrelevant topics                   37
Number of irrelevant topics with relevant:    20
Number of irrelevant topics without relevant: 17


In [140]:
# Per-document scores; the 'id' and 'new_score' columns are used below.
scores_df = pd.read_csv(scores_file)

# Lookup table: document id -> new_score.
# Built straight from the two columns instead of DataFrame.iterrows():
# iterrows() returns each row as a single-dtype Series, which upcasts the
# integer ids to float when mixed with the float scores, so the keys would be
# floats that only match int lookups through int/float hash equality.
# .tolist() yields plain Python scalars. Duplicate ids keep the last score,
# as before.
id2score = dict(zip(scores_df['id'].tolist(), scores_df['new_score'].tolist()))

scores_df.head()


Unnamed: 0,URL,relevant_or_suggested,confidence,id,new_score,title
0,https://proquest.com/docview/1151636504,rel,1.0,1151636504,1.0,Not just folklore--a tool for trade.
1,https://proquest.com/docview/1151316407,rel,1.0,1151316407,1.0,Display Ad 41 -- No Title.
2,https://proquest.com/docview/1366203438,rel,1.0,1366203438,1.0,Sounds and the city.
3,https://proquest.com/docview/1126309571,rel,1.0,1126309571,1.0,Unfamiliar liberty delights Iranians.
4,https://proquest.com/docview/1125764078,rel,1.0,1125764078,1.0,Private-school debate grows.


In [152]:
# Column-oriented accumulator for the output table: one list per column,
# one entry per (topic, document) pair.
data = {'id': [],
        'title': [],
        'topic relevance': [],
        'with relevant': [],
        'has_labeled_as_relevant': [],
        'is_labeled_relevant': [],
        'new_score': [],
       }

for topic_id in range(df.shape[0]):
    row = df.iloc[topic_id, :]
    topic_relevance = row['topic relevance']
    # Named differently from the `with_relevant` boolean mask computed in an
    # earlier cell so that this loop does not overwrite it.
    topic_with_relevant = row['with_relevant']
    has_labeled_as_relevant = row['has_labeled_as_relevant']

    # Both cells hold ';'-separated document ids. An empty CSV cell is read as
    # NaN (a float), which has no .split(), so treat it as "no documents".
    raw_suggestions = row['suggestions']
    suggestions = raw_suggestions.split(';') if isinstance(raw_suggestions, str) else []
    relevants = row['relevants'].split(';') if has_labeled_as_relevant else []

    doc_ids = suggestions + relevants
    # Every document must have a score; report exactly which ones do not.
    missing_ids = [id_ for id_ in doc_ids if int(id_) not in id2score]
    assert not missing_ids, f'Topic {topic_id}: no score found for ids {missing_ids}'

    # Suggestions come first (not labeled), followed by the labeled relevants.
    labeled_flags = [False] * len(suggestions) + [True] * len(relevants)
    for id_, is_labeled_relevant in zip(doc_ids, labeled_flags):
        data['id'].append(id_)
        data['new_score'].append(id2score[int(id_)])
        # Title lookup goes through the project's tdmstudio helpers.
        data['title'].append(tdmstudio.get_title(tdmstudio.get_filename(id_)))
        data['topic relevance'].append(topic_relevance)
        data['with relevant'].append(topic_with_relevant)
        data['has_labeled_as_relevant'].append(has_labeled_as_relevant)
        data['is_labeled_relevant'].append(is_labeled_relevant)

# Build the frame once, then both persist and display it.
final_suggestions_df = pd.DataFrame(data)
final_suggestions_df.to_csv(final_suggestions)
final_suggestions_df



Unnamed: 0,id,title,topic relevance,with relevant,has_labeled_as_relevant,is_labeled_relevant,new_score
0,1366203412,The birth of Toronto Style.,IR,False,True,False,0.100637
1,1412658808,ARCHITECTURE.,IR,False,True,False,0.076864
2,1371240574,STYLE SCUFFLE.,IR,False,True,False,0.077903
3,1139223026,Calatrava's sculptural architecture.,IR,False,True,False,0.079527
4,2606112512,Let's face it: Modernist architecture is history.,IR,False,True,False,0.084625
...,...,...,...,...,...,...,...
8096,1513507531,A community that enriches the Canadian perspec...,IR,False,False,False,0.173145
8097,1125553227,The hate stops here.,IR,False,False,False,0.118312
8098,1434944173,Letter to the Editor 1 -- No Title.,IR,False,False,False,0.151057
8099,1411679365,Canada's immigrant challenge.,IR,False,False,False,0.113697
