In [1]:
# ---------------------------------------------------------------------------
# Notebook configuration: locations of the files this notebook reads/writes.
# ---------------------------------------------------------------------------

# Directory shared by the labeled-data input and the ranking output.
_multiculturalism_files = '/home/ec2-user/SageMaker/mariano/datasets/multiculturalism/files'

# --- Input files ---
# CSV of already-labeled documents.
labeled_datafile = _multiculturalism_files + '/labeled_data.csv'
# CSV exported from a previous session; its rows are the candidates to re-score.
unlabeled_datafile = (
    '/home/ec2-user/SageMaker/serperi/system/sessions/scal/'
    'One_second_round/data/exported_data_2022-11-23_00-18.csv'
)

# --- Output files ---
# CSV where the re-scored ranking is written.
new_ranking_file = _multiculturalism_files + '/new_scores.csv'


In [2]:
import sys
sys.path.append('/home/ec2-user/SageMaker/mariano/repositories/tdmstudio-high-recall-information-retrieval-system/')
from utils import io
from utils import tdmstudio

import re
import pandas as pd


######################################
# READING LABELED AND UNLABELED DATA #
######################################

# Prefix removed from every exported URL to recover the bare document id.
DOCVIEW_URL_PREFIX = 'https://proquest.com/docview/'

# Read labeled data (the code below uses its 'id' and 'label' columns).
labeled_df = pd.read_csv(labeled_datafile)

# Ids are compared against ids parsed out of URLs (strings), so cast them to str.
labeled_df['id'] = labeled_df['id'].astype(str)

# Everything has to be either Relevant or Irrelevant (not unknown).
assert labeled_df['label'].isin(['R', 'I']).all(), \
    "labeled data contains labels other than 'R' or 'I'"

# Reading unlabeled (to make predictions).
unlabeled_df = pd.read_csv(unlabeled_datafile)

# Building id from URL. The prefix is escaped so that '.' only matches a
# literal dot instead of acting as a regex wildcard.
docview_prefix_re = re.compile(re.escape(DOCVIEW_URL_PREFIX))
unlabeled_df['id'] = [docview_prefix_re.sub('', url) for url in unlabeled_df['URL']]


io.info(f'labeled_df.shape=       {labeled_df.shape}')
io.info(f'unlabeled_df.shape=     {unlabeled_df.shape}')
print()

# FILTERING, ONLY KEEPING SUGGESTIONS, NOT LABELED #
io.info(f'Removing labeled from suggestions (unlabeled) ...')
unlabeled_df = unlabeled_df[unlabeled_df['relevant_or_suggested'] == 'sugg']
io.info(f'new unlabeled_df.shape= {unlabeled_df.shape}')

# REMOVING LABELED DURING EVALUATION #
io.info(f'Removing labeled from unlabeled ...')
labeled_ids = set(labeled_df['id'])
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignments below (and in later cells) do not raise
# SettingWithCopyWarning.
unlabeled_df = unlabeled_df[~unlabeled_df['id'].isin(labeled_ids)].copy()
io.info(f'new unlabeled_df.shape= {unlabeled_df.shape}')

########################################
# READING TITLE AND TEXT FOR ALL ITEMS #
########################################
print()
io.info('Appending texts ...')
# One tdmstudio lookup per document id; the result is stored in a new 'text' column.
labeled_df['text'] = [
    tdmstudio.get_title_and_text(tdmstudio.get_filename(id_))
    for id_ in labeled_df['id']
]
unlabeled_df['text'] = [
    tdmstudio.get_title_and_text(tdmstudio.get_filename(id_))
    for id_ in unlabeled_df['id']
]
io.ok('Done')


2022-12-08 16:24:37.265471 [ [1;94mINFO[0m  ] labeled_df.shape=       (737, 2)
2022-12-08 16:24:37.266012 [ [1;94mINFO[0m  ] unlabeled_df.shape=     (8176, 4)

2022-12-08 16:24:37.266171 [ [1;94mINFO[0m  ] Removing labeled from suggestions (unlabeled) ...
2022-12-08 16:24:37.268820 [ [1;94mINFO[0m  ] new unlabeled_df.shape= (8115, 4)
2022-12-08 16:24:37.268935 [ [1;94mINFO[0m  ] Removing labeled from unlabeled ...
2022-12-08 16:24:37.272126 [ [1;94mINFO[0m  ] new unlabeled_df.shape= (8008, 4)

2022-12-08 16:24:37.272316 [ [1;94mINFO[0m  ] Appending texts ...
2022-12-08 16:24:46.970628 [  [1;92mOK[0m   ] Done


In [3]:
##################
# CREATING MODEL #
##################
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from utils import nlp_auxiliary
import spacy

nlp = spacy.load('en_core_web_sm')
# Keep only the spaCy stop words that the preprocessor leaves unchanged, so the
# stop-word list matches the tokens the vectorizer sees after preprocessing.
stopwords = {stopword for stopword in nlp.Defaults.stop_words if stopword==nlp_auxiliary.preprocessor(stopword)}
vectorizer = TfidfVectorizer(lowercase=True,
                             preprocessor=nlp_auxiliary.preprocessor,
                             # Passed as a list: recent scikit-learn releases
                             # validate this parameter and reject a set.
                             stop_words=sorted(stopwords),
                             ngram_range=(1,3),
                             max_features=10000,
                             use_idf=True,
                             smooth_idf=True,
                            )

io.info('Creating labeled articles representation ...')
# The vocabulary and IDF weights are fitted on the labeled texts only.
X = vectorizer.fit_transform(labeled_df['text'])
# Binary target: 1 for relevant ('R'), 0 otherwise.
y = np.array([1 if label=='R' else 0 for label in labeled_df['label']])

io.info('Training model ...')
model = LogisticRegression()
model.fit(X,y)


io.info('Making predictions ...')
yhat = model.predict_proba(vectorizer.transform(unlabeled_df['text']))

##################################
# Adding title and removing text #
##################################
# .assign/.drop build a new frame rather than adding columns in place, which
# avoids SettingWithCopyWarning when unlabeled_df is a filtered slice.
# yhat[:, 1] is the predicted probability of the positive class (label 1).
unlabeled_df = (
    unlabeled_df
    .assign(
        new_score=yhat[:, 1],
        title=[tdmstudio.get_title(tdmstudio.get_filename(id_)) for id_ in unlabeled_df['id']],
    )
    .drop(columns=['text'])
)


# Concatenating labeled to suggestions
relevant_labeled_df = labeled_df[labeled_df['label']=='R'].copy()
io.info(f'Relevant labeled= {relevant_labeled_df.shape[0]}')
relevant_labeled_df['URL'] = [f'https://proquest.com/docview/{id_}' for id_ in relevant_labeled_df['id']]
relevant_labeled_df['relevant_or_suggested'] = 'rel'
# Documents already labeled relevant get the maximum confidence and score.
relevant_labeled_df['confidence'] = 1.0
relevant_labeled_df['new_score'] = 1.0
relevant_labeled_df['title'] = [tdmstudio.get_title(tdmstudio.get_filename(id_)) for id_ in relevant_labeled_df['id']]

# Selecting the output columns also discards 'label' and 'text'.
relevant_labeled_df = relevant_labeled_df[['URL','relevant_or_suggested', 'confidence', 'id', 'new_score','title']]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Relevant labeled rows come first, followed by the scored suggestions.
concatenated = pd.concat([relevant_labeled_df, unlabeled_df])


# DUMP TO DISK #
concatenated.to_csv(new_ranking_file, index=False)

io.ok('Done')

2022-12-08 15:42:30.367488 [ [1;94mINFO[0m  ] Creating labeled articles representation ...
2022-12-08 15:43:18.630495 [ [1;94mINFO[0m  ] Training model ...
2022-12-08 15:43:18.725274 [ [1;94mINFO[0m  ] Making predictions ...
2022-12-08 15:50:28.521705 [  [1;92mOK[0m   ] Done
