In [1]:
################
# INPUT FILES  #
################
# CSV with the manually labeled articles. It is read with pd.read_csv in the
# "READING LABELED AND UNLABELED DATA" cell, which uses its `id` and `label` columns.
labeled_datafile = '/home/ec2-user/SageMaker/mariano/datasets/multiculturalism/files/labeled_data.csv'
# CSV exported from a previous session.
# NOTE(review): in the visible notebook this path is only referenced by the
# commented-out cell that builds `unlabeled_df`; the prediction loop reads
# '8k.csv' / '17k.csv' instead. Confirm whether it is still needed.
unlabeled_datafile = '/home/ec2-user/SageMaker/serperi/system/sessions/scal/One_second_round/data/exported_data_2022-11-23_00-18.csv'

################
# OUTPUT FILES #
################
# NOTE(review): no output paths are configured here; the prediction loop derives
# its output names from the input file names and writes to the working directory.


In [2]:
import sys
sys.path.append('/home/ec2-user/SageMaker/mariano/repositories/tdmstudio-high-recall-information-retrieval-system/')
from utils import io
from utils import tdmstudio

import re
import pandas as pd

In [4]:
######################################
# READING LABELED AND UNLABELED DATA #
######################################

# Read the manually labeled articles; the `id` and `label` columns are used below.
labeled_df = pd.read_csv(labeled_datafile)

# Change ID to str type (the ids are passed to tdmstudio.get_filename below).
labeled_df['id'] = labeled_df['id'].map(str)

# Everything has to be either Relevant ('R') or Irrelevant ('I') (not unknown).
# Report the offending values so a failure is actionable.
invalid_labels = set(labeled_df['label']) - {'R', 'I'}
assert not invalid_labels, f'Unexpected labels in labeled data: {invalid_labels}'

# Attach the title+text of every labeled article; this is the slow step of the
# cell (one tdmstudio lookup per row).
io.info('Appending texts ...')
labeled_df['text'] = [
    tdmstudio.get_title_and_text(tdmstudio.get_filename(id_))
    for id_ in labeled_df['id']
]
io.ok('Done')

labeled_df.head()

2023-02-24 13:55:38.353905 [ [1;94mINFO[0m  ] Appending texts ...
2023-02-24 13:56:04.052687 [  [1;92mOK[0m   ] Done


Unnamed: 0,id,label,text
0,1151636504,R,Not just folklore--a tool for trade.\n ...
1,1237806961,I,Multiculturalism fine in theory.\n \n...
2,1269975477,I,CRTC deflates promoters of multicultural TV ch...
3,1143676333,I,The myth of Canada as cultural mosaic.\n ...
4,1143862565,I,Display Ad 19 -- No Title.\n \n ...


In [None]:






# NOTE(review): this whole cell is commented out and has no execution count.
# It built `unlabeled_df` from `unlabeled_datafile`; the prediction loop further
# down reads its ids from '8k.csv' / '17k.csv' instead. Kept for reference only.

# # Reading unlabeled (to make predictions)
# unlabeled_df = pd.read_csv(unlabeled_datafile)

# # Building id from URL
# unlabeled_df['id'] = [re.sub('https://proquest.com/docview/','' ,url ) for url in unlabeled_df['URL']]


# io.info(f'labeled_df.shape=       {labeled_df.shape}')
# io.info(f'unlabeled_df.shape=     {unlabeled_df.shape}')
# print()

# # FILTERING, ONLY KEEPING SUGGESTIONS, NOT LABELED #
# io.info(f'Removing labeled from suggestions (unlabeled) ...')
# unlabeled_df=unlabeled_df[unlabeled_df['relevant_or_suggested']=='sugg']
# io.info(f'new unlabeled_df.shape= {unlabeled_df.shape}')

# # REMOVING LABELED DURING EVALUATION #
# io.info(f'Removing labeled from unlabeled ...')
# labeled_ids=set(labeled_df['id'])
# unlabeled_df = unlabeled_df[[id_ not in labeled_ids for id_ in unlabeled_df['id']]]
# io.info(f'new unlabeled_df.shape= {unlabeled_df.shape}')

# ########################################
# # READING TITLE AND TEXT FOR ALL ITEMS #
# ########################################
# print()

# unlabeled_df['text']=[tdmstudio.get_title_and_text(tdmstudio.get_filename(id_)) for id_ in unlabeled_df['id']]


In [5]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from utils import nlp_auxiliary
import spacy

nlp = spacy.load('en_core_web_sm')

# Keep only the spaCy stop words that the project preprocessor leaves unchanged,
# presumably so the stop list stays consistent with the preprocessed text.
# scikit-learn documents `stop_words` as a list and recent releases validate the
# parameter type, so pass a (sorted, hence deterministic) list rather than a set.
stopwords = sorted(
    stopword
    for stopword in nlp.Defaults.stop_words
    if stopword == nlp_auxiliary.preprocessor(stopword)
)

# TF-IDF over uni-, bi- and tri-grams, capped at the 10000 most frequent features.
# NOTE(review): per the scikit-learn docs a custom `preprocessor` overrides the
# built-in lowercase/strip_accents stage, so `lowercase=True` likely has no effect
# here -- confirm that nlp_auxiliary.preprocessor lowercases on its own.
vectorizer = TfidfVectorizer(lowercase=True,
                             preprocessor=nlp_auxiliary.preprocessor,
                             stop_words=stopwords,
                             ngram_range=(1,3),
                             max_features=10000,
                             use_idf=True,
                             smooth_idf=True,
                            )

io.info('Creating labeled articles representation ...')
# Fit the vocabulary/IDF on the labeled texts only; the same fitted vectorizer
# is reused later to transform the unlabeled articles.
X = vectorizer.fit_transform(labeled_df['text'])
# Binary target: 1 for Relevant ('R'), 0 for everything else (i.e. 'I').
y = np.array([1 if label=='R' else 0 for label in labeled_df['label']])

io.info('Training model ...')
# Default hyper-parameters, matching the recorded output `LogisticRegression()`.
# The previous `LogisticRegression(keep_pr)` referenced an undefined name and
# raised NameError on a fresh kernel.
model = LogisticRegression()
model.fit(X,y)

2023-02-24 13:59:15.677404 [ [1;94mINFO[0m  ] Creating labeled articles representation ...
2023-02-24 14:00:14.817065 [ [1;94mINFO[0m  ] Training model ...


LogisticRegression()

In [32]:
# Score every article listed in each id file with the trained model and write
# the ids plus predicted relevance probability next to the input file name.
for file_ in ['8k.csv', '17k.csv']:
    # The input files have no header row; the single column is named `id` below.
    unlabeled = pd.read_csv(file_,header=None)
    print(f'Working with file={file_} (shape={unlabeled.shape})')
    unlabeled.columns=['id']

    print('Adding text ...')
    unlabeled['text']=[tdmstudio.get_title_and_text(tdmstudio.get_filename(str(id_))) for id_ in unlabeled['id']]

    print('Building X')
    # NOTE: this rebinds `X`, replacing the training matrix created when the
    # model was fitted. Only transform (not fit) so the training vocabulary is used.
    X = vectorizer.transform(unlabeled['text'])
    print('prediction')
    yhat = model.predict_proba(X)
    # Column 1 is the probability of class 1, i.e. the 'R' (relevant) label.
    unlabeled['yhat']=yhat[:,1]

    # Strip only the final extension so names/paths containing other dots
    # (e.g. './8k.csv') still produce a sensible output name.
    output_file = file_.rsplit('.', 1)[0]+'_with_predictions'+'.csv'

    print('Droping text ...')
    # The article text is only needed for vectorizing; keep the output file small.
    unlabeled=unlabeled.drop(columns=['text'])
    print(f'Output to {output_file}')

    # NOTE(review): the DataFrame index is written as an unnamed first column;
    # pass index=False if downstream readers do not expect it.
    unlabeled.to_csv(output_file)
    print('---')

Working with file=17k.csv (shape=(17014, 1))
Adding text ...
Building X
prediction
Droping text ...
Output to 17k_with_predictions.csv
---


In [29]:
# Scratch check: shape of whatever `unlabeled` frame is currently in the kernel.
# NOTE(review): depends on kernel state left by the prediction loop; the recorded
# execution counts are out of order, so the displayed value may be stale.
unlabeled.shape

(8115, 2)

In [30]:
# Scratch check: show the most recently assigned output file name.
output_file

'17k_with_predictions.csv'

In [16]:
# Scratch check of how an output file name is derived from an input file name:
# everything before the first '.' plus the '_with_predictions.csv' suffix.
file_ = '17k.csv'
output_file = '{}_with_predictions.csv'.format(file_.split('.')[0])
output_file

'17k_with_predictions.csv'