In [1]:
from utils.general import info, ok, warning

### Label Data

In [2]:
import os
import re
from utils.general import id2file

# Build the two label sets (ids of relevant / irrelevant articles) from four sources:
#   1. second-round labels CSV, 2. new_data CSV, 3. hand-picked DP example files,
#   4. every remaining .xml article in the GM DP-and-Canada directory (taken as irrelevant).
# Ids that cannot be resolved to a file by id2file() are dropped at the end.
# NOTE(review): only step 4 checks membership in relevant_set; an id labelled 'I' in a CSV
# that also shows up as relevant elsewhere would stay in both sets — confirm this cannot happen.

# Second-round labels: each row is "<id>;<label>", with label 'R' (relevant) or 'I' (irrelevant).
with open('second_roud_labels.csv', 'r') as labels_file:
    second_round_data = [linea.strip().split(';') for linea in labels_file.read().splitlines()]
irrelevant_set = {id_ for id_, label in second_round_data if label == 'I'}
relevant_set = {id_ for id_, label in second_round_data if label == 'R'}


# Loading new_data (same "<id>;<label>" layout; the label may carry surrounding whitespace)
with open('new_data.csv') as new_data_file:
    new_data = [line.split(';') for line in new_data_file.read().splitlines()]
relevant_set = relevant_set.union({id_ for id_, label in new_data if label.strip() == 'R'})
irrelevant_set = irrelevant_set.union({id_ for id_, label in new_data if label.strip() == 'I'})

# Loading original data
DP_examples_dirpath = '/home/ec2-user/SageMaker/mariano/notebooks/04. Model of DP/DP-relevant articles/'

first_data = []  # not used within this cell; kept in case other cells reference it
for dirpath, dirnames, filenames in os.walk(DP_examples_dirpath):
    for filename in filenames:
        # Close each example file as soon as it is read instead of leaking the handle.
        with open(os.path.join(dirpath, filename), 'r') as example_file:
            content = example_file.read()
        # Every "/docview/<id>/" occurrence in an example file contributes a relevant id.
        ids = re.findall('/docview/([^/]*)/', content)
        relevant_set.update(ids)

# Articles containing DP and Canada from that period that were not detected by Serperi
GM_dp_dirpath = '/home/ec2-user/SageMaker/data/GM_DP_and_Canada1945_1967/'

files = os.listdir(GM_dp_dirpath)

# file_[:-4] strips the ".xml" extension to recover the article id.
irrelevant_set = irrelevant_set.union(
    [file_[:-4] for file_ in files if file_[:-4] not in relevant_set and file_.endswith('.xml')]
)

# Drop ids that id2file() cannot resolve to a file.
not_found = [id_ for id_ in list(relevant_set) + list(irrelevant_set) if id2file(id_) is None]
print(f'Not found: {not_found}')
# A single set difference is enough; the previous per-id loop recomputed the same result.
missing_ids = set(not_found)
relevant_set = relevant_set.difference(missing_ids)
irrelevant_set = irrelevant_set.difference(missing_ids)

info(f'len(relevant_set)   = {len(relevant_set)}')
info(f'len(irrelevant_set) = {len(irrelevant_set)}')


Not found: ['1411697642', '1143160388', '1238204962', '2459964104', '2459666609', '1222379804', '1242257052', '1239753620', '1151348424', '2122281371', '1238440920', '1136691129', '2122279956']
2022-03-17 17:21:36.735657 [ [1;94mINFO[0m  ] len(relevant_set)   = 581
2022-03-17 17:21:36.735831 [ [1;94mINFO[0m  ] len(irrelevant_set) = 6523


In [3]:
import numpy as np
# Binary target vector: the first len(relevant_set) entries are 1 (relevant) and the
# remaining len(irrelevant_set) entries are 0, matching the
# list(relevant_set)+list(irrelevant_set) ordering used when the texts are loaded.
y = np.concatenate([
    np.ones(len(relevant_set)),
    np.zeros(len(irrelevant_set)),
])
y

array([1., 1., 1., ..., 0., 0., 0.])

### Models and Data

In [4]:
from sklearn.svm import SVC
from utils.models import glove300_vectorize, glove600_vectorize
from utils.tdmstudio import TDMStudio

# Fetch the (title, text) pair of every labelled article, relevant ids first and
# irrelevant ids after, then split the pairs into two parallel lists.
title_text_pairs = [
    TDMStudio.get_title_and_text(id2file(id_))
    for id_ in list(relevant_set)+list(irrelevant_set)
]
titles = [title for title, _ in title_text_pairs]
texts = [text for _, text in title_text_pairs]
    


In [5]:
%%time
data = []
data.append(glove300_vectorize(titles,texts))

CPU times: user 30min 1s, sys: 3min 55s, total: 33min 57s
Wall time: 34min


In [7]:

# Append the GloVe-600 representation as the next entry of `data`.
# Re-running this cell appends another copy, so run it once per fresh `data` list.
data.append(glove600_vectorize(titles,texts))

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from utils.models import tokenize
from utils.tdmstudio import TDMStudio
import spacy
import string
# spaCy pipeline used for tokenisation/lemmatisation; textcat, parser and ner are disabled.
nlp = spacy.load('en_core_web_sm', disable=['textcat', 'parser','ner'])

# Start from spaCy's default stop words and discard every entry that `tokenize` would split
# into at least one token that is not itself a stop word,
# e.g. ['‘ve', "'m", '’ve', "'ve", '’m', '‘m', '‘d', '‘ll'].
# At this point `tokenize` is the function imported from utils.models at the top of the cell.
stopwords = nlp.Defaults.stop_words
invalid = set(
    sw for sw in stopwords
    if any(token for token in tokenize(sw) if token not in stopwords)
)
stopwords = set(stopwords.difference(invalid))

# Bag-of-words vectorizer with L2-normalised term frequencies (use_idf=False disables IDF).
# `token_pattern` is deliberately not passed: scikit-learn ignores it whenever a custom
# `tokenizer` is supplied and emits "UserWarning: The parameter 'token_pattern' will not be
# used since 'tokenizer' is not None".
vectorizer = TfidfVectorizer(
    input='content',
    lowercase=True,
    preprocessor=None,
    tokenizer=tokenize,
    analyzer='word',
    stop_words=list(stopwords),
    ngram_range=(1, 1),   # lower and upper boundary of the range of n-values for the n-grams
    max_df=1.0,           # ignore terms with a document frequency strictly higher than this threshold
    min_df=0.001,         # ignore terms with a document frequency strictly lower than this threshold
    max_features=50000,   # keep only the top max_features terms ordered by term frequency across the corpus
    vocabulary=None,      # vocabulary is determined from the input documents
    norm='l2',
    use_idf=False,
)

def remove_punctuation(word):
    """Return `word` with every ASCII punctuation character and every space removed.

    Characters outside `string.punctuation` and ' ' (letters, digits, newlines,
    non-ASCII quotes, ...) are kept in their original order.
    """
    kept = []
    for char in word:
        if char in string.punctuation+' ':
            continue
        kept.append(char)
    return ''.join(kept)

def tokenize(str_):
    """Return the lowercase lemmas of `str_` as produced by the module-level spaCy pipeline `nlp`.

    A token is dropped when spaCy flags it as a stop word, when its lemma is purely
    numeric, when its lemma consists only of punctuation/spaces, or when nothing is
    left of the lemma after newline characters are removed.

    Note: this definition rebinds the name `tokenize` that the same cell imports from
    utils.models; the `vectorizer` created earlier in the cell was given the imported
    function, not this one.
    """
    tokens = []
    for word in nlp(str_):
        if word.is_stop:
            continue
        lemma = word.lemma_.lower()
        if lemma.isnumeric() or len(remove_punctuation(lemma)) == 0:
            continue
        cleaned = lemma.replace('\n', '')
        # Newlines are not punctuation, so a whitespace-only lemma such as '\n' passes the
        # check above and used to be returned as an empty-string token; skip it instead.
        if cleaned:
            tokens.append(cleaned)
    return tokens

# One document per article in the form "<title>. <text>"; fit the vectorizer on them and
# append the resulting term matrix as the next entry of `data`.
documents = [f'{title}. {text}' for title, text in zip(titles,texts)]
data.append(vectorizer.fit_transform(documents))

In [9]:
len(data)

3

In [20]:
# Hyper-parameters of one SVC per feature representation; every classifier is built with
# probability=True.
svc_settings = [
    dict(C=7, kernel='rbf'),             # GloVe300
    dict(C=1, degree=3, kernel='poly'),  # GloVe600
    dict(C=1, kernel='linear'),          # BOW + TF
]
models = [SVC(probability=True, **settings) for settings in svc_settings]


In [21]:
%%time
for X,model in zip(data,models):
    model.fit(X,y)

CPU times: user 3min 8s, sys: 178 ms, total: 3min 8s
Wall time: 3min 8s


### Applying to unlabeled data

In [6]:
from utils.general import info,ok,warning,id2file
import os
predictions_dirpath = './predictions/'

# Every "<id>_v3.p" file in the predictions directory names a candidate article;
# dropping the 5-character "_v3.p" suffix leaves the id, which id2file() maps to a path.
prediction_files = [f for f in os.listdir(predictions_dirpath) if f.endswith('_v3.p')]
possible_relevant = [id2file(f[:-5]) for f in prediction_files]
info(f'Number of possible relevant articles: {len(possible_relevant):,}')

# Sanity checks: every id resolved to a path, and every path is a plain str.
assert all(elem is not None for elem in possible_relevant)
assert all(type(elem)==str for elem in possible_relevant)
possible_relevant[0]

2022-03-17 12:42:27.356061 [ [1;94mINFO[0m  ] Number of possible relevant articles: 4,973


'/home/ec2-user/SageMaker/data/GM_all_1945_1956/1323603426.xml'

In [57]:
from utils.models import best_paragraph

import os

# Run best_paragraph on a single article with two vectorizer/classifier pairs
# (presumably paired by position: the TF vectorizer with models[2], GloVe-300 with models[0]).
# NOTE(review): `_glove300_vectorize` is not defined or imported in the cells shown here;
# confirm where it comes from before relying on a fresh-kernel run.
article_path = '/home/ec2-user/SageMaker/data/GM_all_1945_1956/1294154032.xml'
vectorizers = [vectorizer.transform ,_glove300_vectorize]
classifiers = [models[2], models[0]]
best_paragraph(article_path, vectorizers, classifiers)

NameError: name 'os' is not defined

In [50]:
# Show the fourth "_v3.p" prediction file name (order is whatever os.listdir returns).
[f for f in os.listdir('predictions') if f.endswith('_v3.p')][3]

'1294154032_v3.p'

In [51]:
# Resolve one article id to its file path via id2file.
id2file('1294154032')

'/home/ec2-user/SageMaker/data/GM_all_1945_1956/1294154032.xml'