In [2]:
from utils.general import info, ok, warning, id2file

### Label Data

In [3]:
import os
import re
relevant_set = set()
irrelevant_set = set()

# Loading new_data
new_data = [line.split(';') for line in open('new_data.csv').read().splitlines()]
relevant_set = relevant_set.union(set([id_ for id_,label in new_data if label.strip()=='R']))
irrelevant_set = irrelevant_set.union(set([id_ for id_,label in new_data if label.strip()=='I']))

# Loading original data
DP_examples_dirpath = '/home/ec2-user/SageMaker/mariano/notebooks/04. Model of DP/DP-relevant articles/'

first_data = []
for dirpath, dirnames, filenames in os.walk(DP_examples_dirpath):
    for filename in filenames:
        content = open(os.path.join(dirpath,filename),'r').read()
        ids = re.findall('/docview/([^/]*)/',content)
        relevant_set = relevant_set.union(set(ids))
    
# articles containg DP and Canada from that period, that were not deteted by Serperi
GM_dp_dirpath = '/home/ec2-user/SageMaker/data/GM_DP_and_Canada1945_1967/'

files = os.listdir(GM_dp_dirpath)

irrelevant_set = irrelevant_set.union([file_[:-4] for file_ in files if file_[:-4] not in relevant_set and file_.endswith('.xml')])

not_found=[]
for id_ in list(relevant_set)+list(irrelevant_set):
    if id2file(id_) is None:
        not_found.append(id_)
print(f'Not found: {not_found}')
for id_ in not_found:
    relevant_set = relevant_set.difference(set(not_found))
    irrelevant_set = irrelevant_set.difference(set(not_found))
    
info(f'len(relevant_set)   = {len(relevant_set)}')
info(f'len(irrelevant_set) = {len(irrelevant_set)}')


Not found: ['1222379804', '1143160388', '1238440920', '1151348424', '2459964104', '1136691129', '1242257052', '1411697642', '2459666609', '2122281371', '1239753620', '2122279956', '1238204962']
2022-03-14 16:25:58.469924 [ [1;94mINFO[0m  ] len(relevant_set)   = 542
2022-03-14 16:25:58.470090 [ [1;94mINFO[0m  ] len(irrelevant_set) = 6478


In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer
from utils.models import tokenize
from utils.tdmstudio import TDMStudio
import spacy
import string
nlp = spacy.load('en_core_web_sm', disable=['textcat', 'parser','ner'])

stopwords = nlp.Defaults.stop_words
invalid = set([sw for sw in stopwords if any([token for token in tokenize(sw) if not token in stopwords ])]) # ['‘ve', "'m", '’ve', "'ve", '’m', '‘m', '‘d', '‘ll']
stopwords = set(stopwords.difference(invalid)) 
vectorizer = TfidfVectorizer(
                             input='content',
                             lowercase=True,
                             preprocessor=None,
                             tokenizer=tokenize,
                             analyzer='word',
                             stop_words=list(stopwords),
                             token_pattern=r"(?u)\b\w\w+\b", #selects tokens of 2 or more alphanumeric char (punctuation is completely ignored and treated as token separator)
                                                             # UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None
    
                             ngram_range=(1,1), #lower and upper boundary of the range of n-values for different n-grams.
                             max_df=1.0, #ignore terms that have a document frequency strictly higher than given threshold
                             min_df=0.001, #ignore terms that have a document frequency strictly lower than given threshold
                             max_features=50000, #build a vocabulary that only considers the top max_features ordered by term frequency acoss the corpus
                             vocabulary=None, #vocabulary is determined from the input documents
                             norm='l2',
                             use_idf=False,
                             )



def remove_punctuation(word):
    return ''.join([char for char in word if not char in string.punctuation+' '])

def tokenize(str_):
    tokens = [word.lemma_.lower() for word in nlp(str_) if not word.is_stop]
    tokens = [word.replace('\n', '') for word in tokens if not word.isnumeric() and len(remove_punctuation(word))!=0]
    return tokens






In [None]:
corpus = [TDMStudio.get_title_and_text(id2file(id_)) for id_ in list(relevant_set)+list(irrelevant_set)]
corpus = [f'{title}. {text}' for title,text in corpus]

In [82]:
info('Starting fit...')
X = vectorizer.fit_transform(corpus)
info('Getting vocab...')
vocab = vectorizer.vocabulary_
# info('Creatin X...')


2022-03-15 16:32:44.702214 [ [1;94mINFO[0m  ] Starting fit...
2022-03-15 17:02:53.554235 [ [1;94mINFO[0m  ] Getting vocab...


In [79]:
# ## TMP TF-DF
# from sklearn.model_selection import cross_validate

# cv_results = cross_validate(SVC(C=45,kernel='linear'),X,y,cv=5,scoring=['f1','precision','recall','accuracy'])
# for metric in cv_results:
#     print(f'{metric:10}: {np.average(cv_results[metric]):5.4f}')

fit_time  : 25.1135
score_time: 5.3105
test_f1   : 0.8641
test_precision: 0.8953
test_recall: 0.8357
test_accuracy: 0.9798


In [80]:
# ## TMP TF-IDF
# from sklearn.model_selection import cross_validate
# clf = LogisticRegression(
#                          C=75,   
#                          penalty='l2',
#                          dual=False, # Prefer dual=False when n_samples>n_features
#                          tol=1e-4,  # tolerance
#                          fit_intercept=True, 
#                          intercept_scaling=1,
#                          class_weight=None,
#                          solver='lbfgs', #
#                          n_jobs=3,
#                         )
# cv_results = cross_validate(clf,X,y,cv=5,scoring=['f1','precision','recall','accuracy'])
# for metric in cv_results:
#     print(f'{metric:10}: {np.average(cv_results[metric]):5.4f}')

fit_time  : 4.4411
score_time: 0.0134
test_f1   : 0.8609
test_precision: 0.9080
test_recall: 0.8190
test_accuracy: 0.9796


In [71]:
len(vocab)

50000

In [None]:
vocab[:10]

In [72]:
import numpy as np
y = np.zeros(shape=(X.shape[0],))
y[:len(relevant_set)]=1
y

array([1., 1., 1., ..., 0., 0., 0.])

### GridSearch SVM

In [73]:
import pandas as pd

from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
parameters = {'C':[1, 5, 7, 12, 15, 30, 45]}
svc = SVC(kernel='linear')

# parameters = {'C':[1,2,3,4,5,6], 'degree':[1,2,3,4,5,6,7]}
# svc = SVC(kernel='poly')
clf = GridSearchCV(svc, parameters,scoring='f1', cv=5,verbose=4)

results = clf.fit(X,y)
pd.DataFrame(results.cv_results_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END ...............................C=1;, score=0.861 total time=  21.3s
[CV 2/5] END ...............................C=1;, score=0.914 total time=  21.9s
[CV 3/5] END ...............................C=1;, score=0.875 total time=  21.6s
[CV 4/5] END ...............................C=1;, score=0.866 total time=  21.1s
[CV 5/5] END ...............................C=1;, score=0.865 total time=  21.9s
[CV 1/5] END ...............................C=5;, score=0.874 total time=  21.6s
[CV 2/5] END ...............................C=5;, score=0.906 total time=  22.6s
[CV 3/5] END ...............................C=5;, score=0.845 total time=  21.5s
[CV 4/5] END ...............................C=5;, score=0.873 total time=  22.5s
[CV 5/5] END ...............................C=5;, score=0.873 total time=  21.9s
[CV 1/5] END ...............................C=7;, score=0.864 total time=  21.6s
[CV 2/5] END ...............................C=7;,

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,17.883109,0.295611,3.678323,0.065936,1,{'C': 1},0.861244,0.914286,0.875,0.865672,0.865385,0.876317,0.019511,1
1,18.218803,0.394813,3.787276,0.092116,5,{'C': 5},0.874419,0.90566,0.84507,0.872549,0.873239,0.874188,0.019194,2
2,18.262239,0.343663,3.785404,0.109549,7,{'C': 7},0.86385,0.90566,0.84507,0.872549,0.85446,0.868318,0.020808,3
3,18.327838,0.36029,3.79764,0.113616,12,{'C': 12},0.86385,0.90566,0.84507,0.872549,0.85446,0.868318,0.020808,3
4,18.366613,0.355406,3.801493,0.108603,15,{'C': 15},0.86385,0.90566,0.84507,0.872549,0.85446,0.868318,0.020808,3
5,18.325565,0.348706,3.801071,0.108677,30,{'C': 30},0.86385,0.90566,0.84507,0.872549,0.85446,0.868318,0.020808,3
6,18.316983,0.36357,3.798982,0.118777,45,{'C': 45},0.86385,0.90566,0.84507,0.872549,0.85446,0.868318,0.020808,3


In [45]:
import pandas as pd
pd.DataFrame(results.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,23.965991,0.181866,4.984778,0.057421,0.5,linear,"{'C': 0.5, 'kernel': 'linear'}",0.833333,0.908213,0.841584,0.834171,0.831683,0.849797,0.029405,15
1,38.46213,0.20371,7.332617,0.124069,0.5,rbf,"{'C': 0.5, 'kernel': 'rbf'}",0.763441,0.833333,0.797872,0.758242,0.802139,0.791005,0.027554,16
2,88.925486,0.840081,20.795889,0.516221,0.5,poly,"{'C': 0.5, 'kernel': 'poly'}",0.036036,0.018182,0.053571,0.071429,0.054054,0.046654,0.01811,24
3,22.939071,0.157769,4.811371,0.107609,2.0,linear,"{'C': 2, 'kernel': 'linear'}",0.847619,0.915094,0.847619,0.866667,0.837438,0.862888,0.027762,6
4,39.243314,0.371683,7.544606,0.179221,2.0,rbf,"{'C': 2, 'kernel': 'rbf'}",0.830918,0.898551,0.854369,0.838384,0.834171,0.851278,0.024971,8
5,88.942258,1.092317,20.838665,0.520077,2.0,poly,"{'C': 2, 'kernel': 'poly'}",0.444444,0.439716,0.37037,0.4,0.464789,0.423864,0.034002,17
6,23.024777,0.388411,4.823193,0.090904,4.0,linear,"{'C': 4, 'kernel': 'linear'}",0.857143,0.904762,0.846154,0.872038,0.833333,0.862686,0.02459,7
7,39.206195,0.349346,7.587852,0.18947,4.0,rbf,"{'C': 4, 'kernel': 'rbf'}",0.825243,0.893204,0.84878,0.85,0.834171,0.85028,0.023372,9
8,88.421095,0.765426,20.816134,0.519039,4.0,poly,"{'C': 4, 'kernel': 'poly'}",0.444444,0.439716,0.37037,0.4,0.464789,0.423864,0.034002,17
9,22.664602,0.177679,4.801197,0.103177,6.0,linear,"{'C': 6, 'kernel': 'linear'}",0.851675,0.909953,0.846154,0.872038,0.84058,0.86408,0.025279,1


In [74]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(SVC(C=45,kernel='linear'),X,y,cv=5,scoring=['f1','precision','recall','accuracy'])
for metric in cv_results:
    print(f'{metric:10}: {np.average(cv_results[metric]):5.4f}')

fit_time  : 18.0183
score_time: 3.6813
test_f1   : 0.8763
test_precision: 0.9192
test_recall: 0.8376
test_accuracy: 0.9818


### GridSearch Logistic Regression

In [83]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(
                         penalty='l2',
                         dual=True, # Prefer dual=False when n_samples>n_features
                         tol=1e-4,  # tolerance
#                          C=1,       # Regularization strength. Smaller value specify stronger regularization
                         fit_intercept=True, 
                         intercept_scaling=1,
                         class_weight=None,
                         solver='lbfgs', #
                         n_jobs=3,
                        )

In [84]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

parameters = {'C':[0.5, 1,2,5,10,20,50,100, 200,500]}
clf = LogisticRegression(
                         penalty='l2',
                         dual=False, # Prefer dual=False when n_samples>n_features
                         tol=1e-4,  # tolerance
                         fit_intercept=True, 
                         intercept_scaling=1,
                         class_weight=None,
                         solver='lbfgs', #
                         n_jobs=3,
                        )

# parameters = {'C':[1,2,3,4,5,6], 'degree':[1,2,3,4,5,6,7]}
# svc = SVC(kernel='poly')
clf = GridSearchCV(clf, parameters,scoring='f1', cv=5,verbose=4)

results = clf.fit(X,y)
pd.DataFrame(results.cv_results_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END .............................C=0.5;, score=0.783 total time=   2.0s
[CV 2/5] END .............................C=0.5;, score=0.822 total time=   1.5s
[CV 3/5] END .............................C=0.5;, score=0.832 total time=   1.8s
[CV 4/5] END .............................C=0.5;, score=0.832 total time=   1.5s
[CV 5/5] END .............................C=0.5;, score=0.833 total time=   2.1s
[CV 1/5] END ...............................C=1;, score=0.826 total time=   1.8s
[CV 2/5] END ...............................C=1;, score=0.873 total time=   2.4s
[CV 3/5] END ...............................C=1;, score=0.856 total time=   1.5s
[CV 4/5] END ...............................C=1;, score=0.828 total time=   1.9s
[CV 5/5] END ...............................C=1;, score=0.857 total time=   1.8s
[CV 1/5] END ...............................C=2;, score=0.845 total time=   2.3s
[CV 2/5] END ...............................C=2;

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END .............................C=100;, score=0.909 total time=   5.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END .............................C=100;, score=0.865 total time=   5.2s
[CV 4/5] END .............................C=100;, score=0.873 total time=   4.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END .............................C=100;, score=0.871 total time=   4.8s
[CV 1/5] END .............................C=200;, score=0.867 total time=   3.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END .............................C=200;, score=0.909 total time=   5.0s
[CV 3/5] END .............................C=200;, score=0.861 total time=   4.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END .............................C=200;, score=0.873 total time=   5.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END .............................C=200;, score=0.863 total time=   4.8s
[CV 1/5] END .............................C=500;, score=0.872 total time=   4.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END .............................C=500;, score=0.919 total time=   4.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END .............................C=500;, score=0.857 total time=   5.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END .............................C=500;, score=0.873 total time=   4.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END .............................C=500;, score=0.863 total time=   5.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.770763,0.241706,0.010311,0.002912,0.5,{'C': 0.5},0.783069,0.822335,0.832487,0.831579,0.833333,0.820561,0.01916,10
1,1.868165,0.295042,0.010167,0.003758,1.0,{'C': 1},0.825871,0.872549,0.855721,0.828283,0.857143,0.847913,0.018024,9
2,2.238873,0.09395,0.008635,0.001408,2.0,{'C': 2},0.84466,0.884615,0.858537,0.851485,0.864078,0.860675,0.01364,8
3,2.334587,0.289032,0.01027,0.002736,5.0,{'C': 5},0.850242,0.898551,0.865385,0.851485,0.864078,0.865948,0.017451,7
4,2.841242,0.179013,0.009757,0.002278,10.0,{'C': 10},0.855769,0.898551,0.870813,0.866995,0.864078,0.871241,0.014524,6
5,3.817904,0.419928,0.00791,0.000234,20.0,{'C': 20},0.855769,0.898551,0.859903,0.871287,0.873786,0.871859,0.014955,5
6,4.104036,0.20691,0.014064,0.006641,50.0,{'C': 50},0.855769,0.909091,0.865385,0.876847,0.875,0.876418,0.017988,2
7,4.816003,0.277524,0.008611,0.00125,100.0,{'C': 100},0.861244,0.909091,0.865385,0.872549,0.870813,0.875816,0.017112,3
8,4.640356,0.424393,0.008966,0.001205,200.0,{'C': 200},0.866667,0.909091,0.861244,0.872549,0.862559,0.874422,0.017776,4
9,4.899086,0.141858,0.013771,0.008593,500.0,{'C': 500},0.872038,0.919431,0.857143,0.872549,0.862559,0.876744,0.022124,1


In [86]:
from sklearn.model_selection import cross_validate
clf = LogisticRegression(
                         C=500,   
                         penalty='l2',
                         dual=False, # Prefer dual=False when n_samples>n_features
                         tol=1e-4,  # tolerance
                         fit_intercept=True, 
                         intercept_scaling=1,
                         class_weight=None,
                         solver='lbfgs', #
                         n_jobs=3,
                         max_iter=10000
                        )
cv_results = cross_validate(clf,X,y,cv=5,scoring=['f1','precision','recall','accuracy'])
for metric in cv_results:
    print(f'{metric:10}: {np.average(cv_results[metric]):5.4f}')

fit_time  : 5.0364
score_time: 0.0222
test_f1   : 0.8767
test_precision: 0.9092
test_recall: 0.8468
test_accuracy: 0.9816


### Distil-BERT embeddigs

In [87]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

ConnectionError: HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /api/models/sentence-transformers/distilbert-base-nli-mean-tokens (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fcc867de3d0>: Failed to establish a new connection: [Errno 101] Network is unreachable'))

In [None]:
X = model.enconde(corpus,show_progress_bar=True)