This Jupyter notebook file will concern itself with the Topic Classification part of the project. Specifically, the **textcat** pipeline that will be added to our spaCy model.

In [12]:
import spacy

nlp = spacy.load("en_core_web_lg")

# Test the model on some text
doc = nlp("This is a test article about technology and politics.")
doc.cats = ["label1", "label3", "label5", "label10", "label20"]
print(doc.cats)

['label1', 'label3', 'label5', 'label10', 'label20']


In [3]:
from datasets import load_dataset

dataset = load_dataset("reuters21578", 'ModApte')

test_data = dataset['train']
test_data = dataset['test']
test_data = dataset['unused']

Found cached dataset reuters21578 (C:/Users/Genis/.cache/huggingface/datasets/reuters21578/ModApte/1.0.0/98a2ad6a0242627562db83992f9625261854c40a88619322596153a5a16a206c)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
example_text = test_data[244]['text']
example_topics = test_data[244]['topics']
print(example_text, example_topics)

The European Community Commission said
it has opened an enquiry into allegations that the Soviet Union
is dumping mercury on the European market at below-cost prices.
    The Commission said its decision follows a complaint from
EC non-ferrous metals producers that the sales of Soviet
mercury were harming their business and threatening jobs in the
European industry.
    According to the complaint, Soviet mercury sales in the EC
had risen from zero in recent years to 100 tonnes between
August and October last year and threaten to capture 25 pct of
the EC market if they continue at the same pace.
    The industry said the mercury was being sold at more than
40 pct below prices charged by EC producers, forcing them to
cut their prices to levels that no longer covered costs. The
imports had caused producers heavy financial losses, it said.
    The Commission said the industry would probably be unable
to hold prices at current levels and that any increase would
result in loss of sales and j

In [27]:
all_topics = {}

for sample in test_data:
    for topic in sample['topics']:
        all_topics[topic] = 0
        
for sample in test_data:
    for topic in sample['topics']:
        all_topics[topic] = 0
        
for sample in test_data:
    for topic in sample['topics']:
        all_topics[topic] = 0        

In [61]:
import pandas as pd
from sklearn.utils import shuffle


with open('data/pubmed200k_train.txt', 'r') as f:
    rows = []
    for line in f:
        # Split each line into columns based on the tab delimiter
        columns = line.strip().split('\t')
        rows.append(columns)
    
# Create a pandas DataFrame from the list of rows
df = pd.DataFrame(rows, columns=['Section', 'Text'])
df = df.dropna()
df = shuffle(df)
df

Unnamed: 0,Section,Text
948866,CONCLUSIONS,These findings support the American College of...
1104766,OBJECTIVE,Insulin resistance is associated with abnormal...
1945806,METHODS,Data were analysed with PC20 as the dependent ...
1087697,RESULTS,"Overall , the difference in photopic and mesop..."
1337463,RESULTS,All control subjects underwent overnight polys...
...,...,...
645464,METHODS,The psycho-educational program consisted of te...
648850,METHODS,Propofol was infused using a target-controlled...
53760,BACKGROUND,The solubility of valsartan is dependent on pH...
928495,METHODS,"Seven-week exposure ( 100 , 150 , 200mg/day ) ..."


In [67]:
from spacy.tokens import DocBin
import spacy

nlp = spacy.blank("en")

all_topics = {'CONCLUSIONS': 0, 'RESULTS': 0, 'METHODS': 0, 'BACKGROUND': 0, 'OBJECTIVE': 0}
    
db = DocBin()
split_name = 'train_full_2'

for i in range(len(df.values) // 2, len(df.values)):
    doc = nlp(df.values[i][1])
    # print(doc)
    doc.cats = all_topics.copy()
    # print('BEFORE: ', doc.cats)
    doc.cats[df.values[i][0]] = 1
    # print('After: ', doc.cats)
    # print('')

    if (i % 10000 == 0):
        print("{}/{}".format(i, len(df.values)))  
    db.add(doc)
    

db.to_disk('./models/fine_tune/corpus/pubmed200k_{}.spacy'.format(split_name))    

1110000/2211861
1120000/2211861
1130000/2211861
1140000/2211861
1150000/2211861
1160000/2211861
1170000/2211861
1180000/2211861
1190000/2211861
1200000/2211861
1210000/2211861
1220000/2211861
1230000/2211861
1240000/2211861
1250000/2211861
1260000/2211861
1270000/2211861
1280000/2211861
1290000/2211861
1300000/2211861
1310000/2211861
1320000/2211861
1330000/2211861
1340000/2211861
1350000/2211861
1360000/2211861
1370000/2211861
1380000/2211861
1390000/2211861
1400000/2211861
1410000/2211861
1420000/2211861
1430000/2211861
1440000/2211861
1450000/2211861
1460000/2211861
1470000/2211861
1480000/2211861
1490000/2211861
1500000/2211861
1510000/2211861
1520000/2211861
1530000/2211861
1540000/2211861
1550000/2211861
1560000/2211861
1570000/2211861
1580000/2211861
1590000/2211861
1600000/2211861
1610000/2211861
1620000/2211861
1630000/2211861
1640000/2211861
1650000/2211861
1660000/2211861
1670000/2211861
1680000/2211861
1690000/2211861
1700000/2211861
1710000/2211861
1720000/2211861
1730000/

Test on document

In [1]:
import re
from nltk.tokenize import sent_tokenize
from process_single_document import extract_text_from_pdf, remove_references, try_finding_keywords

##### Document extraction and pre-processing (Removing Referencesn & Potentially finding Keywords) 
single_pdf = open('data/implementome_publications/test_miner/child_obesity_switzerland.pdf', 'rb')
doc_as_str, doc_as_list = extract_text_from_pdf(single_pdf)
# doc_as_str = extract_text_from_pdf_2(single_pdf)
doc_as_str = remove_references(doc_as_str)
doc_keywords = try_finding_keywords(doc_as_str)

###### Tokenization process of the string containing the entire document text
###### Regular Expression pattern removes '\n' characters and tries to concatenate words separated by a '-'
sentences = sent_tokenize(doc_as_str)
pattern = r'(?<![a-zA-Z])-|-(?![a-zA-Z])'
sentences = [re.sub(pattern, '', sentence.replace('\n', ' ')) for sentence in sentences]



Text from PDF File (11 pages) extracted successfully.
Original document character length: 44197
References removed, new document character length: 36152


In [13]:
import spacy

textcat_nlp = spacy.load('./models/pubmed-model-best')
docs = list(textcat_nlp.pipe(sentences))
resulted_topics = {'CONCLUSIONS': [], 'RESULTS': [], 'METHODS': [], 'BACKGROUND': [], 'OBJECTIVE': []}

for doc in docs:
    # print('Text: ', doc)
    # print('Classification: ', doc.cats)
    for label, score in doc.cats.items():
        if score >= 0.5:
            resulted_topics[label].append(doc.text)

In [16]:
example_doc = textcat_nlp("In the Swiss population, 13% of the children have parents from Southern Europe and the proportion of obesity is 57 and 42% in these boys and girls, respectively.")
example_doc.cats

{'CONCLUSIONS': 2.061153470123145e-09,
 'RESULTS': 0.6469292640686035,
 'METHODS': 0.0021996202412992716,
 'BACKGROUND': 2.061153470123145e-09,
 'OBJECTIVE': 2.061153470123145e-09}

##### Automatic MeSh indexing

Query diseases to their corresponding representation in MeSh Terms, potentially locating their **[C]** - Disease Category.

In [1]:
from query_for_mesh_terms import get_mesh_tree

queries = ['child obesity', 'aids', 'coronary heart disease', 'hiv', 'malaria', 'covid', 'asthma', 'hypertension', 'diabetes']
# queries = ['diabetes mellitus']
for query in queries:
    mesh_terms = get_mesh_tree(query)
    print('\nFor Queried Disease: {}'.format(query))
    print('Potential Corresponding MeSh Tree: {}'.format(mesh_terms))
    print('')


For Queried Disease: child obesity
Potential Corresponding MeSh Tree: ['Nutritional and Metabolic Diseases', 'Nutrition Disorders', 'Overnutrition', 'Overweight']


For Queried Disease: aids
Potential Corresponding MeSh Tree: ['Infections', 'Communicable Diseases', 'Blood-Borne Infections', 'HIV Infections']


For Queried Disease: coronary heart disease
Potential Corresponding MeSh Tree: ['Cardiovascular Diseases', 'Heart Diseases', 'Myocardial Ischemia', 'Coronary Disease']


For Queried Disease: hiv
Potential Corresponding MeSh Tree: []


For Queried Disease: malaria
Potential Corresponding MeSh Tree: ['Infections', 'Parasitic Diseases', 'Protozoan Infections', 'Malaria']


For Queried Disease: covid
Potential Corresponding MeSh Tree: []


For Queried Disease: asthma
Potential Corresponding MeSh Tree: ['Respiratory Tract Diseases', 'Bronchial Diseases', 'Asthma', 'Asthma, Aspirin-Induced']


For Queried Disease: hypertension
Potential Corresponding MeSh Tree: ['Cardiovascular Diseas

In [1]:
from datasets import load_dataset

dataset = load_dataset("eurlex")
dataset

No config specified, defaulting to: eurlex/eurlex57k
Found cached dataset eurlex (C:/Users/Genis/.cache/huggingface/datasets/eurlex/eurlex57k/1.1.0/d2fdeaa4fcb5f41394d2ed0317c8541d7f9be85d2d601b9fa586c8b461bc3a34)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['celex_id', 'title', 'text', 'eurovoc_concepts'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['celex_id', 'title', 'text', 'eurovoc_concepts'],
        num_rows: 6000
    })
    validation: Dataset({
        features: ['celex_id', 'title', 'text', 'eurovoc_concepts'],
        num_rows: 6000
    })
})

In [2]:
train_set = dataset['train']
example_data = train_set[0]
example_data

{'celex_id': '32014R0727',
 'title': "Commission Implementing Regulation (EU) No 727/2014 of 30 June 2014 initiating a ‘new exporter’ review of Council Implementing Regulation (EU) No 1389/2011 imposing a definitive anti-dumping duty on imports of trichloroisocyanuric acid originating in the People's Republic of China, repealing the duty with regard to imports from one exporter in this country and making these imports subject to registration\n",
 'text': "1.7.2014 EN Official Journal of the European Union L 192/42\nCOMMISSION IMPLEMENTING REGULATION (EU) No 727/2014\nof 30 June 2014\ninitiating a ‘new exporter’ review of Council Implementing Regulation (EU) No 1389/2011 imposing a definitive anti-dumping duty on imports of trichloroisocyanuric acid originating in the People's Republic of China, repealing the duty with regard to imports from one exporter in this country and making these imports subject to registration\nTHE EUROPEAN COMMISSION\n,\nHaving regard to the Treaty on the Funct

In [3]:
import pandas as pd

# Read the JSON file into a DataFrame
df = pd.read_json('./data/eurovoc_concepts.jsonl', lines=True)
finetuned_df = pd.read_json('./data/eurovoc_concepts_finetuned.jsonl', lines=True)
converting_dictionary = finetuned_df.set_index('id')['title'].to_dict()

# Print the resulting dictionary
print('Mapping: ', converting_dictionary)

all_titles = {title: 0 for title in df['title']}
all_titles_finetuned = {title: 0 for title in finetuned_df['title']}

print('Eurlex Dataset Label Count - Original: {} - Curated: {}'.format(len(all_titles), len(all_titles_finetuned)))

Mapping:  {'3474': 'international affairs', '3363': 'union representative', '4488': 'data processing', '6303': 'scientific discovery', '3842': 'scientific apparatus', '538': 'rights of the individual', '1909': 'migration', '1600': 'self-defence', '1592': 'anti-trust legislation', '851': 'sociocultural facilities', '1478': 'Workers International', '2577': 'power of implementation', '1641': 'trade licence', '2566': 'postal and telecommunications services', '3586': 'scientific profession', '1760': 'mental illness', '6035': 'urban community', '1172': 'government', '2581': 'power to negotiate', '4536': 'transport under customs control', '6084': 'European Works Council', '5184': 'programmes industry', '5970': 'surgery', '1625': 'freedom of assembly', '7363': 'graphic illustration', '2916': 'applied research', '1941': 'national minority', '7392': 'urban sociology', '582': 'political right', '6401': 'environmental liability', '6744': 'eugenics', '6033': 'rural community', '5938': 'transport of

In [4]:
def adjust_original_concepts(list_of_concepts, mapping_dictionary):
    adjusted_concepts = []
    for concept in list_of_concepts:
        try: 
            adjusted_concepts.append(mapping_dictionary[concept])
        except KeyError:
            continue
    return adjusted_concepts

In [9]:
for i in range(0, 10):
    sample_data = train_set[i]
    concepts = sample_data['eurovoc_concepts']
    print('Original: ', concepts)
    concepts = [converting_dictionary[concept] for concept in concepts if concept in converting_dictionary]
    print('Adjusted: ', concepts)
    print('')

Original:  ['1402', '2771', '3191', '5055', '519', '5969', '5971']
Adjusted:  []

Original:  ['2319', '2713', '2938', '693']
Adjusted:  []

Original:  ['3560', '365', '4256', '4261', '4353', '4585']
Adjusted:  []

Original:  ['1091', '3842', '3874', '4110', '4381', '5287']
Adjusted:  ['scientific apparatus']

Original:  ['1026', '1048', '2300', '3653', '4271', '4390']
Adjusted:  []

Original:  ['2081', '239', '2871', '4860', '5573', '893']
Adjusted:  []

Original:  ['2676', '4472', '6042']
Adjusted:  []

Original:  ['2282', '2437', '2879', '4320', '4790', '5254']
Adjusted:  []

Original:  ['191', '2232', '2415', '3579', '4490', '5100']
Adjusted:  []

Original:  ['1519', '235', '2783', '3892']
Adjusted:  []



In [15]:
from spacy.tokens import DocBin
import spacy

nlp = spacy.blank("en")
    
db = DocBin()
split_name = 'validation'
data = dataset[split_name]

count = 0
for i in range(0, len(data)):
    
    #### Performance Counter    
    if (i % 2500 == 0):
        print("{}/{}".format(i, len(data)))
        print("Selected Sentences: {} \n".format(count))  
   
    ##### Check that the given sentence contains any of our wanted labels
    concepts = [converting_dictionary[concept] for concept in data[i]['eurovoc_concepts'] if concept in converting_dictionary]
    
    if len(concepts) == 0:
        continue
    else:
        doc = nlp(data[i]['text'])
        # print(doc)
        doc.cats = all_titles_finetuned.copy()
        # print('BEFORE: ', doc.cats)
        for concept in concepts:
            doc.cats[concept] = 1
        # print('After: ', doc.cats)
        # print('')
        count += 1

    db.add(doc)
    

db.to_disk('./models/fine_tune/corpus/curated_eurlex_{}.spacy'.format(split_name))    

0/6000
Selected Sentences: 0 

2500/6000
Selected Sentences: 226 

5000/6000
Selected Sentences: 457 

