In [1]:
import re 
import metaknowledge as mk
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import nltk as nltk
import scipy.sparse as sparse
from wordcloud import WordCloud
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
from nltk.stem import WordNetLemmatizer
from numpy import array
from numpy import count_nonzero
import csv

<h3>1. First Step: getting the files with the data.</h3><p>Following the imports for the project. We have to download the data from  <a href="https://pubmed.ncbi.nlm.nih.gov/" target="_blank">Pubmed</a>. We make a request with the term 'EEG' and filter on one year. We download the file with 'Send to' 'Citation manager' for each year. We shall have (as for the 5th of April 2021):<p>
<ul><li>2018 we shall have 8,097 notices.</li><li>For 2019 should be 8,160 notices</li><li>2020 we shall have 7661 notices</li></ul></br>
<p>These notices shall be in a .nbib format.</p>

<h3>2. Getting each file in a record collection</h3><p>Using metaknowledge Python library. We ought load the files in a record collection. One collection for each year. We want to compare each year in a TF-IDF matrix to do so each year would be a document therefor each year would be a row in the matrix.<p> 

In [2]:
RC2020 = mk.RecordCollection("pubmed-eeg-set2020.nbib")

In [3]:
RC2019 = mk.RecordCollection("pubmed-eeg-set2019.nbib")

In [4]:
RC2018 = mk.RecordCollection("pubmed-eeg-set2018.nbib")

<h3>3. Extract MeSH Terms</h3>
<p>We know that Pubmed use abbreviation to identify data field (<a href="https://www.nlm.nih.gov/bsd/mms/medlineelements.html#:~:text=MEDLINE%20%C2%AE%20%2FPubMed%20%C2%AE%20Data%20Element%20%28Field%29%20Descriptions,%20%20%28LA%29%20%2018%20more%20rows%20" target="_blank">MEDLINE®/PubMed® Data Element (Field) Descriptions</a>). We have seen in the first test that these abbreviations are use as column's title in the .nbib files. So in order not to make this Notebook file loud we will skip the conversion of our records to dataframes only to have a look at the data. It has been done in previous phase of the project.</p><p>We would create a variable for the MeSH column as a list. The column containing the MeSH terms have an MH abbreviation.</p>

In [5]:
selectionMesh = ['MH']

<h3>3.1 Extract MeSH Terms as Dataframe</h3>
<p>Next we would create a dataframe in the form of Python dictionnary. We will use an index as keys and the MeSH terms as values.</p>

In [6]:
noticesMesh2020 = pd.DataFrame(RC2020.makeDict(onlyTheseTags=selectionMesh, genderCounts=False, numAuthors=False))

In [7]:
noticesMesh2019 = pd.DataFrame(RC2019.makeDict(onlyTheseTags = selectionMesh, genderCounts = False, numAuthors = False))

In [8]:
noticesMesh2018 = pd.DataFrame(RC2018.makeDict(onlyTheseTags = selectionMesh, genderCounts = False, numAuthors = False))

In [9]:
noticesMesh2018

Unnamed: 0,MH
0,"[Adult, Brain Waves/physiology, Chorea/*etiolo..."
1,"[Aged, Brain/physiopathology, Electroencephalo..."
2,"[Adult, Anticipation, Psychological/*physiolog..."
3,"[Brain/*pathology, Canada, *Electroencephalogr..."
4,"[*Body Image, Brain Mapping, Cerebral Cortex/d..."
...,...
8092,"[Adult, Anticipation, Psychological/*physiolog..."
8093,"[Adolescent, Brain/diagnostic imaging/*physiop..."
8094,"[Algorithms, Alzheimer Disease/*diagnosis, Bay..."
8095,


<h3>4. Delete empty rows</h3>
<p>We use dropna() method on each dataframe. Notices with no MeSH terms won't be useful.</p>

In [10]:
noticesMesh2018 = noticesMesh2018.dropna()

In [11]:
noticesMesh2019 = noticesMesh2019.dropna()

In [12]:
noticesMesh2020 = noticesMesh2020.dropna()

<h3>5. Clean the Data.</h3>
<p>We tried many approaches. We don't claim that the one we selected is the best. We selected this one because we had more accurate results than the other approaches that we tried in a format that we were able to convert in a TF-IDF matrix. We shall create csv files for each year and load these csv files to join our data into a Python string.</p>

In [13]:
noticesMesh2018.to_csv("meshDrop2018.csv", index=False, header=False)

In [14]:
noticesMesh2019.to_csv("meshDrop2019.csv", index=False, header=False)

In [15]:
noticesMesh2020.to_csv("meshDrop2020.csv", index=False, header=False)

In [16]:
mesh2018 = []
with open('meshDrop2018.csv', 'r')as f:
    reader = csv.reader(f)
    mesh2018 = ' '.join([i[0] for i in reader])

In [17]:
mesh2019 = []
with open('meshDrop2019.csv', 'r')as f:
    reader = csv.reader(f)
    mesh2019 = ' '.join([i[0] for i in reader])

In [18]:
mesh2020 = []
with open('meshDrop2020.csv', 'r')as f:
    reader = csv.reader(f)
    mesh2020 = ' '.join([i[0] for i in reader])

<h3>5.1 Clean the Data : the Replacement</h3>
<p>Some characters are not needed such as '*' and '\'. We'll replace them with blank '' using replace method.</p>

In [19]:
mesh2018 = mesh2018.replace('*','')

In [20]:
mesh2018 = mesh2018.replace("\'","")

In [21]:
mesh2019 = mesh2019.replace('*','')

In [22]:
mesh2019 = mesh2019.replace("\'","")

In [23]:
mesh2020 = mesh2020.replace('*','')

In [24]:
mesh2020 = mesh2020.replace("\'","")

<h3>5.3 Clean the Data: Stop Words</h3>
<p>We shall remove common generic words from our MeSH terms data. We will the standard stop_words for English in the NLTK library. We will add to this list MeSH terms concerning 'subject' or 'participants' of articles rather than topics. MeSH terms such as Humans, Adult, Students, etc. Since we're looking for fields in witch EEG method is used. The MeSH terms representing the 'subject' or 'participants' of a study or experiment are not of interest for our question. </p><p>There are animals MeSH terms as well. For human related participant in MeSH terms. The whole list of terms appear in the <a href="https://meshb.nlm.nih.gov/treeView" target="_blank">MeSH Browser thesaurus</a> under the MeSH term 'person'. MeSH terms related to animals are not grouped in one place. We may take a few into account like 'mice' and 'rat' but we don't know if it's complete or not.</p>
<p>On a similar concern: there are MeSH terms representing products, drugs or chemicals and the like such as GABA here and there. Idealy they should be remove from our datasets. Here we are limited by our own knowledge of the EEG method. We don't know these products.</p>

In [25]:
listMeshPart = ['Adult', 'Child', 'Men', 'Women', 'Students', 'Humans', 'Male', 'Female', 'Athletes', 'Adolescent', 'Infant', 'Aged', 'Middle Aged', 'Young Adult', 'Animals', 'Mice', 'parent', 'Young', 'Rat', 'Electroencephalography','Electroencephalography/methods', 'Method', 'imaging', 'potential','diagnosis','diagnostic','Evoked', 'drug']


In [26]:
stop_words = set(stopwords.words('english'))
MeshSub2018 = stop_words.union(listMeshPart)
word_tokens = word_tokenize(mesh2018)

filtSent2018 = [] 

for w in word_tokens:  
    if w not in MeshSub2018:  
        filtSent2018.append(w)

In [27]:
stop_words = set(stopwords.words('english'))
MeshSub2019 = stop_words.union(listMeshPart)
word_tokens = word_tokenize(mesh2019)

filtSent2019 = [] 

for w in word_tokens:  
    if w not in MeshSub2019:  
        filtSent2019.append(w)

In [28]:
stop_words = set(stopwords.words('english'))
MeshSub2020 = stop_words.union(listMeshPart)
word_tokens = word_tokenize(mesh2020)

filtSent2020 = [] 

for w in word_tokens:  
    if w not in MeshSub2020:  
        filtSent2020.append(w)

In [29]:
MeshSub2018

{'Adolescent',
 'Adult',
 'Aged',
 'Animals',
 'Athletes',
 'Child',
 'Electroencephalography',
 'Electroencephalography/methods',
 'Evoked',
 'Female',
 'Humans',
 'Infant',
 'Male',
 'Men',
 'Method',
 'Mice',
 'Middle Aged',
 'Rat',
 'Students',
 'Women',
 'Young',
 'Young Adult',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'diagnosis',
 'diagnostic',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'drug',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'imaging',
 'in',
 'into',
 'is',
 'isn',
 "i

In [30]:
filtSent2018

['[',
 ',',
 'Brain',
 'Waves/physiology',
 ',',
 'Chorea/etiology',
 ',',
 ',',
 'Epilepsy/complications',
 ',',
 ',',
 ']',
 '[',
 ',',
 'Brain/physiopathology',
 ',',
 ',',
 'Encephalitis/complications/drug',
 'therapy/immunology',
 ',',
 'Glucocorticoids/therapeutic',
 'use',
 ',',
 ',',
 'Intracellular',
 'Signaling',
 'Peptides',
 'Proteins',
 ',',
 ',',
 'Methylprednisolone/therapeutic',
 'use',
 ',',
 'Proteins/immunology',
 ',',
 'Seizures/complications/drug',
 'therapy/immunology',
 ']',
 '[',
 ',',
 'Anticipation',
 ',',
 'Psychological/physiology',
 ',',
 'Auditory',
 'Perception/physiology',
 ',',
 'Cues',
 ',',
 ',',
 'Potentials/physiology',
 ',',
 'Feedback',
 ',',
 'Psychological/physiology',
 ',',
 ',',
 ',',
 ',',
 'Psychomotor',
 'Performance/physiology',
 ',',
 'Reward',
 ',',
 'Visual',
 'Perception/physiology',
 ',',
 ']',
 '[',
 'Brain/pathology',
 ',',
 'Canada',
 ',',
 ',',
 ',',
 ',',
 'Hypothermia',
 ',',
 'Induced/methods',
 ',',
 'Hypoxia-Ischemia',
 ',',


<h3>5.4 Clean the Data: Everything Else</h3>
<p>Trouble with dataset that large is we cannot see the whole picture for sure. We will pass throught a cleaning loop including a few elements. We're not certain that all these steps are useful. On more than 10 000 MeSH terms there might be some extra spaces, there might be some single charactere completly meaningless here and there.</p><p>There are some single numbers like '15'. Again we are limited by our own knowledge.</p><p>We'll use lemmatization in the same perspective. We cannot be certain of the usefulness of the feature. But since we're cleaning the data and if it is possible to lemmatize, in general it would be better if our datasets are lemmatized. This is a little uncertain concerning keywords. Again there's a degre of ambiguity. Above all they are 'in case' instructions. </p>

In [31]:
docMesh2018 = []

stemmer = WordNetLemmatizer()

for sen in range(0, len(filtSent2018)):
    # Remaining special characters
    doc1 = re.sub(r'\W', ' ', str(filtSent2018[sen]))
    # single characters
    doc1 = re.sub(r'\s+[a-zA-Z]\s+', ' ', doc1)
    # multiple spaces to single space
    doc1 = re.sub(r'\s+', ' ', doc1, flags=re.I)
    # Lowercase
    doc1 = doc1.lower()
    # Lemmatization
    doc1 = doc1.split()
     
    doc1 = [stemmer.lemmatize(word) for word in doc1]
    doc1 = ' '.join(doc1)
    
    docMesh2018.append(doc1)
    docMesh2018 = [i for i in docMesh2018 if i]

In [32]:
docMesh2019 = []

stemmer = WordNetLemmatizer()

for sen in range(0, len(filtSent2019)):
    # Remaining special characters
    doc2 = re.sub(r'\W', ' ', str(filtSent2019[sen]))
    # single characters
    doc2 = re.sub(r'\s+[a-zA-Z]\s+', ' ', doc2)
    # multiple spaces to single space
    doc2 = re.sub(r'\s+', ' ', doc2, flags=re.I)
    # Lowercase
    doc2 = doc2.lower()
    # Lemmatization
    doc2 = doc2.split()
     
    doc2 = [stemmer.lemmatize(word) for word in doc2]
    doc2 = ' '.join(doc2)
    
    docMesh2019.append(doc2)
    docMesh2019 = [i for i in docMesh2019 if i]

In [33]:
docMesh2020 = []

stemmer = WordNetLemmatizer()

for sen in range(0, len(filtSent2020)):
    # Remaining special characters
    doc3 = re.sub(r'\W', ' ', str(filtSent2020[sen]))
    # single characters
    doc3 = re.sub(r'\s+[a-zA-Z]\s+', ' ', doc3)
    # multiple spaces to single space
    doc3 = re.sub(r'\s+', ' ', doc3, flags=re.I)
    # Lowercase
    doc3 = doc3.lower()
    # Lemmatization
    doc3 = doc3.split()
     
    doc3 = [stemmer.lemmatize(word) for word in doc3]
    doc3 = ' '.join(doc3)
    
    docMesh2020.append(doc3)
    docMesh2020 = [i for i in docMesh2020 if i]

In [34]:
docMesh2020

['brain physiology',
 'brain computer',
 'interface',
 'covid 19 psychology',
 'mental',
 'process physiology',
 'occupational',
 'health statistic',
 'numerical',
 'data',
 'ocean',
 'sea',
 'pattern',
 'recognition',
 'automated method',
 'brain physiology',
 'chronic',
 'pain',
 'magnetic',
 'resonance',
 'imaging',
 'pain',
 'management',
 'pain',
 'perception',
 'professional patient',
 'relation',
 'cerebral',
 'cortex diagnostic',
 'imaging physiology',
 'electroencephalography standard',
 'functional',
 'neuroimaging standard',
 'magnetoencephalography standard',
 'nerve',
 'net diagnostic',
 'imaging physiology',
 'sensitivity',
 'specificity',
 '80',
 'brain diagnostic',
 'imaging pathology',
 'image',
 'interpretation',
 'computer assisted method',
 'magnetic',
 'resonance',
 'imaging method',
 'middle',
 'neuroimaging method',
 'seizure diagnostic',
 'imaging pathology',
 'potential physiology',
 'feedback',
 'sensory physiology',
 'pitch',
 'perception physiology',
 'singi

In [35]:
docMesh2019

['algorithm',
 'attention',
 'auditory',
 'perception',
 'electroencephalography instrumentation',
 'hearing',
 'aid',
 'magnetoencephalography instrumentation',
 'speech',
 'perception',
 'brain physiology',
 'computer assisted',
 'instruction method',
 'forecasting',
 'learning physiology',
 'motivation physiology',
 'photic',
 'stimulation method',
 'anticonvulsant',
 'preschool',
 'follow up',
 'study',
 'methylprednisolone therapeutic',
 'use',
 'retrospective',
 'study',
 'sleep',
 'status',
 'epilepticus drug',
 'therapy',
 'association',
 'learning physiology',
 'attention physiology',
 'cue',
 'eye',
 'movement',
 'measurement',
 'pattern',
 'recognition',
 'visual physiology',
 'psychomotor',
 'performance physiology',
 'uncertainty',
 'action',
 'potential',
 'brain physiology',
 'brain',
 'wave',
 'cortical',
 'synchronization',
 'microelectrodes',
 'periodicity',
 'rat',
 'wistar',
 'tissue',
 'culture',
 'technique method',
 'azepines pharmacology',
 'cataplexy drug',
 't

In [36]:
docMesh2018

['brain',
 'wave physiology',
 'chorea etiology',
 'epilepsy complication',
 'brain physiopathology',
 'encephalitis complication drug',
 'therapy immunology',
 'glucocorticoid therapeutic',
 'use',
 'intracellular',
 'signaling',
 'peptide',
 'protein',
 'methylprednisolone therapeutic',
 'use',
 'protein immunology',
 'seizure complication drug',
 'therapy immunology',
 'anticipation',
 'psychological physiology',
 'auditory',
 'perception physiology',
 'cue',
 'potential physiology',
 'feedback',
 'psychological physiology',
 'psychomotor',
 'performance physiology',
 'reward',
 'visual',
 'perception physiology',
 'brain pathology',
 'canada',
 'hypothermia',
 'induced method',
 'hypoxia ischemia',
 'brain diagnosis',
 'newborn',
 'magnetic',
 'resonance',
 'imaging',
 'neurologic',
 'examination',
 'predictive',
 'value',
 'test',
 'prospective',
 'study',
 'seizure diagnosis etiology',
 'severity',
 'illness',
 'index',
 'body',
 'image',
 'brain',
 'mapping',
 'cerebral',
 'cort

In [37]:
strlist2018 = str(docMesh2018)
strlist2019 = str(docMesh2019)
strlist2020 = str(docMesh2020)
docMesh2018 = [strlist2018]
docMesh2019 = [strlist2019]
docMesh2020 = [strlist2020]

In [38]:
for i in docMesh2018:
 docMesh2020.append(i)

In [39]:
for i in docMesh2019:
 docMesh2020.append(i)

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(lowercase = False)
sparseMesh = tfidf.fit_transform(docMesh2020)

print("Tokens used as features are : ")
print(tfidf.get_feature_names())
tokens = tfidf.get_feature_names()
print("How many columns : ", len(tokens))

print("\n Size of array. Each row represents a document. Each column represents a feature/token")
print(sparseMesh.shape)

print("\n Actual TF-IDF array")
text1 = sparseMesh.toarray()
print(text1)

Tokens used as features are : 
['10', '11', '12', '129', '13', '14', '15', '15th', '16', '16th', '17th', '18th', '19', '19th', '1beta', '20', '2001', '2003', '2011', '20th', '21', '21st', '22', '22q11', '25', '26', '3a', '3t3', '43', '450', '50', '80', 'a1', 'a2', 'a2a', 'abattoir', 'abdomen', 'abdominal', 'abducens', 'aberration', 'ablation', 'abnormal', 'abnormality', 'abrin', 'abscess', 'absence', 'absorbable', 'absorption', 'abstinence', 'abuse', 'academic', 'academy', 'acceleration', 'accelerometry', 'acceptance', 'acceptor', 'access', 'accessibility', 'accident', 'accidental', 'acclimatization', 'accommodation', 'accumbens', 'accuracy', 'acepromazine', 'acetaldehyde', 'acetamide', 'acetaminophen', 'acetate', 'acetazolamide', 'acetophenones', 'acetylation', 'acetylcholine', 'acetylcholinesterase', 'acetylcysteine', 'acetylglucosaminidase', 'acetyltransferase', 'acetyltransferases', 'achievement', 'achilles', 'aci', 'acid', 'acidemia', 'acidic', 'acidosis', 'aciduria', 'acoustic', 

In [41]:
resultat = pd.DataFrame(sparseMesh[0].T.todense(), index = tokens, columns=["TF-IDF"])
resultat = resultat.loc[(resultat != 0).all(axis=1), :]
resultat = resultat.sort_values('TF-IDF', ascending=False)
print (resultat.head(10))

                   TF-IDF
physiology       0.710576
physiopathology  0.327115
brain            0.234351
method           0.188431
imaging          0.150032
diagnosis        0.149505
drug             0.145414
diagnostic       0.118099
therapy          0.105564
potential        0.102529


In [42]:
resultat = pd.DataFrame(sparseMesh[2].T.todense(), index = tokens, columns=["TF-IDF"])
resultat = resultat.loc[(resultat != 0).all(axis=1), :]
resultat = resultat.sort_values('TF-IDF', ascending=False)
print (resultat.head(10))

                   TF-IDF
physiology       0.777444
physiopathology  0.304373
brain            0.211632
method           0.166022
drug             0.119378
diagnosis        0.118465
imaging          0.117918
potential        0.110255
effect           0.089153
middle           0.088484


In [43]:
resultat = pd.DataFrame(sparseMesh[1].T.todense(), index = tokens, columns=["TF-IDF"])
resultat = resultat.loc[(resultat != 0).all(axis=1), :]
resultat = resultat.sort_values('TF-IDF', ascending=False)
print (resultat.head(10))

                   TF-IDF
physiology       0.771337
physiopathology  0.292214
brain            0.228200
method           0.163495
imaging          0.132757
drug             0.123818
potential        0.116205
diagnosis        0.113091
diagnostic       0.099250
effect           0.093945
