In [39]:
# load all the corresponding packages
import pandas as pd
from collections import Counter
from gensim.parsing.preprocessing import preprocess_string,strip_tags, strip_punctuation, remove_stopwords
import re
import seaborn as sns

In [92]:
# load the dataframe of your interest
records_dataframes = pd.read_csv("../Data/NLP/IL6.csv")
records_dataframes["class"] = "IL6"
records_dataframes = records_dataframes.iloc[:2500,]
# this can be run to show the effect
# will take long to complete
records_dataframe_2 = pd.read_csv("../Data/NLP/Karl_Deisseroth.csv")
records_dataframe_2["class"] = "Karl"
records_dataframes = pd.concat([records_dataframes, records_dataframe_2], axis = 0)
records_dataframes.shape

(2859, 9)

In [64]:
# To reduce computation costs, slice dataframe to 1000 files instead of 9926
# we could leave this step out

In [93]:
# get overview
records_dataframes.head()

Unnamed: 0.1,Unnamed: 0,ID,abstract,Title,Journal,Publication_date,first,last,class
0,0,36594412,1-Octacosanol (Octa) is reported to possess ma...,Orally administered octacosanol improves liver...,Food & function,2023 Jan 3,"Ding, Yin-Yi","Shen, Qing",IL6
1,1,36594097,Acute kidney injury (AKI) is a pathological co...,Direct targeting of sEH with alisol B alleviat...,International journal of biological sciences,2023,"Zhang, Juan","Ma, Xiao-Chi",IL6
2,2,36594093,Rheumatoid arthritis (RA) is a prototypic infl...,Nesfatin-1 Stimulates CCL2-dependent Monocyte ...,International journal of biological sciences,2023,"Chang, Jun-Way","Tang, Chih-Hsin",IL6
3,3,36594066,OBJECTIVES: To investigate the effect of mogro...,Inhibition of Mogroside IIIE on isoproterenol-...,Iranian journal of basic medical sciences,2023 Jan,"Yanan, Shi","Wei, Liu",IL6
4,4,36594061,"OBJECTIVES: The current study, the first of it...",Thymoquinone played a protective role against ...,Iranian journal of basic medical sciences,2023 Jan,"Demircigil, Nursena","Erdemli, Mehmet Erman",IL6


In [94]:
# this is important to get an overview over missing data and the inferred data types per column
print(records_dataframes.info())
# gets overview over the most frequent data 
# also usually used to retrieve min, max, mean for each column if numeric, int, float
print(records_dataframes.describe())
# exploratory data analysis
# using counter object from the collections package
top_journals = Counter(records_dataframes["Journal"].tolist())
top_journals

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2859 entries, 0 to 358
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        2859 non-null   int64 
 1   ID                2859 non-null   int64 
 2   abstract          2859 non-null   object
 3   Title             2859 non-null   object
 4   Journal           2859 non-null   object
 5   Publication_date  2859 non-null   object
 6   first             2859 non-null   object
 7   last              2859 non-null   object
 8   class             2859 non-null   object
dtypes: int64(2), object(7)
memory usage: 223.4+ KB
None
        Unnamed: 0            ID
count  2859.000000  2.859000e+03
mean     11.063659  3.513587e+07
std       8.413193  3.830235e+06
min       0.000000  1.197086e+07
25%       5.000000  3.633428e+07
50%      10.000000  3.641810e+07
75%      15.500000  3.650944e+07
max      49.000000  3.659441e+07


Counter({'Food & function': 19,
         'International journal of biological sciences': 2,
         'Iranian journal of basic medical sciences': 5,
         'Current research in pharmacology and drug discovery': 1,
         'BioMed research international': 5,
         'Medical science monitor : international medical journal of experimental and clinical research': 2,
         'American journal of reproductive immunology (New York, N.Y. : 1989)': 5,
         'In vivo (Athens, Greece)': 4,
         'Cellular and molecular gastroenterology and hepatology': 1,
         'Journal of ethnopharmacology': 31,
         'Neuroscience letters': 3,
         'Environmental research': 2,
         'Brain research': 2,
         'Journal of animal science': 1,
         'Transplantation and cellular therapy': 2,
         'Neurochemistry international': 1,
         'Journal of biotechnology': 1,
         'Archives of biochemistry and biophysics': 1,
         'Microbial pathogenesis': 10,
         'Europea

In [95]:
# getting directly from the pandas package
top_journals_df = records_dataframes["Journal"].value_counts()
top_journals_df

International journal of molecular sciences                            100
Frontiers in immunology                                                 79
Frontiers in pharmacology                                               54
Scientific reports                                                      50
Nutrients                                                               45
                                                                      ... 
Journal of the College of Physicians and Surgeons--Pakistan : JCPSP      1
American journal of Alzheimer's disease and other dementias              1
Reviews in endocrine & metabolic disorders                               1
Endocrine connections                                                    1
Trends in neurosciences                                                  1
Name: Journal, Length: 993, dtype: int64

In [96]:
# task evaluate it using your own function !
journals_list = records_dataframes["Journal"].tolist()
def counter(record_list, dataframe = False):
    """
    Function to count the occurence of each word
    args:
        record_list type(list) <- list that holds the words to count
        dataframe (bool) <- true if converted into pd.Series
    returns
        dict or series
    """
    counter_journals = {}
    for i in record_list:
        # check the base state
        if i not in counter_journals: 
            # can also used the keys methods here
            counter_journals.update({i:1})
        else:
            counter_journals[i] += 1
            
    if dataframe:
        counter_dataframe = pd.Series(counter_journals).sort_values(ascending = False)
        return counter_dataframe
    return counter_journals

journal_dict = counter(journals_list)
journal_dataframe = counter(journals_list, dataframe = True)
journal_dataframe

International journal of molecular sciences                            100
Frontiers in immunology                                                 79
Frontiers in pharmacology                                               54
Scientific reports                                                      50
Nutrients                                                               45
                                                                      ... 
Journal of the College of Physicians and Surgeons--Pakistan : JCPSP      1
American journal of Alzheimer's disease and other dementias              1
Reviews in endocrine & metabolic disorders                               1
Endocrine connections                                                    1
Trends in neurosciences                                                  1
Length: 993, dtype: int64

In [97]:
# Question: How can we calculate from this the percentage?
frequency_dataframe = pd.DataFrame(journal_dataframe, columns = ["count"])
# Answer:
frequency_dataframe["frequency"] = (frequency_dataframe["count"]/ frequency_dataframe["count"].sum()) * 100

In [98]:
# Repurpose counter function for word frequency?
# thats why writing function/class methods is important
# We reduce workload, error rates and increase efficency
# Let's look at the Table for the abstracts
def records_tolist(records: list, preprocessed = False, stopwords: list = False) -> pd.Series:
    """ 
    Of importance here is that you understand what you are doing 
    Functions are available gettting rid of stopwords 
    Removing Punctuations
    Introducing Lemmtization and more 
    Which is all important for NLP
    
    args:
        records type(list of lists) <- holds the word records
        stopwords(list) <- default None, holds the 
    """
    if preprocessed:
        record_flatten = records
        
    else:
        record_flatten = [i.replace("(", "").replace(")","").split(" ") for i in records.tolist()]
        
    if stopwords:
        # check if is in stopword
        record_flatten = [t.lower() for i in record_flatten for t in i]
        record_flatten = [i for i in record_flatten if i not in stopwords]
        
    else:    
        #not removing stopwords
        record_flatten = [t for i in record_flatten for t in i]
        
    word_series = counter(record_flatten, dataframe = True)
    return word_series

word_series = records_tolist(records_dataframes["abstract"])
word_series.iloc[1:20]

the           28583
of            26763
in            17658
to            10257
with           7871
a              7243
were           7030
was            5592
by             4992
that           4757
The            4693
for            4356
is             3625
as             3233
on             3052
levels         2603
expression     2391
In             2191
patients       2189
dtype: int64

In [99]:
# Self created list of stopwords
# Usually not the way to go, but to gain understanding of how to process the data
stopwords = ["the","of","in","to",
             "with","were","a","was",
             "by","the", "the","that",
             "for","is","as","on","significant",
             "among","although","especially","kg",
             "km","mainly","ml","mm",
             "disease","significantly","obtained","mutation",
             "significant","quite","result","results","estimated",
             "interesting","conducted","associated","performed",
             "respectively","larger","genes","gene", "mutations",
             "related","expression","pattern","mutation","clc","identified",
             "suprisingly","preferentially","subsequently","far","little",
             "known","importantly","synonymous","skipping","father",
             "mother","pedigree","novo","rescues","rescued","restored",
             "exhibits","induce", "Background","Objective","Methods",
             "cells", "kinase","activation","protein",
             "be","at", "we", "p","from","or","after","treatment",
             "=", "are",".", "an"
            ]
word_series_no_stop = records_tolist(records_dataframes["abstract"], stopwords = stopwords)
word_series_no_stop.iloc[1:20]

levels          2630
this            2621
patients        2346
inflammatory    2231
il-6            2057
group           1735
study           1730
increased       1555
il-6,           1550
cell            1501
these           1405
which           1377
effects         1271
mice            1252
showed          1232
effect          1213
compared        1154
serum           1116
used            1103
dtype: int64

In [100]:
#Check with preprocessing
def preprocessing_list(liste,stopword_list):
    """ 
    preprocess the abstract list, remove stopwords, punctuations, numbers
    input: 
    liste: list of abstracts
    stopword_list: list of stopwords
    returns list of list <- holding the abstracts
    """
    processed_abstracts = []
    for i in liste:
        CUSTOM_FILTERS = [lambda x: x.lower(),remove_stopwords]
        a = preprocess_string(i, CUSTOM_FILTERS)
        no_integers = [x for x in a if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())]
        no_integers = [x for x in no_integers if x not in stopword_list]
        no_integers = [re.sub("[^A-Za-z0-9|-]","",x) for x in no_integers]
        processed_abstracts.append(no_integers)
    return processed_abstracts

# add the processed abstracts
records_dataframes["processed_abstract"] = preprocessing_list(records_dataframes["abstract"].tolist(),stopwords)
# and now count the preprocessed ones rather than the original abstracts
word_series_prep = records_tolist(records_dataframes["processed_abstract"], True, stopwords = stopwords)
word_series_prep[:20]

il-6            3770
levels          2992
patients        2955
group           2677
study           2398
inflammatory    2246
mice            1950
increased       1655
                1648
tnf-alpha       1602
cell            1515
effects         1497
inflammation    1476
cytokines       1390
effect          1328
showed          1233
activity        1231
serum           1187
compared        1177
il-1beta        1168
dtype: int64

In [101]:
# what would be the next step here?
# we have a set of authors
# lets check who is the most famous author in the field?
# if you want to play further with ith 
# read about Doc2Vec/Word2Vec/Sentence2Vec # this is way more specific than tf-idf
# read about Bert Transformers
# an lower level example
# check the umap literature

from sklearn.feature_extraction.text import TfidfVectorizer # let student search what might be the input here

records_dataframes["joined_abstracts"] = [ " ".join(i) for i in records_dataframes["processed_abstract"]]
vectorizer = TfidfVectorizer(min_df=5,stop_words='english')
X = vectorizer.fit_transform(records_dataframes["joined_abstracts"])
df_tf_idf = pd.DataFrame(
    data=X.todense().round(2),
    columns=vectorizer.get_feature_names_out()
)

df_tf_idf.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2849,2850,2851,2852,2853,2854,2855,2856,2857,2858
0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zeta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zinc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zone,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [106]:
# lets see if we can deduce something from this list
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from umap.umap_ import UMAP

# we could try to run here 
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
data_svd = svd.fit_transform(df_tf_idf)
data_svd = pd.DataFrame(data_svd, index = records_dataframes.index, columns = ["SV1", "SV2"])
reducer = UMAP(metric='hellinger')
data_umap = reducer.fit_transform(df_tf_idf)
data_umap = pd.DataFrame(data_umap, index = records_dataframes.index, columns = ["UMAP1", "UMAP2"])

#set the class of the data according to the literature
data_umap["class"] = records_dataframes["class"]
data_svd["class"] = records_dataframes["class"]

# retrieve the clusters here
fig, ax = plt.subplots(ncols = 2, nrows = 1, figsize = (10,4))
sns.despine() # removes upper and left axis
sns.scatterplot(data = data_svd, 
                x = "SV1",
                y= "SV2",
                hue = "class",
                ax = ax[0],
                palette = "tab20",
                linewidth = 0,
                alpha = 0.5)

sns.scatterplot(data = data_umap,
                x = "UMAP1",
                y = "UMAP2",
                hue = "class",
                ax = ax[1],
                palette = "tab20",
                linewidth = 0,
                alpha = 0.5)

plt.tight_layout()