In [None]:
import re
import string
import nltk

import pandas as pd
import numpy as np

from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
import gensim

#File import

In [None]:
#the files s2013 and s2020 are available here: https://drive.google.com/drive/folders/1wkQN4_byG8uVf6AduA8nsC3QHN5ZOXH6?usp=sharing
filepath = './c2013.csv'
c2013 = pd.read_csv(filepath,engine='python')
filepath2 = './c2020.csv'
c2020 = pd.read_csv(filepath2,engine='python')
filepath3 = './s2013.csv'
s2013 = pd.read_csv(filepath3,engine='python')
filepath4 = './s2020.csv'
s2020 = pd.read_csv(filepath4,engine='python',error_bad_lines=False)

#Data Cleaning and Extraction




In [None]:
nltk.download('punkt')
nltk.download('stopwords')

STOP_WORDS = stopwords.words()

def cleaning(text):
    """
    Convert to lowercase.
    Rremove URL links, special characters and punctuation.
    Tokenize and remove stop words.
    """
    text = str(text).lower()
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('[’“”…]', '', text)

    # removing the stop-words
    text_tokens = word_tokenize(text)
    tokens_without_sw = [
        word for word in text_tokens if not word in STOP_WORDS]
    filtered_sentence = (" ").join(tokens_without_sw)
    text = filtered_sentence

    return text
    
# df = df['TITLE'].apply(cleaning)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# counting unigrams and bigrams
def count_unigrams(df):
  word_count = Counter(" ".join(df).split()).most_common(15)
  word_frequency = pd.DataFrame(word_count, columns = ['Word', 'Frequency'])
  return word_frequency

def count_bigrams(df):
  words = nltk.tokenize.word_tokenize(df.to_string())
  bigram = list(nltk.bigrams(words))
  frequency = {}
  for item in bigram:
    if item in frequency:
        frequency[item] += 1
    else:
        frequency[item] = 1
  word_dist = nltk.FreqDist([' '.join(x) for x in bigram])
  bigram_frequency = pd.DataFrame(word_dist.most_common(15),columns=['Word', 'Frequency'])
  return bigram_frequency


In [None]:
pd.options.display.max_colwidth = 120
#choose the specific function, file, and column name to count unigrams/bigrams within your desired dataset
#E.g: here, we count the most frequency bigrams appearing in the abstratcs of articles cited in 2013
count_bigrams(c2013['ABSTRACT'].apply(cleaning))

Unnamed: 0,Word,Frequency
0,ozone o3,260
1,air pollution,229
2,ambient air,79
3,tropospheric ozone,78
4,air quality,72
5,air pollutants,68
6,exposure ozone,60
7,ozone exposure,52
8,air pollutant,49
9,health effects,43


#Data Exploration

In [None]:
# topic modeling
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    # convert all words into lower case, split by white space
    tokens = str(text).strip().lower().split()
    
    #  remove words with 1 or 2 letters (small words, punctuation)
    tokens = [token for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in en_stop]
    return tokens
    

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# take out title texts in a list
c2013_titles = []
for index, row in c2013.iterrows():
    title_text = c2013['ABSTRACT']
    c2013_titles.append( title_text )
    
print(c2013_titles[0])

s2013_titles = []
for index, row in s2013.iterrows():
    title_text = row['TITLE']
    s2013_titles.append( title_text )
    
print(s2013_titles[0])

c2020_titles = []
for index, row in c2020.iterrows():
    title_text = row['TITLE']
    c2020_titles.append( title_text )
    
print(c2020_titles[0])

s2020_titles = []
for index, row in s2020.iterrows():
    title_text = row['TITLE']
    s2020_titles.append( title_text )
    
print(s2020_titles[0])

0       Medical Research Council (UK); Wellcome Trust.Water soluble antioxidant--ascorbate (AA), urate (UA), and reduced glu...
1       U.S. Environmental Protection Agency; California Air Resources Board.  #We conducted a prospective study of a cohort...
2       EPA's National Exposure Research Laboratory (NERL) has combined data from 12 U.S. studies related to human activitie...
3       Astra Draco AB [Lund, Sweden]. #Inhalation of ozone in normal subjects causes a neutrophilic inflammatory response i...
4       Exposure to ambient ozone (O3) is associated with increased exacerbations of asthma. We sought to determine whether ...
                                                                 ...                                                           
2231    Improving air quality by reducing ambient ozone (O(3)) will likely lower O(3) concentrations throughout the troposph...
2232                                                                                                    

In [None]:
# take out abstract texts in a list
c2013_abstracts = []
for index, row in c2013.iterrows():
    abstract_text = row['ABSTRACT']
    c2013_abstracts.append( abstract_text )

print (c2013_abstracts[0])

s2013_abstracts = []
for index, row in s2013.iterrows():
    abstract_text = row['ABSTRACT']
    s2013_abstracts.append( abstract_text )

print (s2013_abstracts[0])

c2020_abstracts = []
for index, row in c2020.iterrows():
    abstract_text = row['ABSTRACT']
    c2020_abstracts.append( abstract_text )

print (c2020_abstracts[0])

s2020_abstracts = []
for index, row in s2020.iterrows():
    abstract_text = row['ABSTRACT']
    s2020_abstracts.append( abstract_text )

print (s2020_abstracts[0])

Medical Research Council (UK); Wellcome Trust.Water soluble antioxidant--ascorbate (AA), urate (UA), and reduced glutathione (GSH)--consumption by ozone (O3) was investigated in a range of pulmonary epithelial lining fluid (ELF) models. Antioxidants were exposed individually and as a composite mixture, with and without human albumin to a range of ambient O3 concentrations: 0-1500 ppb using a continually mixed, interfacial exposure setup. We observed the following: (1) UA constituted the most o3-reactive substrate in each of the models examined. Reactivity hierarchies  in each were as follows: UA > AA >> GSH (individual antioxidant), UA > AA > GSH (composite antioxidant), and UA >> AA approximately equal to GSH (composite antioxidant + albumin). Consumption of  GSH as a pure antioxidant solution was associated with a 2:1 stoichiometric conversion of GSH to GSSG. This simplistic relationship was lost in the more complex models. (3) Consumption of antioxidants by O3 occurred without alter

In [None]:
#Use desired lists from above for topic modeling
#E.g: topic modeling on the abstracts of articles cited in 2020 
c2020_text_abstract = []
for abstract in c2020_abstracts:
    abstract = prepare_text_for_lda(abstract)
    c2020_text_abstract.append(abstract)
    
c2020_abstract_dictionary = corpora.Dictionary(c2020_text_abstract)
c2020_abstract_corpus = [c2020_abstract_dictionary.doc2bow(text) for text in c2020_text_abstract]

In [None]:
# train latent Dirichlet topic model
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(c2020_abstract_corpus, num_topics = NUM_TOPICS, id2word=c2020_abstract_dictionary, passes=15)

topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.007*"effects" + 0.003*"higher" + 0.003*"/><br" + 0.003*"exposure" + 0.003*"estimated" + 0.003*"obese"')
(1, '0.024*"ozone" + 0.017*"model" + 0.011*"emissions" + 0.009*"air" + 0.007*"quality" + 0.006*"o-3"')
(2, '0.024*"nan" + 0.007*"o-3" + 0.004*"ventilation" + 0.003*"used" + 0.003*"ozone" + 0.003*"filtration"')
(3, '0.018*"ozone" + 0.012*"lung" + 0.007*"mice" + 0.007*"exposure" + 0.006*"mice." + 0.006*"pulmonary"')
(4, '0.022*"air" + 0.015*"/><br" + 0.014*"pollution" + 0.014*"associated" + 0.011*"risk" + 0.010*"increase"')
(5, '0.015*"ozone" + 0.005*"rats" + 0.004*"exposure" + 0.004*"increased" + 0.004*"levels" + 0.003*"concentrations"')
(6, '0.007*"airway" + 0.007*"model" + 0.006*"ozone" + 0.005*"data" + 0.005*"exposure" + 0.004*"response"')
(7, '0.013*"air" + 0.008*"exposure" + 0.006*"data" + 0.006*"pollution" + 0.004*"high" + 0.004*"concentrations"')
(8, '0.025*"elevated" + 0.019*"co2" + 0.010*"effects" + 0.009*"soil" + 0.009*"o-3" + 0.009*"plant"')
(9, '0.013*"exposure" + 0