In [1]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input/cord-19-eda-parse-json-and-generate-clean-csv')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [2]:
from fastai.text import * 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from langdetect import detect

In [None]:
!pip install googletrans

In [None]:
from googletrans import Translator

References:

* https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv
* https://www.kaggle.com/danielwolffram/topic-modeling-finding-related-articles



First, we load the data from each of the CSV files.


In [None]:
# Directory holding the cleaned CSV files; the trailing slash matters because
# file names are appended with plain string concatenation below.
path = '/kaggle/input/cord-19-eda-parse-json-and-generate-clean-csv/'

In [None]:
# Read the four cleaned CSV exports (same order as before) into one frame each.
pmc_custom_license_df, noncomm_use_df, comm_use_df, biorxiv = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        "clean_pmc.csv",
        "clean_noncomm_use.csv",
        "clean_comm_use.csv",
        "biorxiv_clean.csv",
    )
)

In [None]:
# Stack the four sources row-wise.
# Each CSV was read with its own default 0..n-1 index, so without
# ignore_index=True the combined frame would contain repeated labels and any
# later `data.loc[label, ...]` lookup/assignment could hit several rows at once.
data = pd.concat(
    [pmc_custom_license_df, noncomm_use_df, comm_use_df, biorxiv],
    axis=0,
    ignore_index=True,
)

We keep only the papers whose text mentions COVID-19 (or one of its synonyms), and only the paper ID and text columns.

In [None]:
# Keep only papers whose text mentions COVID-19 or one of its synonyms, and only
# the two columns needed downstream.
# NOTE(review): matching is case-sensitive, so '2019-nCov' does not match the
# spelling '2019-nCoV' — confirm whether that is intended.
covid_mask = data['text'].str.contains(
    'COVID-19|SARS-CoV-2|2019-nCov|SARS Coronavirus 2|2019 Novel Coronavirus',
    na=False,  # rows with missing text count as "no match" instead of raising
)
# .loc + .copy() produces an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
covid_papers = data.loc[covid_mask, ['paper_id', 'text']].copy()

In [None]:
# From here on `data` refers to the COVID-only subset (same object as
# `covid_papers`); the full concatenated frame is no longer reachable by name.
data = covid_papers

To detect the language of each paper, we use the `detect` function from the `langdetect` library.

In [None]:
from langdetect import DetectorFactory

# langdetect is non-deterministic by default; fixing the seed before the first
# detection makes the language labels reproducible across runs.
DetectorFactory.seed = 0

# One language code per paper, stored in a new 'language' column.
data['language'] = data['text'].map(detect)

In [None]:
# Preview the first rows, now including the 'language' column.
data.head()

In [None]:
# (rows, columns) of the COVID-only subset.
data.shape

In [None]:
# Number of papers per detected language code.
data['language'].value_counts()

Four papers are written in Spanish.

In [None]:
# Index labels of the rows whose detected language is Spanish ('es').
is_spanish = data['language'] == 'es'
indices_to_translate = data.index[is_spanish]

In [None]:
# googletrans client used by the translation loop below.
translator = Translator()

In [None]:
# Largest piece of text sent to the translator in one request (the size the
# original cell truncated each paper to).
MAX_TRANSLATE_CHARS = 3900


def _split_for_translation(text, max_chars=MAX_TRANSLATE_CHARS):
    """Split `text` into pieces of at most `max_chars` characters.

    Pieces are cut at the last space that fits so words are not broken; if a
    stretch of `max_chars` characters contains no usable space, it is cut hard
    at `max_chars`.

    Args:
        text: The string to split.
        max_chars: Maximum length of each returned piece.

    Returns:
        A list of non-empty strings, in order; empty for an empty `text`.
    """
    chunks = []
    remaining = text
    while len(remaining) > max_chars:
        cut = remaining.rfind(' ', 0, max_chars + 1)
        if cut <= 0:
            cut = max_chars  # no usable space: fall back to a hard cut
        chunks.append(remaining[:cut])
        remaining = remaining[cut:].lstrip()
    if remaining:
        chunks.append(remaining)
    return chunks


# Translate every Spanish paper in full. Previously only the first 3900
# characters were kept and the rest of the paper was silently discarded.
for i in indices_to_translate:
    pieces = _split_for_translation(data.loc[i, 'text'])
    translated_pieces = [translator.translate(piece).text for piece in pieces]
    data.loc[i, 'text'] = ' '.join(translated_pieces)


In [None]:
# Display the text of the rows that were just translated, as a sanity check.
data.loc[indices_to_translate, 'text']

The Spanish papers are now translated into English.

In [None]:
# Persist the processed subset to the working directory. The DataFrame index is
# written as the first CSV column because `index=False` is not passed.
data.to_csv('data_covid19_papers.csv')

# Language model data



* Training on 90% of the data and validating on 10%
* `TextLMDataBunch` builds a `TextDataBunch` suitable for training a language model.


In [None]:
# Shuffled 90/10 split; the fixed random_state keeps the split repeatable.
train, validation = train_test_split(
    data,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [None]:
# Build the language-model DataBunch from the train/validation frames, rooted in
# the current directory.
# The text column is named explicitly via `text_cols`. The previous call passed
# it as `label_cols` and only picked up the right text because the positional
# default (`text_cols=1`) happens to be the 'text' column in this frame.
# Per the fastai v1 docs, label columns are not used for language-model data.
data_lm = TextLMDataBunch.from_df('.', train, validation,
                                  text_cols='text')

Saving the results to save time in the future.

In [None]:
# Cache the processed DataBunch so it can be reloaded by the next cell.
data_lm.save('data_lm_export.pkl')

In [None]:
# Reload the DataBunch saved in the previous cell from the current directory.
# NOTE(review): the '.pkl' name suggests a pickle — only load files this notebook wrote itself.
data_lm = load_data('.', 'data_lm_export.pkl')

We now fine-tune the language model. It has already learnt general English from Wikipedia; here it is trained further to learn the peculiarities of COVID-19 papers and articles.

In [None]:
# Create the language-model learner on `data_lm` with the AWD_LSTM architecture
# and drop_mult=0.5.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)

In [None]:
# Run the learning-rate finder; its recorded results are plotted in the next cell.
learn.lr_find()

In [None]:
# Plot what the recorder captured during the preceding lr_find run.
learn.recorder.plot()

Training for 4 epochs

In [None]:
# First training stage: 4 epochs with the one-cycle policy.
# The second positional argument (2e-2) is presumably the maximum learning rate — TODO confirm.
learn.fit_one_cycle(4, 2e-2) 

In [None]:
# Re-run the learning-rate finder after the first training stage.
learn.lr_find()

In [None]:
# Plot the results of the second lr_find run.
learn.recorder.plot()

In [None]:
# Second training stage: unfreeze the learner, then train for 10 more epochs
# with a learning-rate slice from 1e-4 to 1e-3.
learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-4, 1e-3))

In [None]:
# Save the fine-tuned encoder under the name 'ft_enc' for later reuse.
learn.save_encoder('ft_enc')


# Inference


In [None]:
import torch

# Seed both RNGs before sampling text. fastai's `predict` draws the next word
# with torch, so seeding NumPy alone does not make the generated text repeatable.
np.random.seed(42)
torch.manual_seed(42)

In [None]:
def generate_text(prompt, n_words=50, temperature=0.1):
    """Ask the fine-tuned language model to continue `prompt`.

    Args:
        prompt: Seed text handed to `learn.predict`.
        n_words: Value forwarded as `n_words`.
        temperature: Value forwarded as `temperature`.

    Returns:
        Whatever `learn.predict` returns for these arguments.
    """
    # no_unk=True is passed on every call. The first prompt originally omitted
    # it, which is equivalent because True is the documented fastai v1 default.
    return learn.predict(prompt, n_words=n_words, no_unk=True, temperature=temperature)


# (prompt, keyword overrides) in the original cell order. An empty dict means
# the defaults above (n_words=50, temperature=0.1) are used.
PROMPTS = [
    ("risks factors", {"n_words": 100}),
    ("smoking", {"n_words": 100}),
    ("pre-existing pulmonary disease", {"temperature": 0.05}),
    ("Co-infections", {}),
    ("comorbidities", {}),
    ("Neonates", {}),
    ("pregnant women", {}),
    ("Socio-economic factors", {"n_words": 100}),
    ("behavioral factors", {"n_words": 80}),
    ("Transmission dynamics of the virus", {}),
    ("basic reproductive number", {"n_words": 80}),
    ("incubation period", {}),
    ("serial interval", {}),
    ("environmental factors", {}),
    ("modes of transmission", {}),
    ("Severity of disease", {}),
    ("risk of fatality among symptomatic hospitalized patients", {}),
    ("high-risk patient", {}),
    ("Susceptibility of populations", {}),
    ("Public health mitigation measures", {}),
    ("What do we know about COVID-19 risk factors?", {"n_words": 100}),
]

# Generate and print one continuation per prompt, separated by blank lines.
for prompt, overrides in PROMPTS:
    print(generate_text(prompt, **overrides))
    print()

In [None]:
# Final look at the first rows of the processed papers frame.
data.head()