In [154]:
import pandas as pd
from tqdm import tqdm
import re
from nltk.corpus import stopwords
import preprocessing

In [46]:
path = 'data/cord-19/metadata.csv'
df = pd.read_csv(path, low_memory=False)

In [47]:
def drop_emptier_duplicates(df, col):
    """For all sets of rows with the same value of duplicate_column, keep only the one with the fewest NaNs"""
    duplicates_df = df[df[col].notnull() & df[col].duplicated(keep=False)]
    duplicates_df['nans'] = duplicates_df.apply(lambda x: x.isnull().sum(), axis=1)
    droplist = []
    print("Choosing rows to drop")
    for value in tqdm(duplicates_df[col].unique()):
        sets = duplicates_df[duplicates_df[col] == value]
        for i in sets.sort_values('nans', ascending=False).iloc[1:].index:
            droplist.append(i)
    print(f'dropping {len(droplist)} duplicate rows')
    return df.drop(index=droplist)

In [49]:
cols = ['cord_uid', 'doi', 'pdf_json_files', 'pmc_json_files']

In [50]:
for col in cols:
    df = drop_emptier_duplicates(df, col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates_df['nans'] = duplicates_df.apply(lambda x: x.isnull().sum(), axis=1)
  0%|          | 21/24171 [00:00<02:00, 200.85it/s]

Choosing rows to drop


100%|██████████| 24171/24171 [01:59<00:00, 201.60it/s]


dropping 28504 duplicate rows


100%|██████████| 1/1 [00:00<00:00, 548.56it/s]

Choosing rows to drop
dropping 1 duplicate rows



100%|██████████| 17/17 [00:00<00:00, 1148.79it/s]

Choosing rows to drop
dropping 18 duplicate rows



0it [00:00, ?it/s]

Choosing rows to drop
dropping 0 duplicate rows





In [66]:
# drop rows with no title
df = df[df.title.notnull()]

In [169]:
def drop_duplicate_titles(df):
    
    def clean_text(s):
        s = s.lower() # lowercase
        s = re.sub(r'[-–]', ' ', s) # replace hyphens with spaces
        s = re.sub(r'[^a-z|0-9|\s]', '', s) # remove anything that isn't alphanumeric or whitespace
        return s
    
    df['title_clean'] = df.title.apply(clean_text)
    duplicated = df[df.title_clean.duplicated(keep=False)]
    duplicated['title_len'] = duplicated.title_clean.apply(len)
    duplicated['num_nulls'] = duplicated.apply(lambda x: x.isnull().sum(), axis=1)
    duplicated = duplicated[duplicated.title_len >= 40]
    drop_index = []
    for title in tqdm(duplicated.title_clean.unique()):
        group = duplicated[duplicated.title_clean == title].sort_values('num_nulls')
        drop_index += group.iloc[1:].index.tolist()
    df = df.drop(index=drop_index)
    return df

In [162]:
df_.title = df_.title.apply(lambda x: re.sub(f'\W+$', '', x))

In [84]:
print(len(df))
df_ = drop_duplicate_titles(df)
print(len(df_))

508049


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicated['title_len'] = duplicated.title.apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicated['num_nulls'] = duplicated.apply(lambda x: x.isnull().sum(), axis=1)
100%|██████████| 66373/66373 [18:58<00:00, 58.32it/s]


440064


In [87]:
df_.drop(columns=['num_nulls'], inplace=True)

In [96]:
# drop rows where the title and abstract are fewer than 100 words together, and there's no text file

def short(row, minlength=100):
    title_abstract = str(row.title) + str(row.abstract)
    return len(title_abstract.split()) < minlength

mask1 = df_.pdf_json_files.isnull()
mask2 = df_.pmc_json_files.isnull()
mask3 = df_.apply(short, axis=1)

In [99]:
df_ = df_[~(mask1 & mask2 & mask3)]

In [101]:
df_.drop(columns=['title_len'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [107]:
# df_.to_csv('data/processed/metadata_clean.csv.gz', index=False, sep='\t', compression='gzip')

In [108]:
# df_test = pd.read_csv('data/processed/metadata_clean.csv.gz', sep='\t', compression='gzip')
# df_test.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


In [114]:
df_['abstract'].fillna('None', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [116]:
df_['search_text'] = preprocessing.make_search_documents(df_, 
                                                         stem=False, 
                                                         lemmatize=True, 
                                                         stopword_list=stopwords.words('english'))

100%|██████████| 338455/338455 [22:44<00:00, 248.09it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['search_text'] = preprocessing.make_search_documents(df_,


In [118]:
# df_.to_csv('data/processed/metadata_clean.csv.gz', index=False, sep='\t', compression='gzip')

In [136]:
df_['date'] = pd.to_datetime(df_.publish_time)
df_ = df_[df_.date.notnull()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['date'] = pd.to_datetime(df_.publish_time)


In [140]:
df_ = df_[df_.date >= '2010-01-01']

In [141]:
len(df_)

314838

In [161]:
len(df_)

314838

In [174]:
# df_.to_csv('data/processed/metadata_clean.csv.gz', index=False, sep='\t', compression='gzip')