# Filter data based on keywords

## Introduction
The aim of this notebook is to read a set of keywords and a set of scraped data and filter out all non-Covid-19 related entries/rows.

## Import libraries and set up defaults

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#%xmode Verbose
# Set global default figure size
plt.rc('figure', figsize=(20, 12)) # It's nice with figures that fill the whole space in width
# Show maximum of 8 rows when printing dataframes
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 8
# Show only 4 digits when printing floating point number
np.set_printoptions(precision=4, suppress=True)

## Read in Covid-19 keywords

In [2]:
key_words_df = pd.read_csv("COVID19_Proteins-SARS-COV-2_Protein_Names.tsv",
                           sep = '\t',
                           header = 0,
                           usecols = ['ORF','Gene','Gene2','Full_Name'] # Dropping: 'Function', Source and a url
                          )
key_words_df

Unnamed: 0,ORF,Gene,Gene2,Full_Name
0,ORF1AB,nsp1,,Host translation inhibitor nsp1
1,ORF1AB,nsp2,,Non-structural protein 2
2,ORF1AB,nps3,,Papain-like proteinase
3,ORF1AB,nps4,,Non-structural protein 4
...,...,...,...,...
20,ORF7A,,,Protein 7a
21,ORF8,,,Protein 8
22,N,,,Nucleoprotein
23,ORF10,,,3` UTR


## Create a unique Python list of keywords

In [3]:
key_words_list = (key_words_df['ORF']
                  .dropna() # Drop np.nan:s
                  .unique() # Filter all non-unique values
                  .tolist() # Make a python list
                 ) + \
key_words_df['Gene'].dropna().unique().tolist() + \
key_words_df['Gene2'].dropna().unique().tolist() + \
key_words_df['Full_Name'].dropna().unique().tolist()
print(key_words_list)

['ORF1AB', 'S', 'ORF3A', 'E', 'M', 'ORF6', 'ORF7A', 'ORF8', 'N', 'ORF10', 'nsp1', 'nsp2', 'nps3', 'nps4', 'nsp5', 'nsp6', 'nsp7', 'nsp8', 'nsp9', 'nsp10', 'RDRP', 'Hel', 'Exon', 'NendoU', "2'-O-MT", 'Spike', '3CL-PRO', 'Spike trimeric complex (S1, S2, S`)', 'Host translation inhibitor nsp1', 'Non-structural protein 2', 'Papain-like proteinase', 'Non-structural protein 4', '3C-like proteinase', 'Non-structural protein 6', 'Non-structural protein 7', 'Non-structural protein 8', 'Non-structural protein 9', 'Non-structural protein 10', 'RNA-Directed RNA Polymerase', 'Helicase', 'Proofreading exoribonuclease (Guanine-N7 methyltransferase)', 'Uridylate-specific endoribonuclease', "2'-O-methyltransferase", 'Spike surface glycoprotein (monomer)', 'Protein 3a', 'Envelope small membrane proteins', 'Membrane protein', 'Protein 6', 'Protein 7a', 'Protein 8', 'Nucleoprotein', '3` UTR']


## Dummy data for testing the filtering

In [4]:
data = {'col_1': ["Helicase", "This is a Protein 8 pdf abstract", "https://figshare.com/articles/nsp10/12162405", "Test"], 
        'col_2': ['a', 'b', 'c', 'd']}
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,col_1,col_2
0,Helicase,a
1,This is a Protein 8 pdf abstract,b
2,https://figshare.com/articles/nsp10/12162405,c
3,Test,d


## Initialise a pd.Series
The purpose of this Series is to hold a boolen index of if a certain keyword is found in the column.

In [5]:
search_column = "col_1"
falses = np.zeros(len(df[search_column]), dtype=bool) # https://stackoverflow.com/a/21174962
found = pd.Series(data = falses,
                  dtype = bool)

## Find all indexes with a match

In [6]:
for word in key_words_list:
    # Find out if the current search term can be found in the column
    cur_match = df["col_1"].str.contains(word) # https://stackoverflow.com/a/15333283
    # Join the found matches to one Series
    found = found | cur_match

  return func(self, *args, **kwargs)


## Select only rows with matches

In [7]:
df[found]

Unnamed: 0,col_1,col_2
0,Helicase,a
1,This is a Protein 8 pdf abstract,b
2,https://figshare.com/articles/nsp10/12162405,c
