# Description

This notebook analyzes the overlap between:
- the list of human genes targeted by SARS-CoV-2 viral miRNAs (as predicted by our pipeline), and
- the list of human genes downregulated in people infected with SARS-CoV-2 (as identified in gene expression datasets).

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

# Load predicted genes

In [3]:
# Root directory holding the target predictions of both pipelines.
targets_basedir = Path('../targets/non-conserved-region')

# Filtered target lists exported by the miRNAFold and VMir pipelines.
mirnafold_path = targets_basedir.joinpath(
    'mirnafold-pipeline/mirnafold-targets/score-more-95-targets-less-800.csv'
)
vmir_path = targets_basedir.joinpath(
    'vmir-pipeline/vmir-targets/score-more-95-targets-less-800.csv'
)

In [30]:
# Read the target predictions exported by each pipeline.
mirnafold_targets, vmir_targets = (
    pd.read_csv(csv_path) for csv_path in (mirnafold_path, vmir_path)
)

# Show how many rows each pipeline contributed.
'mirnafold:', len(mirnafold_targets), 'vmir:', len(vmir_targets)

('mirnafold:', 50, 'vmir:', 44)

In [54]:
# Merge target predictions from both pipelines.
# `merged` is bound here so the cell runs on a fresh kernel
# (it was previously referenced without ever being defined in the notebook).
merged = pd.merge(mirnafold_targets, vmir_targets, how='outer')

# In case of duplicate gene symbols, keep the row with the highest score (confidence):
# sort by score descending, then keep the first occurrence of each gene.
predicted_genes_df = (
    merged
    .sort_values('Target Score', ascending=False)
    .drop_duplicates('Gene Symbol', keep='first')
)
predicted_genes_df

Unnamed: 0,Target Rank,Target Score,Sequence,Gene Symbol,Gene Description,Strand,# of predicted targets,Seed
67,2,100,AUUGCCAUAGUAAUGGUGACAA,ZNF493,zinc finger protein 493,5',653,UUGCCAU
66,1,100,AUUGCCAUAGUAAUGGUGACAA,ZNF99,zinc finger protein 99,5',653,UUGCCAU
21,1,100,AUAGUGUUUAUAACACUUUGCU,MAP4K3,mitogen-activated protein kinase kinase kinase...,5',656,UAGUGUU
84,3,99,ACAAUUAUGCUUUGCUGUAUGA,TMEM56,transmembrane protein 56,3',667,CAAUUAU
63,1,99,UGAUUCUCUUCCUGUUCCAAGC,FRK,fyn related Src family tyrosine kinase,3',476,GAUUCUC
...,...,...,...,...,...,...,...,...
58,9,96,UCAGCAACACAGUUGCUGAUUC,SLK,STE20 like kinase,5',762,CAGCAAC
59,10,96,UCAGCAACACAGUUGCUGAUUC,LARP4B,La ribonucleoprotein domain family member 4B,5',762,CAGCAAC
60,11,96,UCAGCAACACAGUUGCUGAUUC,IL4R,interleukin 4 receptor,5',762,CAGCAAC
61,12,96,UCAGCAACACAGUUGCUGAUUC,TMEM178B,transmembrane protein 178B,5',762,CAGCAAC


In [28]:
# Alphabetically ordered symbols of all predicted target genes.
gene_symbols = predicted_genes_df['Gene Symbol']
predicted_genes = sorted(gene_symbols.tolist())
print('Predicted genes:', predicted_genes)

Predicted genes: ['AADACL3', 'ARHGAP32', 'ASH1L', 'ATF7', 'BRI3BP', 'BTAF1', 'C15orf40', 'C2CD2', 'CADM2', 'CDH13', 'CFL2', 'CMTM4', 'CPEB2', 'CTLA4', 'CUL3', 'CXCL5', 'DICER1', 'DMTF1', 'DOCK4', 'EIF5A2', 'ERO1B', 'FAM92B', 'FNDC3B', 'FRK', 'FRS2', 'GALNT7', 'GATM', 'GRIA3', 'HOOK1', 'HSBP1', 'IL4R', 'JADE1', 'KCTD3', 'KIAA1324L', 'KMT2A', 'LARP4B', 'LHFPL5', 'LHX9', 'LRAT', 'LRRFIP1', 'MAP4K3', 'MOSMO', 'MSTN', 'MTX3', 'MUC15', 'NELL1', 'PCDH19', 'PDE10A', 'PFN2', 'PGM3', 'PIAS2', 'PKN2', 'PNPT1', 'RAB14', 'RASGEF1A', 'RBM41', 'RCN2', 'RD3', 'RGL2', 'RHOQ', 'RNF11', 'RUFY3', 'SAMD8', 'SATB2', 'SESTD1', 'SH2B3', 'SLC16A7', 'SLK', 'SOS1', 'SPAG9', 'STRN3', 'TMEM178B', 'TMEM181', 'TMEM56', 'TNFAIP8L3', 'TNFRSF1A', 'TSC22D2', 'UBE2D3', 'UBP1', 'VPS13C', 'YAE1', 'ZDHHC21', 'ZFAND4', 'ZFX', 'ZIC2', 'ZNF117', 'ZNF138', 'ZNF385B', 'ZNF493', 'ZNF714', 'ZNF728', 'ZNF730', 'ZNF99']


# Load differentially expressed genes (DEGs)

The DEGs are from supplementary materials at: https://academic.oup.com/cid/article/71/16/2052/5822600#supplementary-data

*The reported DEGs are all **down**-regulated*. They come from two conditions: severe and mild. 

In [31]:
!pip install python-docx

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 2.0 MB/s eta 0:00:01
[?25hCollecting lxml>=2.3.2
  Downloading lxml-4.6.3-cp38-cp38-macosx_10_9_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 11.9 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25ldone
[?25h  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184600 sha256=6762ccf648704e83d4a6da8af60a78ce33ef17a2e0dd1e363f520258e41b5c32
  Stored in directory: /Users/macbook/Library/Caches/pip/wheels/32/b8/b2/c4c2b95765e615fe139b0b17b5ea7c0e1b6519b0a9ec8fb34d
Successfully built python-docx
Installing collected packages: lxml, python-docx
Successfully installed lxml-4.6.3 python-docx-0.8.11


In [44]:
from docx.api import Document

# adapted from https://stackoverflow.com/a/27862205/7948839
def parse_table(path_to_doc, table_id):
    """Read one table of a .docx document into a DataFrame.

    The first row of the table supplies the column names; every following
    row becomes one record. Cell values are kept exactly as the text
    returned by ``cell.text`` (no numeric conversion is attempted).

    Parameters
    ----------
    path_to_doc : str or path-like
        Location of the .docx file, passed straight to ``Document``.
    table_id : int
        1-based position of the wanted table within the document.

    Returns
    -------
    pd.DataFrame
        One row per table row after the header. Empty if the table has
        no rows besides the header.

    Raises
    ------
    ValueError
        If ``table_id`` is smaller than 1 or larger than the number of
        tables in the document.
    """
    document = Document(path_to_doc)
    n_tables = len(document.tables)

    # Reject ids below 1 as well: 0 would otherwise silently select the last
    # table through negative indexing.
    if not 1 <= table_id <= n_tables:
        raise ValueError(
            f'There are only {n_tables} tables in the document, but you asked for table {table_id}.'
        )

    table = document.tables[table_id - 1]

    rows = []  # a list of dictionaries containing each row's data.

    keys = None
    for i, row in enumerate(table.rows):
        text = (cell.text for cell in row.cells)

        # Establish the mapping based on the first row
        # headers; these will become the keys of our dictionary
        if i == 0:
            keys = tuple(text)
            continue

        # Construct a dictionary for this row, mapping
        # keys to values for this row
        rows.append(dict(zip(keys, text)))

    return pd.DataFrame(rows)

In [65]:
# Supplementary document that contains the DEG tables.
degs_doc_path = 'ciaa462_suppl_supplymentary_materials.docx'

# Table 5: DEGs of the severe condition; keep only the gene symbol and expression.
severe_degs = (
    parse_table(degs_doc_path, 5)
    .rename(columns={'A1_Label': 'Gene Symbol'})
    .drop(columns=['Degree', 'Betweenness'])
)

# Table 6: DEGs of the mild condition, reduced to the same columns.
mild_degs = (
    parse_table(degs_doc_path, 6)
    .rename(columns={'B1_Label': 'Gene Symbol'})
    .drop(columns=['Degree', 'Betweenness'])
)

# Outer-join both conditions per gene and give the pandas-suffixed expression
# columns names that state the condition they come from.
conditions_mapping = {'Expression_x': 'Expression_severe', 'Expression_y': 'Expression_mild'}
degs_df = (
    pd.merge(severe_degs, mild_degs, how='outer', on='Gene Symbol')
    .rename(columns=conditions_mapping)
)

# Check overlap

In [66]:
# Genes that are both predicted miRNA targets and reported as down-regulated.
predicted_genes_df.merge(degs_df, how='inner', on='Gene Symbol')

Unnamed: 0,Target Rank,Target Score,Sequence,Gene Symbol,Gene Description,Strand,# of predicted targets,Seed,Expression_severe,Expression_mild
0,7,98,AUAGUGUUUAUAACACUUUGCU,SOS1,SOS Ras/Rac guanine nucleotide exchange factor 1,5',656,UAGUGUU,-1.85018,
1,2,96,UCAUUACUUCAGGUGAUGGCAC,PIAS2,protein inhibitor of activated STAT 2,5',435,CAUUACU,-2.38271,-2.29114
2,21,96,AUAGUGUUUAUAACACUUUGCU,TNFRSF1A,TNF receptor superfamily member 1A,5',656,UAGUGUU,-1.19104,-1.69855
