# Description

This notebook analyzes overlap between:
- the list of human genes targeted by SARS-CoV-2 viral miRNAs (as predicted by our pipeline) and
- the list of human genes downregulated in people infected with SARS-CoV-2 (as discovered in gene expression datasets).

We consider two sets of target genes:
- with stricter filtering (score > 95 AND number of targets per miRNA < 800)
- with laxer filtering (score > 95)

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

# Load predicted genes

In [2]:
# Root folder of the target predictions for the non-conserved region.
targets_basedir = Path('../targets/non-conserved-region')

# Each pipeline keeps its target predictions in its own sub-folder.
mirnafold_dir = targets_basedir / 'mirnafold-pipeline' / 'mirnafold-targets'
vmir_dir = targets_basedir / 'vmir-pipeline' / 'vmir-targets'

# Stricter filtering: score > 95 AND fewer than 800 targets per miRNA.
mirnafold_path = mirnafold_dir / 'score-more-95-targets-less-800.csv'
vmir_path = vmir_dir / 'score-more-95-targets-less-800.csv'

# Laxer filtering: score > 95 only.
mirnafold_path_lax = mirnafold_dir / 'score-more-95.csv'
vmir_path_lax = vmir_dir / 'score-more-95.csv'

In [5]:
# Read the four prediction tables: strict and lax filtering for each pipeline.
mirnafold_targets, mirnafold_targets_lax, vmir_targets, vmir_targets_lax = (
    pd.read_csv(csv_path)
    for csv_path in (mirnafold_path, mirnafold_path_lax, vmir_path, vmir_path_lax)
)

# Report how many predicted targets (rows) each table contains.
print('mirnafold:', len(mirnafold_targets), 'vmir:', len(vmir_targets))
print(
    'mirnafold (laxer filtering):', len(mirnafold_targets_lax),
    'vmir laxer filtering:', len(vmir_targets_lax),
)

mirnafold: 50 vmir: 44
mirnafold (laxer filtering): 399 vmir laxer filtering: 102


In [10]:
# Combine the strictly filtered predictions of both pipelines.
# The outer merge (no `on=`, so it joins on the columns the frames share)
# keeps the rows of either pipeline. If a gene occurs more than once, only
# the prediction with the highest score (confidence) is retained.
predicted_genes_df = (
    mirnafold_targets
    .merge(vmir_targets, how='outer')
    .sort_values(by='Target Score', ascending=False)
    .drop_duplicates(subset='Gene Symbol', keep='first')
)
predicted_genes_df

Unnamed: 0,Target Rank,Target Score,Sequence,Gene Symbol,Gene Description,Strand,# of predicted targets,Seed
67,2,100,AUUGCCAUAGUAAUGGUGACAA,ZNF493,zinc finger protein 493,5',653,UUGCCAU
66,1,100,AUUGCCAUAGUAAUGGUGACAA,ZNF99,zinc finger protein 99,5',653,UUGCCAU
21,1,100,AUAGUGUUUAUAACACUUUGCU,MAP4K3,mitogen-activated protein kinase kinase kinase...,5',656,UAGUGUU
84,3,99,ACAAUUAUGCUUUGCUGUAUGA,TMEM56,transmembrane protein 56,3',667,CAAUUAU
63,1,99,UGAUUCUCUUCCUGUUCCAAGC,FRK,fyn related Src family tyrosine kinase,3',476,GAUUCUC
...,...,...,...,...,...,...,...,...
58,9,96,UCAGCAACACAGUUGCUGAUUC,SLK,STE20 like kinase,5',762,CAGCAAC
59,10,96,UCAGCAACACAGUUGCUGAUUC,LARP4B,La ribonucleoprotein domain family member 4B,5',762,CAGCAAC
60,11,96,UCAGCAACACAGUUGCUGAUUC,IL4R,interleukin 4 receptor,5',762,CAGCAAC
61,12,96,UCAGCAACACAGUUGCUGAUUC,TMEM178B,transmembrane protein 178B,5',762,CAGCAAC


In [6]:
# Combine the laxly filtered predictions of both pipelines.
# The outer merge (no `on=`, so it joins on the columns the frames share)
# keeps the rows of either pipeline. If a gene occurs more than once, only
# the prediction with the highest score (confidence) is retained.
predicted_genes_lax_df = (
    mirnafold_targets_lax
    .merge(vmir_targets_lax, how='outer')
    .sort_values(by='Target Score', ascending=False)
    .drop_duplicates(subset='Gene Symbol', keep='first')
)
predicted_genes_lax_df

Unnamed: 0,Target Rank,Target Score,Sequence,Gene Symbol,Gene Description
0,1,100,AUUCUAAUUUCUCCACGUCUUU,ICA1L,islet cell autoantigen 1 like
110,14,100,AUUCAUUUGUAAUUAGAGGUGA,TEAD1,TEA domain transcription factor 1
108,12,100,AUUCAUUUGUAAUUAGAGGUGA,VCAN,versican
107,11,100,AUUCAUUUGUAAUUAGAGGUGA,RPRD1B,regulation of nuclear pre-mRNA domain containi...
106,10,100,AUUCAUUUGUAAUUAGAGGUGA,OXR1,oxidation resistance 1
...,...,...,...,...,...
294,198,96,AUUCAUUUGUAAUUAGAGGUGA,RAB22A,"RAB22A, member RAS oncogene family"
295,199,96,AUUCAUUUGUAAUUAGAGGUGA,ADAMTS6,ADAM metallopeptidase with thrombospondin type...
296,200,96,AUUCAUUUGUAAUUAGAGGUGA,DCUN1D5,defective in cullin neddylation 1 domain conta...
297,201,96,AUUCAUUUGUAAUUAGAGGUGA,TMEM233,transmembrane protein 233


# Load differentially expressed genes (DEGs)

These are some significant DEGs in patients with COVID from supplementary materials at: https://academic.oup.com/cid/article/71/16/2052/5822600#supplementary-data

- The reported DEGs are all **down**-regulated
- They come from two conditions: severe and mild

In [16]:
!pip install python-docx openpyxl # xlrd 

Collecting openpyxl
  Downloading openpyxl-3.0.7-py2.py3-none-any.whl (243 kB)
[K     |████████████████████████████████| 243 kB 2.0 MB/s eta 0:00:01
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.7


In [7]:
from docx.api import Document

# adapted from https://stackoverflow.com/a/27862205/7948839
def parse_table(path_to_doc, table_id):
    """Read one table of a Word document into a DataFrame.

    The first row of the table supplies the column names; every following
    row becomes one record whose values are the text of its cells.

    Parameters
    ----------
    path_to_doc
        Location of the document; passed unchanged to ``Document``.
    table_id : int
        1-based position of the wanted table within the document.

    Returns
    -------
    pandas.DataFrame
        One row per table row after the header row. The frame is empty when
        the table has no rows besides the header.

    Raises
    ------
    ValueError
        If ``table_id`` is smaller than 1 or larger than the number of tables
        in the document.
    """
    document = Document(path_to_doc)
    n_tables = len(document.tables)

    # Only exception instances can be raised; raising a bare string fails with
    # an unrelated TypeError and hides the message, hence the ValueError.
    if table_id > n_tables:
        raise ValueError(
            f'There are only {n_tables} tables in the document, but you asked for table {table_id}.'
        )
    # Without this guard, 0 or a negative id would silently select a table
    # counted from the end of the document (negative indexing).
    if table_id < 1:
        raise ValueError(f'Table ids start at 1, but you asked for table {table_id}.')

    table = document.tables[table_id - 1]

    # Text of every cell, row by row.
    cell_texts = [[cell.text for cell in row.cells] for row in table.rows]
    if not cell_texts:
        return pd.DataFrame()

    # The first row holds the headers; they become the keys of each record.
    header, *body = cell_texts
    records = [dict(zip(header, values)) for values in body]

    return pd.DataFrame(records)

In [20]:
# Supplementary materials of the paper; download from
# https://academic.oup.com/cid/article/71/16/2052/5822600#supplementary-data
degs_doc_path = 'ciaa462_suppl_supplymentary_materials.docx'

# Table 5 is read as the severe condition: unify the name of the gene column
# and discard the two columns that are not used here.
severe_degs = (
    parse_table(degs_doc_path, 5)
    .rename(columns={'A1_Label': 'Gene Symbol'})
    .drop(columns=['Degree', 'Betweenness'])
)

# Table 6 is read as the mild condition and gets the same treatment.
mild_degs = (
    parse_table(degs_doc_path, 6)
    .rename(columns={'B1_Label': 'Gene Symbol'})
    .drop(columns=['Degree', 'Betweenness'])
)

# Keep genes reported for either condition. pandas suffixes the clashing
# 'Expression' columns with _x / _y, which are renamed per condition.
conditions_mapping = {'Expression_x': 'Expression_severe', 'Expression_y': 'Expression_mild'}
degs_df = (
    pd.merge(severe_degs, mild_degs, how='outer', on='Gene Symbol')
    .rename(columns=conditions_mapping)
)

# Check overlap

### Stricter miRNA filtering

In [11]:
# Genes that are both predicted targets (stricter filtering) and reported DEGs.
predicted_genes_df.merge(degs_df, how='inner', on='Gene Symbol')

Unnamed: 0,Target Rank,Target Score,Sequence,Gene Symbol,Gene Description,Strand,# of predicted targets,Seed,Expression_severe,Expression_mild
0,7,98,AUAGUGUUUAUAACACUUUGCU,SOS1,SOS Ras/Rac guanine nucleotide exchange factor 1,5',656,UAGUGUU,-1.85018,
1,2,96,UCAUUACUUCAGGUGAUGGCAC,PIAS2,protein inhibitor of activated STAT 2,5',435,CAUUACU,-2.38271,-2.29114
2,21,96,AUAGUGUUUAUAACACUUUGCU,TNFRSF1A,TNF receptor superfamily member 1A,5',656,UAGUGUU,-1.19104,-1.69855


### Laxer miRNA filtering

In [12]:
# Genes that are both predicted targets (laxer filtering) and reported DEGs.
predicted_genes_lax_df.merge(degs_df, how='inner', on='Gene Symbol')

Unnamed: 0,Target Rank,Target Score,Sequence,Gene Symbol,Gene Description,Expression_severe,Expression_mild
0,31,100,AUUCAUUUGUAAUUAGAGGUGA,PTEN,phosphatase and tensin homolog,-1.29593,-1.15863
1,16,98,AUGAAGAAGGUAACAUGUUCAA,AHR,aryl hydrocarbon receptor,-1.35261,-2.21396
2,7,98,AUAGUGUUUAUAACACUUUGCU,SOS1,SOS Ras/Rac guanine nucleotide exchange factor 1,-1.85018,
3,146,97,AUUCAUUUGUAAUUAGAGGUGA,PIAS2,protein inhibitor of activated STAT 2,-2.38271,-2.29114
4,21,96,AUAGUGUUUAUAACACUUUGCU,TNFRSF1A,TNF receptor superfamily member 1A,-1.19104,-1.69855


# Confusing dataset ahead (work in progress)

The dataset: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE162911

It should have 9 COVID-infected and 3 non-infected donors.

Which donors are which? I checked all the files but could not work it out. Judging by the sheet "ROI_IDs_across_modalities" in the Excel file, COVID antigen status is only given for different autopsy samples within Trachea and LUL (lungs, I assume?) from the same donor (D20).

**But where is data for each individual donor in general?**

**Note:** it's probably super obvious and I'm just missing some paragraph in some file where everything is explained.

____

Once we have figured out the question above, we can use the dataframe below to find DEGs.

The table already contains normalized expression counts, so I'll just calculate the log-fold-change values and keep the genes whose value is significant (smaller than -1).

In [19]:
# Display the sheet holding the count matrix (per its name, the Q3-normalized
# target counts) from the GSE162911 workbook.
pd.read_excel(
    io='GSE162911_Nanostring_Count_Matrices.xlsx',
    sheet_name='WTA_Q3Norm_TargetCountMatrix',
)

Unnamed: 0,Gene,DSP.1001250001619.A01,DSP.1001250001619.A02,DSP.1001250001619.A03,DSP.1001250001619.A04,DSP.1001250001619.A05,DSP.1001250001619.A06,DSP.1001250001619.A07,DSP.1001250001619.A08,DSP.1001250001619.A09,...,DSP.1001660004763.H03,DSP.1001660004763.H04,DSP.1001660004763.H05,DSP.1001660004763.H06,DSP.1001660004763.H07,DSP.1001660004763.H08,DSP.1001660004763.H09,DSP.1001660004763.H10,DSP.1001660004763.H11,DSP.1001660004763.H12
0,A1BG,38.612707,30.483716,27.943406,25.046080,38.612707,26.201479,9.653177,27.311427,29.958134,...,27.133253,7.020492,24.002493,25.741804,20.685379,30.714653,21.061476,15.445083,20.322477,30.890165
1,A1CF,38.612707,37.596583,33.024025,34.438360,28.959530,30.338555,9.653177,28.253200,31.289607,...,32.351187,28.081968,38.612707,12.870902,23.443429,32.469776,31.592214,23.167624,32.515963,27.028895
2,A2M,38.612707,143.273464,93.991457,181.584080,24.132942,55.161009,453.699302,153.509053,79.888358,...,425.783359,224.655747,314.119586,300.321051,372.336813,287.840176,222.900624,417.017231,481.642708,440.184855
3,A2ML1,38.612707,24.386973,18.798291,21.915320,19.306353,27.580505,19.306353,24.486107,29.292398,...,15.653800,10.530738,31.307600,17.161203,39.991732,28.081968,21.061476,15.445083,30.483716,3.861271
4,A3GALT2,38.612707,22.354725,32.007901,31.307600,28.959530,35.854656,28.959530,31.078520,39.944179,...,21.915320,21.061476,25.046080,17.161203,19.306353,30.714653,25.449284,38.612707,16.257982,38.612707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18358,ZYG11A,38.612707,23.370849,18.798291,27.133253,43.439295,38.612707,28.959530,23.544333,29.292398,...,30.264013,21.061476,26.089667,8.580601,35.854656,28.081968,35.980022,30.890165,32.515963,30.890165
18359,ZYG11B,38.612707,23.370849,36.072397,44.874227,38.612707,44.128808,28.959530,30.136747,41.941388,...,24.002493,35.102461,35.481947,34.322406,30.338555,79.858098,16.673669,7.722541,26.419220,34.751436
18360,ZYX,38.612707,89.418899,64.015803,82.443346,48.265883,66.193211,222.023063,101.711520,65.907896,...,112.707360,70.204921,84.530520,94.386616,96.531766,80.735659,72.837606,92.670496,105.676881,77.225413
18361,ZZEF1,38.612707,29.467592,27.435344,37.569120,38.612707,24.822454,28.959530,32.962067,32.621080,...,49.048573,52.653691,34.438360,17.161203,52.402959,32.469776,40.367830,61.780331,38.612707,27.028895
