First we need to import the necessary modules.

In [1]:
#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import random
#from beeswarm import beeswarm
#from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import ttest_ind
#from matplotlib.patches import Ellipse,Patch,Arrow
#from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('png')
#from IPython.display import HTML, display
#import tabulate
#from decimal import Decimal
import warnings
warnings.filterwarnings('ignore') #in order to suppress warnings output of t-test if one sample is 0
pd.set_option('display.max_rows', 1000)

Then we read the sample files and extract separate data containers for the four tissue groups in question.

In [2]:
# Sample annotation, restricted to the four studies used in this analysis.
sampleinfo=pd.read_csv('sample_info_consensus.csv',sep=',')
sampleinfo=sampleinfo.query('paper == "fromm" or paper == "schee" or paper == "selitsky" or paper == "neerincx"')

# Count matrix; the unnamed first CSV column is renamed to 'mirna'.
countmat=pd.read_csv('consensus.rpm_uniq_seqs_correct_names.csv',sep=',').rename(columns={'Unnamed: 0':'mirna'})


def _samples_for(tissue,condition,skip_paper=None):
    """Return the countmat columns of all samples matching tissue and condition.

    sampleinfo rows are read by position: field 0 is used as the column name in
    countmat, field 3 is compared with `tissue`, field 2 with `condition`.
    When `skip_paper` is given, rows whose field 4 equals it are left out
    (field 4 is presumably the 'paper' column - confirm against the CSV header).
    """
    wanted=[]
    for row in sampleinfo.values:
        if row[3]!=tissue or row[2]!=condition:
            continue
        if skip_paper is not None and row[4]==skip_paper:
            continue
        wanted.append(row[0])
    return countmat[wanted]


liver_normal=_samples_for('liver','normal')
liver_meta=_samples_for('liver','metastasis')
crc_normal=_samples_for('colorect','normal',skip_paper='schee')
crc_tumor=_samples_for('colorect','tumor')

# Label the rows of every container with the miRNA names.
mirna_names=list(countmat['mirna'])
for frame in (liver_normal,liver_meta,crc_normal,crc_tumor):
    frame.index=mirna_names

For each miRNA, several values are calculated: t-test p-values between the different tissue types, fold changes between the group averages, and the number of samples per group. These values are later used to filter the data. In addition, miRNAs that are known to be tissue-related are flagged so that they can be excluded by the filters.

In [5]:
mirnanamelist=[]
mirnadatalist=[]
# miRNAs that get tissue_mirna=True in the result table.
# Names are compared verbatim with countmat['mirna'], so the trailing space in each entry must be kept.
tissuemirna=["Hsa-Mir-122_5p ","Hsa-Mir-126_5p ","Hsa-Mir-126_3p ","Hsa-Mir-144_5p ","Hsa-Mir-144_3p ","Hsa-Mir-486_5p ",
             "Hsa-Mir-143_3p ","Hsa-Mir-145_5p ","Hsa-Mir-150_5p ","Hsa-Mir-142-P1_5p ","Hsa-Mir-223_3p "]


def _signed_fold_change(mean_a,mean_b):
    """Return the fold change mean_a/mean_b on a signed scale.

    Ratios of 1 or more are returned unchanged. Ratios below 1 are returned as
    -1/(ratio+0.00001), i.e. as a negative number; the small offset keeps the
    expression finite when mean_a is 0.
    """
    ratio=mean_a/mean_b
    if ratio<1:
        ratio=-1/(ratio+0.00001)
    return ratio


# Extract the underlying arrays and the name list once instead of on every access inside the loop.
# cb/cm/lb/lm abbreviate crc_normal/crc_tumor/liver_normal/liver_meta, as in the result column names.
cb_rows=crc_normal.values
cm_rows=crc_tumor.values
lb_rows=liver_normal.values
lm_rows=liver_meta.values
mirna_names=list(countmat['mirna'])

for i,name in enumerate(mirna_names):
    cb,cm,lb,lm=cb_rows[i],cm_rows[i],lb_rows[i],lm_rows[i]

    # p values of the pairwise t-tests
    _,p1=ttest_ind(cb,cm)
    _,p2=ttest_ind(cm,lm)
    _,p3=ttest_ind(cb,lb)
    _,p4=ttest_ind(lm,lb)

    # group averages, each computed once and reused below
    mean_cb,mean_cm,mean_lb,mean_lm=np.mean(cb),np.mean(cm),np.mean(lb),np.mean(lm)

    # signed fold changes between the group averages
    fc_cm_cb=_signed_fold_change(mean_cm,mean_cb)
    fc_lm_cm=_signed_fold_change(mean_lm,mean_cm)
    fc_lb_cb=_signed_fold_change(mean_lb,mean_cb)
    fc_lb_lm=_signed_fold_change(mean_lb,mean_lm)

    # number of measurements per group
    n_cb,n_cm,n_lb,n_lm=len(cb),len(cm),len(lb),len(lm)

    # sum of the four group averages (same summation order as before to keep results identical)
    totcounts=mean_lb+mean_lm+mean_cb+mean_cm
    tm=name in tissuemirna

    mirnadatalist.append([p1,p2,p3,p4,fc_cm_cb,fc_lm_cm,fc_lb_cb,fc_lb_lm,n_cb,n_cm,n_lb,n_lm,totcounts,tm])
    mirnanamelist.append(name)

mirnalist=pd.DataFrame(data=mirnadatalist,index=mirnanamelist,columns=[
                                               'p_cm_cb',       # t-test crc_normal vs crc_tumor
                                               'p_cm_lm',       # t-test crc_tumor vs liver_meta
                                               'p_cb_lb',       # t-test crc_normal vs liver_normal
                                               'p_lm_lb',       # t-test liver_meta vs liver_normal
                                               'fc_cm_cb',      # crc_tumor / crc_normal
                                               'fc_lm_cm',      # liver_meta / crc_tumor
                                               'fc_lb_cb',      # liver_normal / crc_normal
                                               'fc_lb_lm',      # liver_normal / liver_meta
                                               'N_cb',
                                               'N_cm',
                                               'N_lb',
                                               'N_lm',
                                               'total_counts',  # sum of the four group averages
                                               'tissue_mirna']) # True if the name is listed in tissuemirna

This is what the data collection looks like for one example miRNA.

In [6]:
mirnalist.T['Hsa-Mir-122_5p ']

p_cm_cb            0.227864
p_cm_lm         4.72873e-07
p_cb_lb         1.28378e-15
p_lm_lb         7.69653e-13
fc_cm_cb           -2.29099
fc_lm_cm            243.423
fc_lb_cb             4146.9
fc_lb_lm            39.0297
N_cb                     24
N_cm                    101
N_lb                     24
N_lm                     19
total_counts         140249
tissue_mirna           True
Name: Hsa-Mir-122_5p , dtype: object

The following lists consist of miRNAs for which the sum of the average counts of the four tissue groups is higher than 400.

We are interested in the miRNAs where the liver malign/colon malign fold change (fc_lm_cm) and the liver benign/colon benign fold change (fc_lb_cb) point in opposite directions, i.e. one is larger than 1 while the other is smaller than -1. In order to exclude cases where both center around FC=1, at least one of the two must additionally pass the stricter cutoff of 1.5 or -1.5, respectively.

As an additional filter, first come the ones where the change from colon malign to liver malign follows the same trend as liver benign to liver malign (fc_lm_cm and fc_lb_lm have the same sign).

In [10]:
fco=1.5 # stricter fold-change cutoff; the plain cutoff of +/-1 is written directly in the query
# Keep non-tissue miRNAs with total_counts > 400 whose fc_lm_cm and fc_lb_cb have opposite signs
# (one of them beyond the stricter cutoff) and whose fc_lm_cm and fc_lb_lm have the same sign.
query1=(
    'total_counts > 400 and tissue_mirna==False and '
    '((fc_lm_cm < {neg:f} and fc_lb_cb > 1) or (fc_lm_cm > {pos:f} and fc_lb_cb < -1) or '
    '(fc_lm_cm < -1 and fc_lb_cb > {pos:f}) or (fc_lm_cm > 1 and fc_lb_cb < {neg:f})) '
    'and fc_lm_cm*fc_lb_lm>0'
).format(neg=-fco,pos=fco)
mirnalist.query(query1)

Unnamed: 0,p_cm_cb,p_cm_lm,p_cb_lb,p_lm_lb,fc_cm_cb,fc_lm_cm,fc_lb_cb,fc_lb_lm,N_cb,N_cm,N_lb,N_lm,total_counts,tissue_mirna
Hsa-Mir-103-P1-2_3p,9.244116e-12,0.116527,4.348665e-08,0.481822,2.232835,-1.155588,1.810368,-1.067277,24,101,24,19,13225.097042,False
Hsa-Mir-103-P3_3p,1.060049e-11,0.102521,1.784214e-07,0.325442,2.233893,-1.163499,1.751143,-1.09639,24,101,24,19,12807.600724,False
Hsa-Mir-19-P1_3p,4.418713e-13,0.731784,0.002403434,0.000907,5.157602,-1.046291,2.105632,-2.340982,24,101,24,19,1008.450246,False
Hsa-Mir-192-P2_5p,1.2379269999999998e-19,0.015509,2.809846e-06,0.305364,-2.626522,1.343741,-1.765132,1.107368,24,101,24,19,178860.775452,False
Hsa-Mir-214_3p,2.029039e-05,0.915933,0.0001038462,0.690395,1.990722,-1.015026,1.827285,-1.073292,24,101,24,19,510.04667,False
Hsa-Mir-31_5p,0.01439426,0.053782,0.2005683,0.020324,111.85562,-8.027668,1.330055,-10.47414,24,101,24,19,546.766822,False
Hsa-Mir-374-P1_3p,1.45771e-11,0.07596,0.0001371234,0.449178,2.380562,-1.202316,1.773998,-1.116086,24,101,24,19,726.136701,False


Now come the ones where the change from colon malign to liver malign does not follow the same trend as liver benign to liver malign (fc_lm_cm and fc_lb_lm have opposite signs).

In [11]:
fco=1.5 # stricter fold-change cutoff; the plain cutoff of +/-1 is written directly in the query
# Keep non-tissue miRNAs with total_counts > 400 whose fc_lm_cm and fc_lb_cb have opposite signs
# (one of them beyond the stricter cutoff) and whose fc_lm_cm and fc_lb_lm have opposite signs.
query2=(
    'total_counts > 400 and tissue_mirna==False and '
    '((fc_lm_cm < {neg:f} and fc_lb_cb > 1) or (fc_lm_cm > {pos:f} and fc_lb_cb < -1) or '
    '(fc_lm_cm < -1 and fc_lb_cb > {pos:f}) or (fc_lm_cm > 1 and fc_lb_cb < {neg:f})) '
    'and fc_lm_cm*fc_lb_lm<0'
).format(neg=-fco,pos=fco)
mirnalist.query(query2)

Unnamed: 0,p_cm_cb,p_cm_lm,p_cb_lb,p_lm_lb,fc_cm_cb,fc_lm_cm,fc_lb_cb,fc_lb_lm,N_cb,N_cm,N_lb,N_lm,total_counts,tissue_mirna
Hsa-Mir-10-P1a_5p,0.1123321,0.0003522856,1.377377e-08,1.439242e-06,1.287759,1.715797,-2.583571,-5.708308,24,101,24,19,320447.827049,False
Hsa-Mir-10-P2a_5p,0.07346946,0.197132,1.423196e-07,1.653406e-05,1.909509,-1.626276,2.706157,2.304796,24,101,24,19,5056.673341,False
Hsa-Mir-10-P2b_5p,0.2364094,0.9676175,4.416225e-18,6.56269e-15,1.260039,-1.008161,11.02163,8.818529,24,101,24,19,1198.98417,False
Hsa-Mir-10-P2c_5p,0.05423693,0.5658453,0.006317651,0.06301471,-1.364121,1.114067,-1.791083,-1.462747,24,101,24,19,5997.977207,False
Hsa-Mir-130-P1a_3p,0.02119808,0.7711577,1.351304e-11,1.018979e-06,1.260758,-1.033152,2.54393,2.084694,24,101,24,19,1674.359571,False
Hsa-Mir-1307_5p,0.003786464,0.0008687033,0.08050305,0.03167021,-1.547311,1.698105,-1.409428,-1.546759,24,101,24,19,1837.650765,False
Hsa-Mir-1307_3p,0.2720807,0.02861507,0.004260915,0.0002904797,1.131723,1.303514,-1.504429,-2.219344,24,101,24,19,408.799052,False
Hsa-Mir-146-P2_5p,0.01289717,0.01383255,0.4311337,0.2437215,2.192985,-2.515165,1.225983,1.406132,24,101,24,19,13493.836423,False
Hsa-Mir-181-P2c_5p,1.361029e-09,0.145182,4.568812e-07,2.835075e-09,4.300938,1.217964,-1.939446,-10.158735,24,101,24,19,425.058685,False
Hsa-Mir-192-P1_5p,3.45197e-08,0.1915282,6.666351e-05,0.444413,-1.62905,1.145835,-1.536309,-1.08059,24,101,24,19,640180.597836,False
