Import required libraries

In [1]:
import pydot
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.core.display import display, HTML, Image
import qgrid
from statsmodels.stats.weightstats import ztest
from statsmodels.stats.weightstats import ttest_ind
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [2]:
# Load the labelled sequences and keep only the two columns used below.
df = pd.read_csv('../input/mirna.csv', sep=';')[['class', 'seq']]

# Discard the rows at positions 10 and 26, then renumber the remaining rows.
df = df.drop(df.index[[10, 26]]).reset_index(drop=True)

# The first ten remaining rows form the PGM table, the rest the SOLiD table;
# both are indexed by their `class` label.
pgm = df.iloc[0:10].reset_index(drop=True).set_index('class')
solid = df.iloc[10:].reset_index(drop=True).set_index('class')

# Grid options passed to qgrid for both tables.
options = dict(
    fullWidthRows=True,
    syncColumnCellResize=True,
    forceFitColumns=True,
    defaultColumnWidth=300,
    rowHeight=30,
    enableColumnReorder=True,
    enableTextSelectionOnCells=True,
    editable=False,
    autoEdit=True,
)

pgm_grid = qgrid.show_grid(pgm, show_toolbar=False, grid_options=options)
solid_grid = qgrid.show_grid(solid, show_toolbar=True, grid_options=options)

pgm_w = widgets.HBox([pgm_grid])
solid_w = widgets.HBox([solid_grid])

# One tab per table.
tab = widgets.Tab([pgm_w, solid_w])
for position, title in enumerate(['PGM', 'Solid']):
    tab.set_title(position, title)

display(tab)

Building graphs to analyze patterns

In [3]:
def seq_graph_degree_count(df, inplace=False):
    """Count nucleotide transitions in every sequence of ``df``.

    Each sequence in the ``seq`` column is lower-cased and read as a chain of
    directed edges between consecutive characters.  For every row and every
    nucleotide ``n`` in ``a``, ``u``, ``g``, ``c`` three float columns are
    added: ``n_in`` (times ``n`` is the second character of a pair), ``n_out``
    (times it is the first character) and ``n_both`` (their sum).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``seq`` column of strings with at least two characters.
    inplace : bool, default False
        When False the columns are added to a copy; when True ``df`` itself
        is modified.

    Returns
    -------
    (DataFrame, dict)
        The frame carrying the new columns, and ``edges`` where
        ``edges[src][dst]`` is the number of ``src -> dst`` pairs over all
        rows divided by the number of rows.

    Raises
    ------
    AssertionError
        If ``seq`` is not a column or a sequence has fewer than two characters.
    KeyError
        If a sequence contains a character other than a, u, g or c.
    """
    assert ('seq') in df.columns

    if not inplace:
        df = df.copy()

    bases = ('a', 'u', 'g', 'c')

    edges = {}
    per_row_in = {base: [] for base in bases}
    per_row_out = {base: [] for base in bases}

    for sequence in df['seq']:

        nucleotides = sequence.lower()

        assert len(nucleotides) > 1

        row_in = dict.fromkeys(bases, 0)
        row_out = dict.fromkeys(bases, 0)

        # Walk over every pair of consecutive characters.
        for src, dst in zip(nucleotides, nucleotides[1:]):
            row_in[dst] += 1
            row_out[src] += 1

            targets = edges.setdefault(src, {})
            targets[dst] = targets.get(dst, 0) + 1

        for base in bases:
            per_row_in[base].append(row_in[base])
            per_row_out[base].append(row_out[base])

    for base in bases:
        df[base + '_in'] = [float(count) for count in per_row_in[base]]
        df[base + '_out'] = [float(count) for count in per_row_out[base]]
        df[base + '_both'] = [float(n_in + n_out)
                              for n_in, n_out in zip(per_row_in[base], per_row_out[base])]

    # Turn the global pair counts into per-sequence averages.
    n_rows = float(df.shape[0])
    for targets in edges.values():
        for dst in targets:
            targets[dst] /= n_rows

    return df, edges


# Average transition weights, one dict per data set (PGM first, then SOLID).
edges__ = []

labels = ['PGM', 'SOLID']

labelled_frames = []

for label, frame in zip(labels, [pgm, solid]):

    display(HTML('<center><h1>' + label + '</h1></center>'))

    # inplace=True: the degree columns are added to `pgm` / `solid` themselves.
    _, graph_edges = seq_graph_degree_count(frame, inplace=True)
    edges__.append(graph_edges)

    display(qgrid.show_grid(frame))

    frame.to_csv('../output/' + label + '.csv', sep=',', header=True, index=True)

    # The group label is added after the CSV export, so it is not in the file.
    frame['group'] = label
    labelled_frames.append(frame)

# Both platforms stacked into one table.
df_all = pd.concat(labelled_frames)

In [4]:
display(HTML('<center><h1>Hypothesis Test</h1></center>'))

# Degree columns to compare: in / out / both for each nucleotide.
index = [nucleotide + suffix
         for nucleotide in ('a', 'c', 'u', 'g')
         for suffix in ('_in', '_out', '_both')]

# Element [1] of each test result is the p-value.
z_pvalues = [ztest(pgm[column], solid[column])[1] for column in index]
t_pvalues = [ttest_ind(pgm[column], solid[column])[1] for column in index]

pgm_sld_test = pd.DataFrame({'edge_type': index,
                             'ztest': z_pvalues,
                             'ttest': t_pvalues})
pgm_sld_test = pgm_sld_test.set_index('edge_type')

pgm_sld_test.to_csv('../output/pgm_sld_test.csv', sep=',', header=True, index=True)

qgrid.show_grid(pgm_sld_test)


### Reading Full miRNA Database (HSA)

In [5]:
def mirna_to_df(file_path, n_seq=None):
    """Read a two-lines-per-record sequence file into a DataFrame.

    The file is consumed in pairs of lines.  The first line of a pair is the
    header: its first character is dropped and the remainder is split on
    whitespace (expected to yield the id and the accession).  The second line
    is the sequence, stored lower-cased with its line ending removed.

    Parameters
    ----------
    file_path : str
        Path of the file to read.
    n_seq : int or None, default None
        Maximum number of records to read; None reads the whole file.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``id``, ``accession`` and ``seq``.
    """
    columns = ['id', 'accession', 'seq']

    dataset = []

    with open(file_path) as f:

        # Checking the limit before reading makes n_seq=0 return no records.
        while n_seq is None or len(dataset) < n_seq:

            header = f.readline()
            sequence = f.readline()

            # An empty string (not even a newline) means end of file.
            if not sequence:
                break

            # Strip only the line terminator: slicing off the last character
            # would drop a nucleotide when the file has no trailing newline,
            # and would leave a '\r' behind on CRLF files.
            dataset.append(header[1:].split() + [sequence.rstrip('\r\n').lower()])

    return pd.DataFrame(dataset, columns=columns)

# Parse the full HSA file and index the records by miRNA id.
hsa_records = mirna_to_df('../input/hsa.mirna')
mirna_all = hsa_records.set_index('id')

hsa_grid = qgrid.show_grid(mirna_all, grid_options={'editable': True})
display(hsa_grid)
            

### Reading subset for comparison

In [6]:
sequences = []

df = pd.read_csv('../input/mirnas_cut_off.csv')

for index, row in df.iterrows():

    # An entry may name several miRNAs separated by '/'; the sequences of
    # those found in `mirna_all` are concatenated in the listed order.
    found = []

    for _id in row['id'].split('/'):
        try:
            found.append(mirna_all.loc[_id, 'seq'])
        except KeyError:
            # Only a missing id is expected here; it is reported and skipped.
            # Any other error is allowed to surface instead of being hidden.
            print(_id)

    sss = ''.join(found)

    # Entries for which no id could be resolved are left out entirely.
    if sss != '':
        sequences.append([row['id'], sss])

cutoff = pd.DataFrame(sequences, columns=['id', 'seq']).set_index('id')
cutoff['seq'] = cutoff['seq'].str.lower()

display(qgrid.show_grid(cutoff, grid_options={'editable': True}))

### Comparing PGM and SOLID with Cutoff

In [7]:
# Degree columns are added to a copy, so `cutoff` itself stays unchanged.
cutoff_with_degree, _ = seq_graph_degree_count(cutoff, inplace=False)

cutoff_grid = qgrid.show_grid(cutoff_with_degree)
display(cutoff_grid)

In [8]:
# Degree columns to compare: in / out / both for each nucleotide.
index = [nucleotide + suffix
         for nucleotide in ('a', 'c', 'u', 'g')
         for suffix in ('_in', '_out', '_both')]

# One table of p-values per platform (PGM first, then SOLiD), each compared
# against the cutoff subset.
for df in [pgm, solid]:

    z_pvalues = [ztest(cutoff_with_degree[column], df[column])[1] for column in index]
    t_pvalues = [ttest_ind(cutoff_with_degree[column], df[column])[1] for column in index]

    df_test = pd.DataFrame({'edge_type': index,
                            'ztest': z_pvalues,
                            'ttest': t_pvalues})
    df_test = df_test.set_index('edge_type')

    display(qgrid.show_grid(df_test))

### Comparing PGM and SLD with Cutoff (disjoint)

In [9]:
index = ['a_in', 'a_out', 'a_both', 'c_in', 'c_out', 'c_both', 'u_in', 'u_out',
         'u_both', 'g_in', 'g_out', 'g_both']

# Lower-cased view of the cutoff sequences, used only for the overlap check.
# `cutoff_with_degree` itself is no longer rewritten here: its sequences were
# already lower-cased when `cutoff` was built.
cutoff_seq = cutoff_with_degree['seq'].str.lower()

for df in [pgm, solid]:

    # Compare case-insensitively on both sides so a sequence shared with the
    # platform is removed whatever case the platform file uses.
    # NOTE(review): assumes the platform `seq` column holds plain strings.
    platform_seq = df['seq'].str.lower()

    # Keep only the cutoff rows whose sequence does not occur in the platform.
    cutoff_with_degree_disjoint = cutoff_with_degree[~cutoff_seq.isin(platform_seq.values)]

    # Same layout as the other test tables in this notebook: one row per
    # degree column, indexed by `edge_type`.
    df_test = pd.DataFrame({'edge_type': index,
                            'ztest': [ztest(cutoff_with_degree_disjoint[c], df[c])[1] for c in index],
                            'ttest': [ttest_ind(cutoff_with_degree_disjoint[c], df[c])[1] for c in index]}) \
        .set_index('edge_type')

    display(qgrid.show_grid(df_test))

### Comparing PGM and SLD with All miRNA (hsa)

In [10]:
# Degree columns for the full HSA set; its edge weights become the third
# entry of `edges__`, after the two platform entries.
mirna_all_with_degree, e_all = seq_graph_degree_count(mirna_all, inplace=False)
edges__.append(e_all)

# Drop the accession, tag the rows as 'miRNA' and discard the id index so the
# table has the same columns as the platform tables.
mirna_all_with_degree = (
    mirna_all_with_degree
    .drop('accession', axis=1)
    .assign(group='miRNA')
    .reset_index(drop=True)
)

# Platforms and HSA stacked, renumbered from zero.
all__ = pd.concat([df_all, mirna_all_with_degree]).reset_index(drop=True)

edge_types = [p + s for p in ['a', 'c', 'g', 'u'] for s in ['_in', '_out', '_both']]

# Long format: one row per (sequence, degree column).  `.items()` replaces
# `.iteritems()`, which was removed in pandas 2.0, and the pieces are
# concatenated once instead of growing the frame inside the loop.
long_parts = [
    pd.DataFrame({'degree': column, 'edge_type': name, 'group': all__['group']})
    for name, column in all__[edge_types].items()
]

# reset_index() keeps the row number within `all__` as an `index` column.
d2 = pd.concat(long_parts).reset_index()

display(qgrid.show_grid(d2))

d2.to_csv('../output/pgm_solid_mirna_table.csv', sep=',', header=True, index=False)

In [11]:
# Degree columns to compare: in / out / both for each nucleotide.
index = [nucleotide + suffix
         for nucleotide in ('a', 'c', 'u', 'g')
         for suffix in ('_in', '_out', '_both')]

pgm_sld = None

labels = ['PGM', 'SOLiD']

for i, df in enumerate([pgm, solid]):

    display(HTML('<center><h2>' + labels[i] + ' and HSA</h2></center>'))

    # One small table of p-values per alternative hypothesis.
    partial_tables = []

    for alternative in ['two-sided', 'larger', 'smaller']:

        z_pvalues = [ztest(mirna_all_with_degree[c], df[c], alternative=alternative)[1]
                     for c in index]
        t_pvalues = [ttest_ind(mirna_all_with_degree[c], df[c], alternative=alternative)[1]
                     for c in index]

        partial_tables.append(pd.DataFrame({'edge_type': index,
                                            'ztest_' + alternative: z_pvalues,
                                            'ttest_'+ alternative: t_pvalues}))

    # Join the per-alternative tables side by side on the degree column name.
    df_test = partial_tables[0]
    for extra in partial_tables[1:]:
        df_test = df_test.merge(extra, how='inner', on='edge_type')

    csv_path = '../output/' + labels[i].lower() + '_mir_test.csv'
    df_test.to_csv(csv_path, sep=',', header=True, index=True)

    display(qgrid.show_grid(df_test.set_index('edge_type')))

    

### Plotting Graph (PGM, SLD, HSA)

In [18]:
# Number of the palette used for the graphs below.
color_option = 4

# Palettes by option number.  The plotting code indexes the chosen list with
# a value that grows with the node / edge weight.
palettes = {
    1: list(reversed(['#BE0E32', '#B7170D', '#B0410C', '#A9690C',
                      '#A38D0B', '#8A9C0B', '#5F950A', '#388F0A',
                      '#138809', '#098120'])),
    2: ['#FFFF1F', '#F9E71B', '#F3CF18', '#EEB815', '#E8A012',
        '#E3890F', '#DD710C', '#D75909', '#D24206', '#CC2A03',
        '#C71300'],
    3: ['#0025CC', '#0E21B7', '#1C1DA3', '#2A198E',
        '#39167A', '#471266', '#550E51', '#640B3D',
        '#720728', '#800314', '#8F0000'],
    4: ['#1AA40D', '#36A30E', '#53A20E', '#6FA20E',
        '#8BA10E', '#A19B0F', '#A07E0F', '#9F620F',
        '#9F460F', '#9E2B0F', '#9E0F0F'],
    5: ['#72E368', '#82E15E', '#96E055', '#ACDE4C',
        '#C5DD43', '#DBD43B', '#DAB332', '#D88E29',
        '#D76721', '#D53D19', '#D41111'],
    6: ['#C4C4C4', '#C2AEAE', '#C09A9A', '#BE8585', '#BD7171',
        '#BB5D5D', '#B94A4A', '#B83737', '#B62424', '#B41212',
        '#B20000'],
    # This palette used to sit behind a second `color_option == 2` test and
    # could never be selected.  It now has the first unused number, so the
    # palettes selected by options 1-6 are unchanged.
    7: ['#9CE77D', '#A1DE6D', '#A9D65D', '#B4CD4F',
        '#C1C541', '#BCA934', '#B48828', '#AB661D',
        '#A34413', '#9A230A', '#910202'],
}

# Used for any option number that is not listed above.
default_palette = ['#000000', '#111010', '#231C1C', '#352525',
                   '#472A2A', '#592C2C', '#6B2A2A', '#7D2525',
                   '#8F1C1C', '#A11010', '#B20000']

colors = palettes.get(color_option, default_palette)

c = []

min_max = []
nodes_weights = []

# First pass: per data set, the smallest / largest edge weight and the total
# weight of the edges touching each nucleotide.
for edges in edges__:

    values__ = [edges[src][dst] for src in edges for dst in edges[src]]

    min_max.append((min(values__), max(values__)))

    nw = {'a': 0, 'c': 0, 'u': 0, 'g': 0}

    for src in edges:
        # Walk the targets of *this* source.  The inner loop used to read
        # `edges[i]`, where `i` was a leftover variable, so the weights were
        # accumulated over the wrong targets (or the lookup failed).
        for dst in edges[src]:
            nw[src] += edges[src][dst]
            nw[dst] += edges[src][dst]

    nodes_weights.append(nw)

# Shared scales, so colours and widths are comparable between the graphs.
edge_degree_min = min([bounds[0] for bounds in min_max])
edge_degree_max = max([bounds[1] for bounds in min_max])

node_degree_min = min([min(weights.values()) for weights in nodes_weights])
node_degree_max = max([max(weights.values()) for weights in nodes_weights])

edge_span = edge_degree_max - edge_degree_min
node_span = node_degree_max - node_degree_min

# Highest valid palette index.  Scaling by this instead of a fixed 10 keeps
# the lookup in range for palettes that do not have exactly 11 colours.
last_color = len(colors) - 1

for index, e in enumerate(edges__):

    graph = pydot.Dot(graph_type='digraph', layout='dot', pad=.25,
                      nodesep=.8, ranksep=.8, splines='spline',
                      size='6.00cm,4.00cm')

    nodes = {}

    # Sources first, in their original order, followed by any nucleotide that
    # only ever appears as a target, so every edge end has a node.
    letters = list(e)
    for targets in e.values():
        for dst in targets:
            if dst not in letters:
                letters.append(dst)

    # creating nodes
    for i in letters:

        node_weight = nodes_weights[index][i]

        # A zero span means every node has the same weight: use the first colour.
        if node_span:
            color_index = int(last_color * float(node_weight - node_degree_min) / node_span)
        else:
            color_index = 0

        node = pydot.Node(i, style='filled', color='black', fillcolor=colors[color_index],
                          fontname='ubuntu', shape='circle', penwidth=2.5, fontcolor='white', fontsize=16)
        nodes[i] = node
        graph.add_node(node)

    # creating edges
    for i in e:
        for j in e[i]:
            # Position of this edge between the lightest (0) and heaviest (1)
            # edge; when all edges weigh the same they are drawn at full width.
            if edge_span:
                width = float(e[i][j] - edge_degree_min) / edge_span
            else:
                width = 1.
            edge = pydot.Edge(nodes[i], nodes[j], penwidth=3. * width,
                              color=colors[int(last_color * width)])
            graph.add_edge(edge)

    file_ = 'graph' + str(index) + '.png'
    graph.write_png(file_)

    c.append(Image(filename=file_, height=250))

HTML('<center><table><tr><td><img src="graph0.png"/></td><td>'
     '<img src="graph1.png"/></td><td><img src="graph2.png"/></td></tr></table></center>')

In [46]:
seq = 'UCUGAGGUGGAACAGCAGC'.lower()

_, e = seq_graph_degree_count(pd.DataFrame({'seq': [seq]}))

values__ = [e[i][j] for i in e for j in e[i]]

min_ = min(values__)
max_ = max(values__)
example_edge_span = max_ - min_

# Node weights of the example itself: the total weight of the edges touching
# each nucleotide.  The nodes used to be coloured from `nodes_weights[index]`,
# i.e. from whichever data set the previous cell's loop happened to end on.
example_weights = {}
for i in e:
    for j in e[i]:
        example_weights[i] = example_weights.get(i, 0) + e[i][j]
        example_weights[j] = example_weights.get(j, 0) + e[i][j]

example_node_min = min(example_weights.values())
example_node_span = max(example_weights.values()) - example_node_min

# `colors` is the palette chosen in the plotting cell above.
last_color = len(colors) - 1

graph = pydot.Dot(graph_type='digraph', layout='dot', pad=.25,
                  nodesep=.8, ranksep=.8, splines='spline',
                  size='6.00cm,4.00cm')

nodes = {}

# Sources first, in their original order, followed by any nucleotide that only
# appears as a target (e.g. a final character seen nowhere else).
letters = list(e)
for targets in e.values():
    for dst in targets:
        if dst not in letters:
            letters.append(dst)

# creating nodes
for i in letters:

    node_weight = example_weights[i]

    # A zero span means every node has the same weight: use the first colour.
    if example_node_span:
        color_index = int(last_color * float(node_weight - example_node_min) / example_node_span)
    else:
        color_index = 0

    node = pydot.Node(i, style='filled', color='black', fillcolor=colors[color_index],
                      fontname='ubuntu', shape='circle', penwidth=2.5, fontcolor='white', fontsize=16)
    nodes[i] = node
    graph.add_node(node)

# creating edges
for i in e:
    for j in e[i]:
        # Position of this edge between the lightest (0) and heaviest (1) edge;
        # when all edges weigh the same they are drawn at full width.
        if example_edge_span:
            width = float(e[i][j] - min_) / example_edge_span
        else:
            width = 1.
        # Floors keep the lightest edge visible: width at least .5 and never
        # the first palette colour.
        edge_color = colors[min(last_color, max(1, int(last_color * width)))]
        edge = pydot.Edge(nodes[i], nodes[j], penwidth=max(.5, 3. * width), color=edge_color)
        graph.add_edge(edge)

file_ = 'graph_example.png'
graph.write_png(file_)

HTML('<center><img src="graph_example.png"/></center>')

### Window Sliding

In [13]:
def _count_all_u_windows(sequence, size):
    """Count the overlapping windows of length `size` that consist only of 'u'."""
    target = 'u' * size
    return sum(sequence[start:start + size] == target
               for start in range(len(sequence) - size + 1))


# Add the number of 'uu' (window of size 2) and 'uuu' (window of size 3)
# occurrences of every sequence to both platform tables.
for frame in [pgm, solid]:
    frame_sequences = list(frame['seq'])
    frame['counts_uu'] = [_count_all_u_windows(s, 2) for s in frame_sequences]
    frame['counts_uuu'] = [_count_all_u_windows(s, 3) for s in frame_sequences]

columns = ['seq', 'counts_uu', 'counts_uuu']

for frame in [pgm, solid]:
    display(qgrid.show_grid(frame[columns]))

### Z-Test

In [14]:
# z-test p-value (element [1] of the result), PGM vs SOLiD, per window size.
for count_column in ('counts_uu', 'counts_uuu'):
    p_value = ztest(pgm[count_column], solid[count_column])[1]
    print('p-value : ' + str(p_value))

p-value : 0.0562756453855
p-value : 0.109598583399


### T-Test

In [15]:
# t-test p-value (element [1] of the result), PGM vs SOLiD, per window size.
for count_column in ('counts_uu', 'counts_uuu'):
    p_value = ttest_ind(pgm[count_column], solid[count_column])[1]
    print('p-value : ' + str(p_value))

p-value : 0.0688308204309
p-value : 0.123246332886
