# Create an interactive Parallel Plot
To demonstrate the use of the interactive parallel plot, we use a project already loaded into the CKG database.

In [30]:
import pandas as pd
from report_manager import project, dataset, report
from analytics_core.viz import viz as plots
import networkx as nx
from networkx.readwrite import json_graph
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from scipy.stats import zscore
init_notebook_mode(connected=True)
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import logging
import warnings

# Keep the demo output clean: ignore every Python warning ...
warnings.simplefilter('ignore')
# ... and raise the root logger threshold so only CRITICAL records get through.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

#### We create a new project object and load the respective data and report

In [31]:
# Identifier of the project already stored in the database.
PROJECT_ID = 'P0000001'

# Start from an empty project shell, then fill in its data and its report.
my_project = project.Project(identifier=PROJECT_ID, datasets={}, report={})
my_project.load_project_data()
my_project.load_project_report()

#### We can now access all the results for each data type

In [5]:
# Show the names of the datasets loaded for this project.
my_project.list_datasets()

dict_keys(['multiomics', 'clinical', 'proteomics'])

#### We will use the results from the proteomics analyses. We access the dataset 'proteomics' for further analysis

In [6]:
# Keep a handle on the proteomics dataset for the analysis that follows.
proteomics_dataset = my_project.get_dataset('proteomics')

#### The available analyses for this dataset are:

In [7]:
# Reuse the dataset handle assigned in the previous cell instead of looking
# the dataset up again, and list the names of the dataframes it holds.
proteomics_dataset.list_dataframes()

['correlation_correlation',
 'go annotation',
 'go_enrichment_Biological_processes_regulation_enrichment',
 'number of modified proteins',
 'number of peptides',
 'number of proteins',
 'original',
 'overview statistics_summary',
 'pathway annotation',
 'pathway_enrichment_Pathways_regulation_enrichment',
 'processed',
 'protein biomarkers',
 'regulated',
 'regulation table']

#### We can access the different dataframes like this:

In [8]:
# Fetch one dataframe by name from the dataset handle assigned earlier.
# Only the first rows are displayed so the cell output stays small.
proteomics_dataset.get_dataframe('go annotation').head(10)

Unnamed: 0,annotation,group,identifier,source
0,mitochondrial genome maintenance,,TYMP~P19971,UniProt
1,maltose metabolic process,,GAA~P10253,UniProt
2,maltose metabolic process,,MGAM~O43451,UniProt
3,ribosomal large subunit assembly,,RPL6~Q02878,UniProt
4,ribosomal large subunit assembly,,RPL11~P62913,UniProt
5,ribosomal large subunit assembly,,RPLP0~P05388,UniProt
6,ribosomal large subunit assembly,,RPL3~P39023,UniProt
7,ribosomal small subunit assembly,,RPS14~P62263,UniProt
8,ribosomal small subunit assembly,,RPS5~P46782,UniProt
9,ribosomal small subunit assembly,,RPS19~P39019,UniProt


#### In this case, we will use the processed dataframe with transformed and imputed LFQ intensities. We then normalize the data using the Z-score.

In [9]:
# Fetch the proteomics dataset and pull out its 'processed' dataframe.
proteomics_dataset = my_project.get_dataset('proteomics')
processed_df = proteomics_dataset.get_dataframe('processed')

In [10]:
# Preview the first rows: protein columns plus 'group', 'sample' and 'subject'.
processed_df.head()

Unnamed: 0,A2M~P01023,A30~A2MYE2,ABI3BP~Q7Z7G0,ACE~P12821,ACTB~P60709,ACTN1~P12814,ADA2~Q9NZK5,ADAMTS13~Q76LX8,ADAMTSL4~Q6UY14,ADH4~P08319,...,VIM~P08670,VK3~A2N2F4,VNN1~O95497,VTN~P04004,VWF~P04275,YWHAZ~P63104,group,sample,scFv~Q65ZC9,subject
0,38.005564,28.173504,21.540133,22.16542,27.09033,25.039968,23.442151,24.010605,25.08582,23.389032,...,24.178889,25.835908,22.480055,32.815815,28.922779,19.198487,Cirrhosis,AS68,27.788928,S68
1,37.309118,27.981907,27.342062,23.84727,27.461155,25.896268,23.754503,24.135818,19.18359,22.148706,...,23.709777,25.004889,23.852908,32.722121,29.881279,22.141285,Cirrhosis,AS69,26.869972,S69
2,37.384952,28.857627,20.123246,22.86363,27.929764,24.295225,23.359443,24.121788,24.923476,23.017163,...,23.599064,26.27165,24.232132,32.755752,29.444625,18.86937,Cirrhosis,AS70,28.069328,S70
3,38.417225,28.97838,25.50191,22.992774,27.152479,25.231288,23.70134,24.568309,24.878802,26.388112,...,24.179076,25.9292,24.269047,32.714014,29.397176,22.216971,Cirrhosis,AS71,28.170209,S71
4,37.471303,28.748744,21.906161,22.203381,27.537048,22.392992,22.406264,24.961173,21.995507,24.33954,...,23.865224,26.70134,20.755495,32.722691,28.540895,20.880564,Cirrhosis,AS72,28.61228,S72


In [11]:
# Standardize each protein column with a z-score, keeping 'group' as the only
# label column.
# errors='ignore' keeps this cell re-runnable: it overwrites processed_df, so
# on a second execution 'sample' and 'subject' are already gone and a plain
# drop would raise KeyError.
processed_df = (
    processed_df
    .drop(['sample', 'subject'], axis=1, errors='ignore')
    .set_index('group')   # move the label out of the way so only numeric columns are scored
    .apply(zscore)        # column-wise: each protein is standardized across samples
    .reset_index()        # bring 'group' back as a regular column
)

#### In order to find clusters of proteins, we access the report and the protein-protein correlation network as a dictionary.

In [12]:
# Reuse the dataset handle assigned earlier rather than looking the dataset
# up again, then list the keys of the plots stored in its report.
proteomics_report = proteomics_dataset.report
proteomics_report.list_plots()

dict_keys(['0_date', '0~proteomics_pipeline~cytoscape_network', '10~coefficient_variation_coefficient_of_variation~scatterplot_matrix', '11~stratification_description~description', '12~stratification_pca~pca', '13~regulation_description~description', '14~regulation_samr~basicTable', '15~regulation_samr~volcanoplot', '16~correlation_correlation~network', '17~interaction_network~network', '18~drug_associations~basicTable', '19~disease_associations~basicTable', '1~overview statistics_summary~multiTable', '20~literature_associations_publications_abstracts~basicTable', '21~literature_associations_publications_abstracts~wordcloud', '22~go_enrichment_Biological_processes_regulation_enrichment~basicTable', '23~pathway_enrichment_Pathways_regulation_enrichment~basicTable', '2~peptides~barplot', '3~peptides~basicTable', '4~proteins~barplot', '5~proteins~basicTable', '6~modifications~facetplot', '7~modifications~basicTable', '8~ranking_ranking_with_markers~ranking', '9~ranking_ranking_with_marker

In [13]:
# Report key of the protein-protein correlation network plot.
correlation_plot_key = '16~correlation_correlation~network'
# get_plot returns an indexable result; the network dictionary is its first entry.
correlation_net_dict = proteomics_report.get_plot(correlation_plot_key)[0]

#### To convert the dictionary into a network, we access the json version within the dictionary and convert it using the networkX package.

In [14]:
# Rebuild a networkx graph from the node-link data stored under 'net_json'.
correlation_net = json_graph.node_link_graph(correlation_net_dict['net_json'])

#### Now that we have a network with proteins colored by cluster, we can convert this information into a dataframe to be used in this Jupyter Notebook.

In [15]:
# One row per node: column 0 holds the node id, column 1 its attribute dict.
node_records = pd.DataFrame.from_dict(correlation_net.nodes(data=True))
node_ids = node_records[0].to_frame()
# Expand every attribute dict into one column per attribute.
node_attributes = node_records[1].apply(pd.Series)
correlation_df = node_ids.join(node_attributes)

In [16]:
# Rename the columns by position: the first is the node id, the other four are
# the node attributes.
# NOTE(review): assumes the attributes come out in the order degree, radius,
# color, cluster — confirm against the network JSON.
correlation_df.columns = ['identifier', 'degree', 'radius', 'color', 'cluster']

#### Since the correlation network was generated using a cut-off, not all the proteins in the processed dataframe are part of a cluster. Therefore, we filter the processed dataframe and keep only the proteins that are present in the correlation clusters.

In [17]:
# Global value range of the z-scored intensities, rounded to whole numbers and
# taken over every numeric column *before* the frame is filtered below.
# The public select_dtypes replaces the private DataFrame._get_numeric_data().
# NOTE(review): min_val / max_val are not used in the cells shown here —
# confirm whether anything else still needs them.
numeric_values = processed_df.select_dtypes(include=['number'])
min_val = numeric_values.min().min().round()
max_val = numeric_values.max().max().round()

# Keep only the proteins that are nodes of the correlation network, plus the
# 'group' label required by the plot.
processed_df = processed_df[list(correlation_df.identifier) + ['group']]

#### Ready! To build the parallel plot, we create a dictionary with the clusters and their respective colors, and filter the processed dataframe to include only the proteins in a specific cluster.
Using the Jupyter Widgets **interact** function, we can make the plot interactive and allow the visualization of a cluster selected by the user.

In [27]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

In [28]:
@interact
def plot_parallel_plot(cluster=correlation_df.cluster.unique()):
    """Draw the parallel plot for one cluster of the correlation network.

    The function is registered with ipywidgets' ``interact``, which builds a
    selection widget from the unique cluster values and calls the function
    with the value chosen by the user.

    Parameters
    ----------
    cluster
        One of the values found in ``correlation_df.cluster``; selects which
        cluster's proteins are plotted.

    The comma-separated identifiers of the cluster's proteins are displayed
    first, followed by the plotly figure. Nothing is returned.
    """
    # Proteins that belong to the selected cluster.
    members = correlation_df.groupby('cluster').get_group(cluster)['identifier'].tolist()

    # Cluster -> color lookup, so the plot uses the same color as the network.
    color_by_cluster = dict(zip(correlation_df.cluster, correlation_df.color))

    plot_args = {
        'color': color_by_cluster[cluster],
        'group': 'group',
        'title': "Parallel plot cluster: {}".format(cluster),
        # processed_df was already z-scored in an earlier cell — presumably
        # this stops get_parallel_plot from normalizing again; TODO confirm.
        'zscore': False,
    }

    # Restrict the data to the cluster's proteins, keeping 'group' as a column.
    cluster_df = processed_df.set_index('group')[members].reset_index()
    figure = plots.get_parallel_plot(cluster_df, identifier=cluster, args=plot_args)

    display(HTML("<p>{}</p>".format(",".join(members))))
    iplot(figure.figure)

interactive(children=(Dropdown(description='cluster', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…