In [1]:
%load_ext autoreload
%autoreload 2

In [111]:
import pandas as pd
from itertools import combinations, chain

from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.cluster import AgglomerativeClustering

from pkdb_analysis import PKDB, PKData
#my utils
from circos_utils import substance_cooccurrence_matrix \
    ,study_expand, bubbles_data, find_label, add_label_and_type, create_config_files, substance_combinations \
    ,create_ideogram, create_links

import matplotlib
matplotlib.use("TkAgg")


## Configuration for Circular Plots

This notebook creates the configuration files for the circular plots.

Tasks performed by the notebook:

 1. Classifies substances by co-occurrence in studies. Derived substances are replaced but their parents already by the database. Label for a class is chosen to be most frequent substance within a class.
 2. Classifies studies into the same classes as substances. The most frequent class of substances used in a study is chosen to be the class of the study.
 3. Builds configuration files for circus plots for:
     1. Circular study plot with bars .  &rightarrow;  /circos/study/
     2. Circular study plot with bubbles.   &rightarrow;  /circos/study1/
     3. Circular plots with subset of study plots for each substance class. &rightarrow;  /circos/substance1/
     4. Circular substance plots with co-occurrence links and bubbles. &rightarrow;  /circos/study_caffeine/ ...
     

## 1. Classifies Substances and Studies 

In [3]:
# load data
pkdata = PKDB.query()

INFO *** Querying data ***
INFO http://0.0.0.0:8000/api/v1/studies/?format=json&page_size=2000
INFO http://0.0.0.0:8000/api/v1/studies/?format=json&page_size=2000&page=1
INFO http://0.0.0.0:8000/api/v1/interventions_analysis/?format=json&page_size=2000&normed=true
INFO http://0.0.0.0:8000/api/v1/interventions_analysis/?format=json&page_size=2000&normed=true&page=1
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=1
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=2
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=3
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=4
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=5
INFO http://0.0.0.0:8000/api/v1/individuals_analysis/?format=json&page_size=2000&page=6
INFO http://0.0.0.0:8000

In [230]:
substance_coocurance = pd.DataFrame([],columns=["substance1", "substance2", "study"])
for r, study in pkdata.studies.iterrows():
    substance_c = combinations(study["substances"], 2)
    substance_c = pd.DataFrame(substance_c, columns=["substance1", "substance2"])
    substance_c["study"] = study.sid
    substance_coocurance = substance_coocurance.append(substance_c)
    


In [231]:
from pysankey import sankey


In [232]:
substance_coocurance

Unnamed: 0,substance1,substance2,study
0,paracetamol glucuronide,paracetamol mercapturate,PKDB00258
1,paracetamol glucuronide,paracetamol cysteine,PKDB00258
2,paracetamol glucuronide,paracetamol,PKDB00258
3,paracetamol glucuronide,paracetamol sulfate,PKDB00258
4,paracetamol glucuronide,chloroquine,PKDB00258
...,...,...,...
23,atorvastatin lactone,atorvastatin acid,10460065
24,atorvastatin lactone,2-hydroxyatorvastatin acid,10460065
25,atorvastatin,atorvastatin acid,10460065
26,atorvastatin,2-hydroxyatorvastatin acid,10460065


In [233]:
import numpy as np
s1_sorted = np.sort(substance_coocurance.substance1.unique())
s2_sorted = np.sort(substance_coocurance.substance2.unique())

In [234]:
substances_combinations2 = pd.DataFrame()
substance_coocurance["occurance"] = 1
substances_combinations2["substance1"] = substance_coocurance["substance2"]
substances_combinations2["substance2"] = substance_coocurance["substance1"]
substances_combinations2["occurance"] = substance_coocurance["occurance"]
com = substances_combinations2.append(substance_coocurance, ignore_index=True)
coocurence_matrix = com.pivot_table(index="substance1", columns="substance2", aggfunc="sum").fillna(0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [235]:
tips = sns.load_dataset("tips")


In [236]:
pkdata.outputs

Unnamed: 0,output_pk,intervention_pk,study_sid,study_name,group_pk,individual_pk,normed,calculated,tissue,time,...,substance,value,mean,median,min,max,sd,se,cv,unit
0,1913,0,PKDB00198,Abernethy1982,18,-1,True,False,plasma,,...,paracetamol,,2.550000,,1.9900,3.4700,,,,hour
1,1914,0,PKDB00198,Abernethy1982,18,-1,True,False,plasma,,...,paracetamol,,108.500000,,62.2000,151.4000,,,,liter
2,1915,0,PKDB00198,Abernethy1982,18,-1,True,False,plasma,,...,paracetamol,,0.810000,,0.5300,1.3100,,,,liter / kilogram
3,1916,0,PKDB00198,Abernethy1982,18,-1,True,False,plasma,,...,paracetamol,,29.040000,,19.4400,38.7600,,,,liter / hour
4,1917,0,PKDB00198,Abernethy1982,18,-1,True,False,plasma,,...,paracetamol,,0.224400,,0.1452,0.3156,,,,liter / hour / kilogram
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28205,58548,787,11112089,Whitfield2000,891,-1,True,True,plasma,,...,atorvastatin,,0.000005,,,,,,,gram / liter
28206,58549,787,11112089,Whitfield2000,891,-1,True,True,plasma,,...,atorvastatin,,0.000567,,,,,,,1 / minute
28207,58550,787,11112089,Whitfield2000,891,-1,True,True,plasma,,...,atorvastatin,,20.380343,,,,,,,hour
28208,58551,787,11112089,Whitfield2000,891,-1,True,True,plasma,,...,atorvastatin,,1811.538512,,,,,,,liter


In [237]:
g = sns.FacetGrid(pkdata.outputs, row="study_sid", margin_titles=True, height=1)
#g.map(sns.regplot, "size", "total_bill", color=".3", fit_reg=False, x_jitter=.1);

  fig, axes = plt.subplots(nrow, ncol, **kwargs)


In [2]:
df = pd.DataFrame([])
df["outputs"] = pkdata.outputs.groupby("study_name").count()["output_pk"]
df["timecourses"] = pkdata.timecourses.groupby("study_name").count()["timecourse_pk"]
df["interventions"] = pkdata.interventions.groupby("study_name").count()["intervention_pk"]
df["individuals"] = pkdata.individuals.groupby("study_name").count()["individual_pk"]
df["groups"] = pkdata.groups.groupby("study_name").count()["group_pk"]

NameError: name 'pd' is not defined

In [1]:
df

NameError: name 'df' is not defined

In [239]:
df = df.fillna(0).astype(int)

In [240]:
from sklearn.preprocessing import MinMaxScaler

In [145]:
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

In [148]:
f, ax = plt.subplots(figsize=(9, 80))

cm = sns.heatmap(df_scaled, annot=df, linewidths=.9,fmt=".0f", cmap="binary", ax=ax, cbar=False)
f.savefig("test.png",bbox_inches = 'tight')


  """Entry point for launching an IPython kernel.


In [241]:
norm_coocurence_matrix = Normalizer(norm="l2").fit_transform(coocurence_matrix)
norm_coocurence_matrix = pd.DataFrame(norm_coocurence_matrix, index=coocurence_matrix.index, columns=coocurence_matrix.columns)


In [242]:
json_substances = pd.DataFrame(coocurence_matrix.apply(sum, axis=1)).reset_index().rename(columns={"substance1":"name", 0:"n"})

In [243]:
json_substances["id"] = json_substances["name"]


In [244]:
links  = pd.DataFrame()

In [245]:
map_s = pd.Series(coocurence_matrix.index).reset_index().set_index("substance1")

In [246]:
com["s1"] = com["substance1"].apply(lambda x: map_s.loc[x])
com["s2"] = com["substance2"].apply(lambda x: map_s.loc[x])

In [247]:
links["source"] = com["s1"]

links["target"] = com["s2"]
links["value"] = com["occurance"]


In [248]:
links

Unnamed: 0,source,target,value
0,126,124,1
1,123,124,1
2,122,124,1
3,127,124,1
4,51,124,1
...,...,...,...
4933,46,45,1
4934,46,14,1
4935,44,45,1
4936,44,14,1


In [185]:
substance_coocurance

Unnamed: 0,substance1,substance2,study,occurance
0,paracetamol glucuronide,paracetamol mercapturate,PKDB00258,1
1,paracetamol glucuronide,paracetamol cysteine,PKDB00258,1
2,paracetamol glucuronide,paracetamol,PKDB00258,1
3,paracetamol glucuronide,paracetamol sulfate,PKDB00258,1
4,paracetamol glucuronide,chloroquine,PKDB00258,1
...,...,...,...,...
23,atorvastatin lactone,atorvastatin acid,10460065,1
24,atorvastatin lactone,2-hydroxyatorvastatin acid,10460065,1
25,atorvastatin,atorvastatin acid,10460065,1
26,atorvastatin,2-hydroxyatorvastatin acid,10460065,1


In [159]:
import json
pd.DataFrame(coocurence_matrix.apply(sum))#.to_json(orient="records")

coocurence_matrix.apply(sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,substance2,Unnamed: 2_level_1
occurance,1-hydroxymidazolam,65.0
occurance,1-hydroxymidazolam glucuronide,4.0
occurance,137MU,11.0
occurance,13C-co2,8.0
occurance,13C-methacetin,8.0
occurance,...,...
occurance,triazolam,4.0
occurance,troleandomycin,12.0
occurance,venlafaxine,7.0
occurance,warfarin,9.0


In [30]:
classification_model = AgglomerativeClustering(linkage="single",n_clusters=7)
classes = classification_model.fit_predict(norm_coocurence_matrix.T.corr())

In [264]:
pd.Series(classes, index=norm_coocurence_matrix.index)

substance1
1-hydroxymidazolam                2
1-hydroxymidazolam glucuronide    2
137MU                             6
13C-co2                           4
13C-methacetin                    4
                                 ..
triazolam                         2
troleandomycin                    2
venlafaxine                       6
warfarin                          2
watercress                        1
Length: 168, dtype: int64

In [58]:
import seaborn as sns
iter_c = iter(sns.color_palette())

In [59]:
colorDict = {i: next(iter_c) for i in range(0,7)}

In [60]:
colorDict

{0: (0.12156862745098039, 0.4666666666666667, 0.7058823529411765),
 1: (1.0, 0.4980392156862745, 0.054901960784313725),
 2: (0.17254901960784313, 0.6274509803921569, 0.17254901960784313),
 3: (0.8392156862745098, 0.15294117647058825, 0.1568627450980392),
 4: (0.5803921568627451, 0.403921568627451, 0.7411764705882353),
 5: (0.5490196078431373, 0.33725490196078434, 0.29411764705882354),
 6: (0.8901960784313725, 0.4666666666666667, 0.7607843137254902)}

In [61]:
classes

array([2, 2, 6, 4, 4, 6, 2, 6, 6, 6, 6, 6, 6, 0, 3, 3, 0, 0, 6, 0, 1, 2,
       0, 2, 6, 6, 6, 6, 6, 6, 3, 5, 5, 5, 6, 3, 2, 5, 5, 5, 5, 1, 2, 1,
       3, 3, 3, 1, 5, 6, 6, 1, 6, 1, 6, 1, 2, 2, 0, 0, 0, 0, 5, 1, 0, 1,
       0, 0, 0, 2, 6, 6, 5, 2, 1, 6, 5, 2, 1, 6, 6, 5, 5, 6, 2, 5, 1, 0,
       1, 6, 5, 1, 2, 2, 1, 5, 6, 1, 6, 2, 2, 1, 0, 1, 2, 2, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 5, 6, 0, 0, 6, 2, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1,
       0, 6, 1, 2, 1, 1, 1, 1, 0, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 5, 1, 0,
       0, 1, 1, 0, 0, 1, 6, 6, 2, 2, 2, 6, 2, 1])

In [62]:
# Classfied substances into types
substance_to_type = pd.DataFrame(classes, index=norm_coocurence_matrix.index, columns=["type"])#.to_dict()["type"]

#studies.data, substances.data =  add_label_and_type(studies.data, substances.data, substance_to_type)

In [63]:
substance_to_type["color"] = substance_to_type.type.apply(lambda x: colorDict[x])

In [256]:
json_substances["group"] = classes
json_substances["nodeName"] = json_substances["name"]
json_substances["grp"] = classes

In [203]:
json_substances["n"] = json_substances["n"].astype(int)

In [253]:
links = links.groupby(["source","target"]).sum().reset_index()

In [254]:
json_dict = {}
json_dict["nodes"] = json_substances.to_dict(orient="records")
json_dict["links"] = links.to_dict(orient="records")


In [257]:
json_dict["nodes"] = json_substances[["group","name"]].to_dict(orient="records")

In [258]:
with open('test1.json', 'w') as json_file:
    json.dump(json_dict, json_file)

In [69]:

sankey(com['substance1'], com['substance2'], aspect=10,  fontsize=12, figureName="fruit", leftLabels= substance_to_type.sort_values("type").index
, rightLabels=substance_to_type.sort_values("type").index,colorDict= substance_to_type["color"].to_dict()) 



In [37]:
studies.data

Unnamed: 0,name,substances,creator,curators,groups_count,group_all_count,groups,individuals_count,individuals,interventions_count,outputs_count,outputs_calculated_count,timecourses_count,results_count,type,label
30,Loetsch2006,"morphine-6-glucuronide, codeine-6-glucuronide,...",J G,"M K, J G, D E",1,11,"[{'count': 11, 'name': 'all'}]",11,"['P11', 'P10', 'P9', 'P8', 'P7', 'P6', 'P5', '...",3,691,691,132,823,3.0,codeine
130,Arias1988,"5-dehydrosparteine, sparteine sulfate, 2-dehyd...",J G,"J G, M K",1,142,"[{'count': 142, 'name': 'all'}]",142,"['142', '141', '137', '136', '134', '132', '13...",1,568,0,0,568,3.0,codeine
7,Chen1991,"morphine-6-glucuronide, codeine-6-glucuronide,...",J G,"M K, J G, D E",3,8,"[{'count': 8, 'name': 'all'}, {'count': 1, 'na...",39,"['P_B16', 'P_B15', 'P_B14', 'P_B13', 'P_B12', ...",9,494,28,4,498,3.0,codeine
31,Yue1991B,"morphine-6-glucuronide, codeine-6-glucuronide,...",J G,"J G, M K",5,20,"[{'count': 20, 'name': 'all'}, {'count': 8, 'n...",16,"['14_PM', '13_PM', '12_PM', '11_PM', '10_PM', ...",11,413,83,11,424,3.0,codeine
107,Liukas2011,"paracetamol sulfate, paracetamol glucuronide, ...",J B,"J B, M K",5,40,"[{'count': 10, 'name': '20-40'}, {'count': 10,...",238,"['Individual 238', 'Individual 237', 'Individu...",1,385,91,12,397,1.0,paracetamol
38,Wu2014,"morphine-3-glucuronide, morphine-6-glucuronide...",J G,"M K, J G, D E",4,29,"[{'count': 29, 'name': 'all'}, {'count': 10, '...",208,"['1_10/10_MOR_AUC', '10_1/10_MOR_AUC', '9_1/10...",1,379,96,12,391,3.0,codeine
13,Thummel1996,"midazolam, 1-hydroxymidazolam",Y D,"Y D, M K",3,20,"[{'count': 20, 'name': 'all'}, {'count': 10, '...",20,"['p20', 'p19', 'p18', 'p17', 'p16', 'p15', 'p1...",2,357,64,8,365,0.0,midazolam
99,Kietzmann1990,"paracetamol, paracetamol sulfate, paracetamol ...",J B,"M K, J B",3,13,"[{'count': 13, 'name': 'all'}, {'count': 7, 'n...",39,"['case26', 'case25', 'case24', 'case23', 'case...",1,320,203,26,346,1.0,paracetamol
23,Vree1992,"morphine-6-glucuronide, codeine-6-glucuronide,...",J G,"J G, M K",1,8,"[{'count': 8, 'name': 'all'}]",8,"['P8', 'P7', 'P6', 'P5', 'P4', 'P3', 'P2', 'P1']",1,306,40,14,320,3.0,codeine
35,Yue1989-2,"morphine-6-glucuronide, codeine-6-glucuronide,...",J G,"J G, M K",3,132,"[{'count': 132, 'name': 'all'}, {'count': 18, ...",132,"['132_EM', '131_EM', '130_EM', '129_EM', '128_...",2,301,0,0,301,3.0,codeine


In [38]:
studies_data = studies.data.sort_values(by=["label","name"])

In [39]:
import numpy as np

In [40]:
studies_data["group_all_count"]= studies_data["group_all_count"].replace(np.nan, 0)
studies_data["group_all_count"]= studies_data["group_all_count"].astype(int)

## 2. Build config files for circular plots

In [41]:
# for study data
create_config_files(studies_data, "circos/study_final")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["start"] = range(0,len(data)*rows_per_study,rows_per_study)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["end"] = range(rows_per_study,(len(data)+1)*rows_per_study,rows_per_study)


In [42]:
studies_data

Unnamed: 0,name,substances,creator,curators,groups_count,group_all_count,groups,individuals_count,individuals,interventions_count,outputs_count,outputs_calculated_count,timecourses_count,results_count,type,label
8,Ammon2002,"morphine-6-glucuronide, codeine-6-glucuronide,...",J G,"J G, M K, D E",1,12,"[{'count': 12, 'name': 'all'}]",0,,3,153,70,10,163,3.0,codeine
130,Arias1988,"5-dehydrosparteine, sparteine sulfate, 2-dehyd...",J G,"J G, M K",1,142,"[{'count': 142, 'name': 'all'}]",142,"['142', '141', '137', '136', '134', '132', '13...",1,568,0,0,568,3.0,codeine
11,Caraco1996,"mephenytoin, morphine-6-glucuronide, codeine-6...",J G,"J G, M K",3,16,"[{'count': 16, 'name': 'all'}, {'count': 6, 'n...",15,"['6_PM', '5_PM', '4_PM', '3_PM', '2_PM', '1_PM...",4,179,96,12,191,3.0,codeine
7,Chen1991,"morphine-6-glucuronide, codeine-6-glucuronide,...",J G,"M K, J G, D E",3,8,"[{'count': 8, 'name': 'all'}, {'count': 1, 'na...",39,"['P_B16', 'P_B15', 'P_B14', 'P_B13', 'P_B12', ...",9,494,28,4,498,3.0,codeine
15,Desmeules1991,"dextrorphan, morphine, dextromethorphan, codei...",M K,"M K, D E",2,8,"[{'count': 8, 'name': 'all'}, {'count': 7, 'na...",1,['pm1'],3,29,15,3,32,3.0,codeine
10,Hasselstrom1997,"morphine-6-glucuronide, codeine-6-glucuronide,...",J G,"J G, M K",3,24,"[{'count': 24, 'name': 'all'}, {'count': 12, '...",0,,2,20,0,0,20,3.0,codeine
20,He2008,"morphine-6-glucuronide, codeine-6-glucuronide,...",J G,"M K, D E, J G",3,23,"[{'count': 23, 'name': 'all'}, {'count': 11, '...",72,"['P_E38', 'P_E37', 'P_E36', 'P_E35', 'P_E34', ...",1,74,0,4,78,3.0,codeine
37,Kim2002,"codeine, codeine sulphate, norcodeine",J G,"J G, M K",5,19,"[{'count': 19, 'name': 'all'}, {'count': 17, '...",0,,2,91,59,8,99,3.0,codeine
17,Kirchheiner2007,"morphine-6-glucuronide, morphine, codeine, cod...",J G,"M K, J G, D E",4,26,"[{'count': 26, 'name': 'all'}, {'count': 3, 'n...",0,,1,96,96,12,108,3.0,codeine
18,Kronstrand2001,"codeine, morphine, codeine phosphate",J G,"M K, D E, J G",1,9,"[{'count': 9, 'name': 'all'}]",0,,1,17,16,3,20,3.0,codeine


In [43]:
for label, study_data_label in studies_data.groupby("label"):
    if label =="paracetamol":
        create_config_files(study_data_label, f"circos/study_{label}")


In [44]:
# testing for substances 

In [45]:
frames = []
for substance, data in substances.data.sort_values("label").groupby("label"):
    data["start"] = range(len(data)) 
    data["end"] = range(1,len(data)+1) 
    frames.append(data)
substances_data = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [46]:
#substances_data

In [47]:
len(substances_data)

71

In [48]:
unused_substances = substances_data[ (substances_data["study_number"]<2) | ((substances_data["intervention_number"]==0) & (substances_data["timecourse_number"] == 0)  & (substances_data["output_number"] == 0))]["name"]

In [49]:
substances_data = substances_data[ ~((substances_data["study_number"]<2) | ( (substances_data["intervention_number"]==0) & (substances_data["timecourse_number"] == 0)  & (substances_data["output_number"] == 0)))]

In [50]:
len(substances_data)

29

In [51]:
list(unused_substances)

['sulfapyridine',
 'normorphine-glucuronide',
 'dextrorphan',
 '2-dehydrosparteine',
 '5-dehydrosparteine',
 'diclofenac',
 '4-hydroxydebrisoquine',
 'sulfasalazine',
 'mephenytoin',
 'norcodeine-conjugates',
 'ketoconazole',
 'grapefruit juice',
 'atropine',
 'panadol extend',
 'acetylsalicylic acid',
 '4-hydroxyflurbiprofen',
 'nizatidine',
 'imatinib',
 'curcuminoids',
 'flurbiprofen',
 'isoniazide',
 'salicylamide glucuronide',
 'l-cysteine',
 'lignocaine',
 'meperidine',
 'naloxone',
 'cisapride',
 'salicylamide',
 'sulfinpyrazone',
 'probenecid',
 'sulfamethizole acetylated',
 'sorbitol',
 'salicylic acid',
 'phenylbutazone',
 'salicylamide sulfate',
 'propanolol',
 'piperine',
 'pentazocine',
 'sulfamethizole',
 'watercress',
 'pipemidic acid',
 'norfloxacin']

In [52]:
all_links = create_links(substances_data,studies_data,list(unused_substances))


In [53]:
length = pd.DataFrame()
length["substance_links"] = all_links["substance1_start"].append(all_links["substance2_start"])
length["label"] = all_links["substance1_label"].append(all_links["substance2_label"])

In [54]:
frames = []
for label, data_label in length.groupby(["label"]):
    start = 0 
    
    
    for substance_link, substance_linkpositions in data_label.groupby(["substance_links"]):    
        label_position = pd.Series()
        label_position["start"] = start
        label_position["end"] = start + len(substance_linkpositions)
        label_position["label"]  = label
        label_position["substance_number"]  = substance_link

        frames.append(label_position)
        start = label_position["end"] + 1
res = pd.concat(frames, axis=1).T    
                                                         
                                            

In [55]:
substances_data["start"]

63     0
13     2
31     4
17     5
56     7
64     8
16    12
40    15
36    16
14    17
32    18
33    19
38    20
12    21
30    23
11    24
29     0
0      1
43     6
45    19
48    20
44    21
47    22
46    23
6     24
8     25
9     26
54    27
57    29
Name: start, dtype: int64

In [56]:
res["substance_number"] = res["substance_number"].astype(int)

In [57]:
substance_data = pd.merge(res,substances_data[["name","label","start","study_number","timecourse_number","output_number","output_raw_number","output_calculated_number","intervention_number"]],left_on=["label","substance_number"], right_on = ["label","start"])
substance_data["start"] = substance_data["start_x"]

In [58]:
links_new = pd.merge(all_links, substance_data[["start","end","start_y","label"]], left_on=["substance1_label","substance1_start"], right_on = [ "label","start_y"])
links_new[["substance1_start","substance1_end"]] = links_new[["start","end"]]
links_new = pd.merge(links_new, substance_data[["start","end","start_y","label"]], left_on=["substance2_label","substance2_start"], right_on = [ "label","start_y"])
links_new[["substance2_start","substance2_end"]] = links_new[["start_y","end_y"]]


In [59]:
all_links = links_new[["substance1_label","substance1_start" ,"substance1_end", "substance2_label","substance2_start" ,"substance2_end"]]

In [60]:
frames = []
for (label,start), data in all_links.groupby(["substance1_label","substance1_start"]):
    data["substance1_start"] = start + range(len(data))
    data["substance1_end"] = data["substance1_start"] + 1
    data["substance1_label"] = label

    frames.append(data)
    
all_links = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [61]:
frames = []
for (label,end), data in all_links.groupby(["substance2_label","substance2_end"]):
    data["substance2_end"] = end - range(len(data)) 
    data["substance2_start"] = data["substance2_end"] - 1
    data["substance2_label"] = label

    frames.append(data)
    
all_links = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [62]:
all_links.to_csv("circos/substance_final/data/links.txt", sep=" ",header=False,index=False)

In [63]:
ideogram = pd.DataFrame(res.sort_values("label").groupby(["label"]).apply(lambda x:  x["end"].max()),columns = ["len"])
ideogram["label"] = ideogram.index

In [64]:
ideogram

Unnamed: 0_level_0,len,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1
codeine,1181,codeine
midazolam,19,midazolam
paracetamol,624,paracetamol


In [65]:
ticks_dict = {}
for label, data in substance_data.groupby("label"):
    mystring = ""
    for x in list(data["start"]):
        mystring += f"{x}u, "
    data["end"].max()
    mystring += f"{data['end'].max()}u"
    ticks_dict[label]=mystring

In [66]:
from pprint import pprint
pprint(ticks_dict)

{'codeine': '0u, 20u, 26u, 34u, 51u, 69u, 85u, 142u, 230u, 339u, 457u, 572u, '
            '690u, 772u, 895u, 1037u, 1181u',
 'midazolam': '0u, 11u, 19u',
 'paracetamol': '0u, 147u, 280u, 410u, 494u, 573u, 589u, 602u, 609u, 616u, '
                '619u, 624u'}


In [67]:
directory = "circos/substance_final"
with open(f"{directory}/data/ideogram.txt","w") as f:
        number = 0
        for idx, substance in ideogram.iterrows():

            number = number + 1
            #color_mapping[substance.label] = f"fill_color=spectral-5-div-{number}"
            f.write(f"chr - {substance.label} {substance.label} 0 {substance.len} pastel2-6-qual-{number}\n")


In [68]:
#names 2d track
substance_data["name"] = substance_data["name"].apply(lambda x: x.replace(" ","&nbsp;"))
substance_data[["label","start","end","name"]].to_csv(f"{directory}/data/substance_names.txt", sep=" ",header=False,index=False)
# all subjects number for number track
substance_data[["label","start","end","study_number"]].to_csv(f"{directory}/data/study_number.txt", sep=" ",header=False,index=False)
substance_data[["label","start","end","timecourse_number"]].to_csv(f"{directory}/data/timecourse_number.txt", sep=" ",header=False,index=False)
substance_data[["label","start","end","output_number"]].to_csv(f"{directory}/data/output_number.txt", sep=" ",header=False,index=False)
substance_data[["label","start","end","intervention_number"]].to_csv(f"{directory}/data/intervention_number.txt", sep=" ",header=False,index=False)

bubbles_data_dict = bubbles_data(substance_data,25,"substance")
for name, data in bubbles_data_dict.items():
        data[["label","start","end","type","circle_type"]].to_csv(f"{directory}/data/{name}_bubble.txt", sep=" ",header=False,index=False)


In [None]:
#old stlye write upper part into utlis !

#create_config_files(substances_data, "circos/substance_final", "substance")
#create_links(substances_data,studies_data).to_csv("circos/substance_final/data/links.txt", sep=" ",header=False,index=False)

In [None]:
#create_links(substances_data,studies_data)