In [1]:
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import Normalizer, MaxAbsScaler

#my utils
from utils import  PkdbModel
from circos_utils import substance_cooccurrence_matrix \
    ,study_expand, bubbles_data, find_label, add_label_and_type, create_config_files, substance_combinations \
    ,create_ideogram, create_links

{'token': '<redacted>'}


In [2]:
%load_ext autoreload
%autoreload 2

## Configuration for Circular Plots

This notebook creates the configuration files for the circular plots.

Tasks performed by the notebook:

 1. Classifies substances by co-occurrence in studies. Derived substances have already been replaced by their parents in the database. The label of a class is chosen to be the most frequent substance within that class.
 2. Classifies studies into the same classes as substances. The most frequent class of substances used in a study is chosen to be the class of the study.
 3. Builds configuration files for circos plots for:
     1. Circular study plot with bars .  &rightarrow;  /circos/study/
     2. Circular study plot with bubbles.   &rightarrow;  /circos/study1/
     3. Circular plots with subset of study plots for each substance class. &rightarrow;  /circos/substance1/
     4. Circular substance plots with co-occurrence links and bubbles. &rightarrow;  /circos/study_caffeine/ ...
     

## 1. Classifies Substances and Studies 

In [3]:
# Create the handles for the preprocessed study and substance tables and read their data.
studies = PkdbModel("studies", destination="1-preprocessed")
substances = PkdbModel("substances", destination="1-preprocessed")
for pkdb_model in (substances, studies):  # substances are read first, then studies
    pkdb_model.read()

In [4]:
# Build the substance co-occurrence matrix from the loaded study and substance tables.
coocurence_matrix = substance_cooccurrence_matrix(studies.data, substances.data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [5]:
# Scale every row of the co-occurrence matrix to unit L2 norm and put the result
# back into a DataFrame that carries the original row and column labels.
norm_coocurence_matrix = pd.DataFrame(
    Normalizer(norm="l2").fit_transform(coocurence_matrix),
    index=coocurence_matrix.index,
    columns=coocurence_matrix.columns,
)

In [6]:
# Split the substances into four clusters with single-linkage agglomerative
# clustering. The features are the pairwise correlations between the rows
# (substances) of the normalised co-occurrence matrix.
classification_model = AgglomerativeClustering(n_clusters=4, linkage="single")
substance_correlation = norm_coocurence_matrix.T.corr()
classes = classification_model.fit_predict(substance_correlation)

In [7]:
# Map every substance (row label of the normalised matrix) to its cluster id ("type").
substance_to_type = pd.Series(classes, index=norm_coocurence_matrix.index).to_dict()

# Let the helper attach the derived type/label information to both tables.
studies.data, substances.data = add_label_and_type(
    studies.data,
    substances.data,
    substance_to_type,
)

In [8]:
# Inspect the studies table as returned by `add_label_and_type` (see the `type` and `label` columns).
studies.data

Unnamed: 0,name,substances,creator,curators,groups_count,group_all_count,groups,individuals_count,individuals,interventions_count,outputs_count,outputs_calculated_count,timecourses_count,results_count,type,label
126,Loetsch2006,"codeine, codeine-6-glucuronide, norcodeine, mo...",J G,"M K, J G, D E",1,11,"[{'count': 11, 'name': 'all'}]",11,"['1', '2', '3', '4', '5', '6', '7', '8', '9', ...",2,692,692,132,824,2.0,codeine
59,Balogh1992,caffeine,M K,"M K, J G",1,12,"[{'count': 12, 'name': 'all'}]",12,"['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', ...",1,723,576,72,795,0.0,caffeine
147,Arias1988,"5-dehydrosparteine, sparteine, sparteine sulfa...",J G,"J G, M K",1,142,"[{'count': 142, 'name': 'all'}]",142,"['1', '2', '3', '4', '7', '8', '9', '10', '11'...",1,568,0,0,568,2.0,codeine
88,IKP243,caffeine,M K,M K,1,102,"[{'count': 102, 'name': 'all'}]",102,"['IKP243-1039', 'IKP243-1041', 'IKP243-1043', ...",1,510,0,0,510,0.0,caffeine
112,Stille1987,"caffeine, ofloxacin, ciprofloxacin, enoxacin",M K,M K,1,12,"[{'count': 12, 'name': 'all'}]",12,"['A', 'B', 'C', 'E', 'F', 'G', 'H', 'I', 'K', ...",13,432,0,0,432,0.0,caffeine
139,Yue1991B,"codeine, 4-hydroxydebrisoquine, codeine-6-gluc...",J G,"J G, M K",5,20,"[{'count': 20, 'name': 'all'}, {'count': 3, 'n...",14,"['1_EM', '2_EM', '3_EM', '4_EM', '5_EM', '6_EM...",11,413,83,11,424,2.0,codeine
136,Wu2014,"morphine-3-glucuronide, codeine, morphine-6-gl...",J G,"M K, J G, D E",4,29,"[{'count': 9, 'name': '*10/*10'}, {'count': 10...",208,"['1_1/1_COD', '2_1/1_COD', '3_1/1_COD', '4_1/1...",1,379,96,12,391,2.0,codeine
118,Chen1991,"codeine, codeine-6-glucuronide, dextromethorph...",J G,"M K, J G",3,8,"[{'count': 8, 'name': 'all'}, {'count': 6, 'na...",8,"['6', '7', '8', '1', '2', '3', '4', '5']",9,360,28,4,364,2.0,codeine
137,Vree1992,"codeine, codeine-6-glucuronide, norcodeine, no...",J G,"J G, M K",1,8,"[{'count': 8, 'name': 'all'}]",8,"['1', '2', '3', '4', '5', '6', '7', '8']",1,306,40,14,320,2.0,codeine
99,McLean2002,"theophylline, caffeine, theobromine, paraxanthine",M K,"M K, D E",3,14,"[{'count': 14, 'name': 'all'}, {'count': 8, 'n...",0,,1,262,206,48,310,0.0,caffeine


In [9]:
# Order the studies by class label first and by study name within each class.
# `sort_values` returns a new frame, so `studies.data` itself keeps its order.
studies_data = studies.data.sort_values(by=["label","name"])

In [10]:
import numpy as np

In [11]:
# Treat a missing total group size as 0 so the column can be stored as integers.
studies_data["group_all_count"] = (
    studies_data["group_all_count"]
    .replace(np.nan, 0)
    .astype(int)
)

## 2. Build config files for circular plots

In [12]:
# Config files for the complete, label-sorted study table, targeting "circos/study_final".
create_config_files(studies_data, "circos/study_final")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data["start"] = range(0,len(data)*rows_per_study,rows_per_study)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data["end"] = range(rows_per_study,(len(data)+1)*rows_per_study,rows_per_study)


In [13]:
# Inspect the sorted study table (ordered by label, then name).
studies_data

Unnamed: 0,name,substances,creator,curators,groups_count,group_all_count,groups,individuals_count,individuals,interventions_count,outputs_count,outputs_calculated_count,timecourses_count,results_count,type,label
49,Abernethy1985,caffeine,M K,"M K, J G",3,18,"[{'count': 18, 'name': 'all'}, {'count': 9, 'n...",54,"['1', '2', '3', '4', '5', '6', '7', '8', '9', ...",1,64,0,0,64,0.0,caffeine
47,Akinyinka2000,"theophylline, caffeine, theobromine, paraxanthine",M K,"M K, D E, J G",4,28,"[{'count': 28, 'name': 'all'}, {'count': 10, '...",20,"['1C', '2C', '3C', '4C', '5C', '6C', '7C', '8C...",1,164,32,6,170,0.0,caffeine
45,Amchin1999,"caffeine, 17U, AAMU, 1U, venlafaxine, theobrom...",M K,"M K, J G",1,15,"[{'count': 15, 'name': 'all'}]",16,"['1', '2', '3', '4', '5', '6', '7', '8', '9', ...",2,72,16,2,74,0.0,caffeine
58,Backman2008,"caffeine, paraxanthine",M K,"M K, J G",5,71,"[{'count': 71, 'name': 'all'}, {'count': 11, '...",70,"['1', '2', '3', '4', '5', '6', '7', '8', '9', ...",1,140,0,0,140,0.0,caffeine
59,Balogh1992,caffeine,M K,"M K, J G",1,12,"[{'count': 12, 'name': 'all'}]",12,"['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', ...",1,723,576,72,795,0.0,caffeine
54,Balogh1995,"caffeine, gestodene, ethinylestradiol, levonor...",M K,"M K, J G",7,20,"[{'count': 10, 'name': 'A'}, {'count': 20, 'na...",40,"['4', '5', '9', '10', '11', '12', '16', '17', ...",4,150,0,0,150,0.0,caffeine
56,Barnet1990,"norfloxacin, caffeine, pipemidic acid",M K,"M K, J G",1,6,"[{'count': 6, 'name': 'all'}]",6,"['1', '2', '3', '4', '5', '6']",3,120,0,0,120,0.0,caffeine
57,Beach1986,"caffeine, disulfiram",M K,M K,5,21,"[{'count': 21, 'name': 'all'}, {'count': 10, '...",21,"['1', '2', '3', '4', '5', '6', '7', '8', '9', ...",3,280,25,4,284,0.0,caffeine
61,Begas2007,"caffeine, 17U, 1U, AFMU, paraxanthine, 1X",M K,"M K, D E, J G",9,44,"[{'count': 44, 'name': 'all'}, {'count': 21, '...",0,,1,18,0,0,18,0.0,caffeine
60,Benowitz1995,"caffeine, paraxanthine",M K,"M K, J G",3,12,"[{'count': 5, 'name': '2mg'}, {'count': 7, 'na...",0,,4,39,33,8,47,0.0,caffeine


In [14]:
# One set of config files per class label, each targeting "circos/study_<label>".
for class_label, class_studies in studies_data.groupby("label"):
    create_config_files(class_studies, f"circos/study_{class_label}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data["start"] = range(0,len(data)*rows_per_study,rows_per_study)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data["end"] = range(rows_per_study,(len(data)+1)*rows_per_study,rows_per_study)


In [15]:
# testing for substances 

In [52]:
# Give every substance a one-unit wide [start, end] slot, numbered from 0 within
# its label group, and stack the groups back together in label order.
# `assign` works on a copy of each group, so nothing is written into a slice of
# `substances.data` (the original in-place assignment triggered pandas'
# SettingWithCopyWarning).
frames = []
for _label, label_substances in substances.data.sort_values("label").groupby("label"):
    n_substances = len(label_substances)
    frames.append(
        label_substances.assign(
            start=range(n_substances),
            end=range(1, n_substances + 1),
        )
    )
substances_data = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [53]:
#substances_data

In [54]:
# Number of substances before the unused ones are filtered out.
len(substances_data)

98

In [55]:
# A substance counts as unused when it appears in fewer than two studies, or when
# it has no interventions, no timecourses and no outputs at all.
too_few_studies = substances_data["study_number"] < 2
no_measurements = (
    (substances_data["intervention_number"] == 0)
    & (substances_data["timecourse_number"] == 0)
    & (substances_data["output_number"] == 0)
)
unused_substances = substances_data.loc[too_few_studies | no_measurements, "name"]

In [56]:
# Drop the unused substances: those in fewer than two studies, or with no
# interventions, timecourses and outputs at all.
too_few_studies = substances_data["study_number"] < 2
no_measurements = (
    (substances_data["intervention_number"] == 0)
    & (substances_data["timecourse_number"] == 0)
    & (substances_data["output_number"] == 0)
)
substances_data = substances_data[~(too_few_studies | no_measurements)]

In [57]:
# Number of substances left after dropping the unused ones.
len(substances_data)

56

In [58]:
# Names of the substances that were filtered out.
unused_substances.to_list()

['venlafaxine',
 'lomefloxacin',
 '1-hydroxymidazolam',
 'exp3174',
 'cimetidine',
 'levonorgestrel',
 'indocyanine green',
 'grapefruit juice',
 'glycerol',
 'gestodene',
 '137MU',
 'ethinylestradiol',
 'disulfiram',
 'digoxin',
 'chlorzoxazone',
 'carbon monoxide',
 'L-ascorbic acid',
 'AFMU',
 '6-hydroxychlorzoxazone',
 '17MU',
 'metropolol',
 'warfarin',
 'normorphine-glucuronide',
 'diclofenac',
 'sulfapyridine',
 'mephenytoin',
 'sulfasalazine',
 'norcodeine-conjugates',
 '2-dehydrosparteine',
 '5-dehydrosparteine',
 '4-hydroxydebrisoquine',
 '[U-13C]glucose',
 'lactate',
 'exenatide',
 '[6-3H]glucose',
 '[3-OMG]glucose',
 '[2-3H]glucose',
 'GLP-1',
 'GIP',
 'propacetamol',
 'propanolol',
 'panadol extend']

In [59]:
# Build the substance-to-substance links from the filtered substances and the studies.
# NOTE(review): the unused substance names are presumably passed so that
# `create_links` leaves them out — confirm in circos_utils.
all_links = create_links(substances_data,studies_data,unused_substances.to_list())


In [60]:
# Stack both endpoints of every link into one long table: one row per link
# endpoint, holding the substance's start index and its label.
# `pd.concat` replaces `Series.append`, which was removed in pandas 2.0; the
# resulting values and (duplicated) index are the same.
length = pd.DataFrame()
length["substance_links"] = pd.concat(
    [all_links["substance1_start"], all_links["substance2_start"]]
)
length["label"] = pd.concat(
    [all_links["substance1_label"], all_links["substance2_label"]]
)

In [61]:
# Lay out the substances of every label one after another. Each substance gets a
# segment whose length equals the number of link endpoints it takes part in, and
# consecutive segments are separated by a gap of one unit.
#
# Grouping by a plain column name (not a one-element list) keeps the group keys
# scalar; with a list, pandas >= 2.0 yields 1-tuples, which would end up in the
# `label` / `substance_number` columns and break the later merges.
# Rows are collected as plain dicts and turned into a frame once, so the columns
# get regular (numeric/object) dtypes instead of all-object.
position_columns = ["start", "end", "label", "substance_number"]
frames = []
for label, data_label in length.groupby("label"):
    start = 0
    for substance_link, substance_linkpositions in data_label.groupby("substance_links"):
        end = start + len(substance_linkpositions)
        frames.append(
            {
                "start": start,
                "end": end,
                "label": label,
                "substance_number": substance_link,
            }
        )
        start = end + 1  # leave a one-unit gap before the next substance
res = pd.DataFrame(frames, columns=position_columns)
                                                         
                                            

In [62]:
# Attach the per-substance statistics to the link-based positions in `res`.
# A position row matches the substance with the same label whose original
# per-label index (`start` in `substances_data`) equals `substance_number`.
substance_columns = [
    "name",
    "label",
    "start",
    "study_number",
    "timecourse_number",
    "output_number",
    "output_raw_number",
    "output_calculated_number",
    "intervention_number",
]
substance_data = pd.merge(
    res,
    substances_data[substance_columns],
    left_on=["label", "substance_number"],
    right_on=["label", "start"],
)
# Both inputs carry a `start` column, so the merge names them `start_x` (from
# `res`) and `start_y` (from `substances_data`); the link-based one becomes `start`.
substance_data["start"] = substance_data["start_x"]

In [63]:
# Translate both endpoints of every link from the original per-label substance index
# (`start_y` in `substance_data`) to the link-based coordinates (`start` / `end`).
# First pass: look up substance 1 and overwrite its start/end with the new coordinates.
links_new = pd.merge(all_links, substance_data[["start","end","start_y","label"]], left_on=["substance1_label","substance1_start"], right_on = [ "label","start_y"])
links_new[["substance1_start","substance1_end"]] = links_new[["start","end"]]
# Second pass: look up substance 2. The lookup columns left over from the first merge
# now collide with the right-hand columns, so pandas' default `_x`/`_y` suffixes apply:
# the right-hand `start`/`end` come out as `start_y`/`end_y`, which is why those are
# the columns copied below.
# NOTE(review): this relies on `all_links` not already having columns with these names — confirm.
links_new = pd.merge(links_new, substance_data[["start","end","start_y","label"]], left_on=["substance2_label","substance2_start"], right_on = [ "label","start_y"])
links_new[["substance2_start","substance2_end"]] = links_new[["start_y","end_y"]]


In [64]:
# Keep only the six link columns (label, start, end for each endpoint), dropping
# the lookup columns the merges added.
all_links = links_new[
    [
        "substance1_label",
        "substance1_start",
        "substance1_end",
        "substance2_label",
        "substance2_start",
        "substance2_end",
    ]
]

In [65]:
# Fan the links out at their first endpoint: the i-th link leaving a substance
# gets its own one-unit slot [start + i, start + i + 1].
# `np.arange` makes the offset arithmetic work whether the group key is a NumPy
# scalar or a plain Python int (`int + range` raises TypeError), and `assign`
# builds a new frame per group instead of writing into a groupby slice
# (which triggered SettingWithCopyWarning).
frames = []
for (label, start), data in all_links.groupby(["substance1_label", "substance1_start"]):
    slot_starts = start + np.arange(len(data))
    frames.append(
        data.assign(
            substance1_start=slot_starts,
            substance1_end=slot_starts + 1,
            substance1_label=label,
        )
    )

all_links = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [66]:
# Fan the links out at their second endpoint, counting backwards from the end of
# the substance's segment: the i-th link gets the slot [end - i - 1, end - i].
# `np.arange` makes the offset arithmetic work whether the group key is a NumPy
# scalar or a plain Python int (`int - range` raises TypeError), and `assign`
# builds a new frame per group instead of writing into a groupby slice
# (which triggered SettingWithCopyWarning).
frames = []
for (label, end), data in all_links.groupby(["substance2_label", "substance2_end"]):
    slot_ends = end - np.arange(len(data))
    frames.append(
        data.assign(
            substance2_end=slot_ends,
            substance2_start=slot_ends - 1,
            substance2_label=label,
        )
    )

all_links = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [67]:
# Write the links as a space-separated text file without header row or index column.
all_links.to_csv("circos/substance_final/data/links.txt", sep=" ",header=False,index=False)

In [68]:
# Length of each label's band = the largest segment end among its substances.
max_end_per_label = (
    res.sort_values("label")
    .groupby(["label"])
    .apply(lambda label_rows: label_rows["end"].max())
)
ideogram = pd.DataFrame(max_end_per_label, columns=["len"])
# Mirror the index into a regular column so the label is available per row.
ideogram = ideogram.assign(label=ideogram.index)

In [69]:
# Inspect the band length (`len`) computed for each label.
ideogram

Unnamed: 0_level_0,len,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1
caffeine,647,caffeine
codeine,1215,codeine
glucose,290,glucose
paracetamol,166,paracetamol


In [70]:
# For every label, build the comma-separated tick list "<start>u, <start>u, ..., <max end>u":
# one tick at the start of each substance segment plus a closing tick at the end
# of the last one. The stray no-op `data["end"].max()` statement was removed and
# the string is assembled with `join` instead of repeated concatenation.
ticks_dict = {}
for label, label_data in substance_data.groupby("label"):
    tick_positions = [f"{position}u" for position in label_data["start"]]
    tick_positions.append(f"{label_data['end'].max()}u")
    ticks_dict[label] = ", ".join(tick_positions)

In [71]:
# Pretty-print the tick strings per label (`pprint` is imported in the import cell at the top).
pprint(ticks_dict)

{'caffeine': '0u, 112u, 126u, 145u, 155u, 174u, 193u, 212u, 231u, 250u, 264u, '
             '273u, 288u, 302u, 319u, 327u, 340u, 435u, 450u, 498u, 518u, '
             '537u, 557u, 574u, 628u, 647u',
 'codeine': '0u, 23u, 105u, 193u, 304u, 423u, 539u, 661u, 802u, 945u, 1065u, '
            '1073u, 1102u, 1159u, 1165u, 1185u, 1200u, 1215u',
 'glucose': '0u, 18u, 70u, 95u, 112u, 137u, 193u, 223u, 266u, 290u',
 'paracetamol': '0u, 35u, 70u, 114u, 136u, 166u'}


In [72]:
directory = "circos/substance_final"
# Write one "chr" line per row of `ideogram`; the colour index in the last field
# simply counts the rows, starting at 1.
with open(f"{directory}/data/ideogram.txt", "w") as f:
    for number, (idx, substance) in enumerate(ideogram.iterrows(), start=1):
        #color_mapping[substance.label] = f"fill_color=spectral-5-div-{number}"
        f.write(f"chr - {substance.label} {substance.label} 0 {substance.len} pastel2-6-qual-{number}\n")


In [73]:
# Names 2d track: spaces in substance names are replaced by "&nbsp;"
# (presumably because the track files below are space-separated — confirm).
substance_data["name"] = substance_data["name"].apply(lambda x: x.replace(" ", "&nbsp;"))

# One space-separated file per track, each with the columns
# label, start, end, <value>. The mapping is file stem -> value column and is
# written in the same order as the previous five hand-written calls.
track_files = {
    "substance_names": "name",
    "study_number": "study_number",
    "timecourse_number": "timecourse_number",
    "output_number": "output_number",
    "intervention_number": "intervention_number",
}
for file_stem, value_column in track_files.items():
    substance_data[["label", "start", "end", value_column]].to_csv(
        f"{directory}/data/{file_stem}.txt", sep=" ", header=False, index=False
    )

# Bubble tracks: one file per entry returned by `bubbles_data`.
bubbles_data_dict = bubbles_data(substance_data, 25, "substance")
for name, data in bubbles_data_dict.items():
    data[["label", "start", "end", "type", "circle_type"]].to_csv(
        f"{directory}/data/{name}_bubble.txt", sep=" ", header=False, index=False
    )


In [74]:
# old style; TODO: move the code above into utils

#create_config_files(substances_data, "circos/substance_final", "substance")
#create_links(substances_data,studies_data).to_csv("circos/substance_final/data/links.txt", sep=" ",header=False,index=False)

In [39]:
#create_links(substances_data,studies_data)