In [446]:
import pandas as pd

from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.cluster import AgglomerativeClustering

#my utils
from utils import  PkdbModel
from circos_utils import substance_cooccurrence_matrix \
    ,study_expand, bubbles_data, find_label, add_label_and_type, create_config_files, substance_combinations \
    ,create_ideogram, create_links

In [447]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Configuration for Circular Plots

This notebook creates the configuration files for the circular plots.

Tasks performed by the notebook:

 1. Classifies substances by co-occurrence in studies. Derived substances are replaced but their parents already by the database. Label for a class is chosen to be most frequent substance within a class.
 2. Classifies studies into the same classes as substances. The most frequent class of substances used in a study is chosen to be the class of the study.
 3. Builds configuration files for circus plots for:
     1. Circular study plot with bars .  &rightarrow;  /circos/study/
     2. Circular study plot with bubbles.   &rightarrow;  /circos/study1/
     3. Circular plots with subset of study plots for each substance class. &rightarrow;  /circos/substance1/
     4. Circular substance plots with co-occurrence links and bubbles. &rightarrow;  /circos/study_caffeine/ ...
     

## 1. Classifies Substances and Studies 

In [448]:
# load data
studies = PkdbModel("studies", destination="1-preprocessed") 
substances = PkdbModel("substances", destination="1-preprocessed") 
substances.read()
studies.read()

In [449]:
coocurence_matrix = substance_cooccurrence_matrix(studies.data, substances.data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [450]:
norm_coocurence_matrix = Normalizer(norm="l2").fit_transform(coocurence_matrix)
norm_coocurence_matrix = pd.DataFrame(norm_coocurence_matrix, index=coocurence_matrix.index, columns=coocurence_matrix.columns)

In [451]:
classification_model = AgglomerativeClustering(linkage="single",n_clusters=4)
classes = classification_model.fit_predict(norm_coocurence_matrix.T.corr())

In [452]:
# Classfied substances into types
substance_to_type = pd.DataFrame(classes, index=norm_coocurence_matrix.index, columns=["type"]).to_dict()["type"]

studies.data, substances.data =  add_label_and_type(studies.data, substances.data, substance_to_type)

In [453]:
studies.data

Unnamed: 0,name,substances,creator,curators,groups_count,group_all_count,groups,individuals_count,individuals,interventions_count,outputs_count,outputs_calculated_count,timecourses_count,results_count,type,label
67,Balogh1992,caffeine,M K,"M K, J G",1,12.0,"[{'count': 12, 'name': 'all'}]",12,"['M', 'L', 'K', 'I', 'H', 'G', 'F', 'E', 'D', ...",1,651,504,72,723,0.0,caffeine
10,Loetsch2006,codeine,J G,"J G, M K",1,11.0,"[{'count': 11, 'name': 'all'}]",11,"['1', '2', '3', '4', '5', '6', '7', '8', '9', ...",2,534,534,132,666,1.0,codeine
9,Arias1988,sparteine,J G,J G,1,142.0,"[{'count': 142, 'name': 'all'}]",142,"['1', '2', '3', '4', '7', '8', '9', '10', '11'...",1,568,0,0,568,1.0,codeine
110,Test243,"caffeine, paraxanthine",M K,M K,1,102.0,"[{'count': 102, 'name': 'all'}]",102,"['IKP243-2079', 'IKP243-2075', 'IKP243-2074', ...",1,510,0,0,510,0.0,caffeine
108,Stille1987,,M K,M K,1,12.0,"[{'count': 12, 'name': 'all'}]",12,"['N', 'M', 'L', 'K', 'I', 'H', 'G', 'F', 'E', ...",13,432,0,0,432,,
22,Yue1991B,"debrisoquine, codeine",J G,J G,5,20.0,"[{'count': 20, 'name': 'all'}, {'count': 8, 'n...",14,"['1_EM', '2_EM', '3_EM', '4_EM', '5_EM', '6_EM...",11,398,72,11,409,1.0,codeine
1,Chen1991,codeine,J G,"J G, M K",3,8.0,"[{'count': 6, 'name': 'EM'}, {'count': 2, 'nam...",8,"['1', '2', '3', '4', '5', '6', '7', '8']",8,344,24,4,348,1.0,codeine
20,Vree1992,codeine,J G,J G,1,8.0,"[{'count': 8, 'name': 'all'}]",8,"['1', '2', '3', '4', '5', '6', '7', '8']",1,301,35,14,315,1.0,codeine
26,Yue1989-2,"debrisoquine, codeine",J G,J G,3,132.0,"[{'count': 132, 'name': 'all'}, {'count': 18, ...",132,"['1_PM', '2_PM', '3_PM', '4_PM', '5_PM', '6_PM...",2,299,0,0,299,1.0,codeine
66,Beach1986,,M K,M K,3,21.0,"[{'count': 21, 'name': 'all'}, {'count': 10, '...",21,"['K', 'J', 'I', 'H', 'G', 'F', 'E', 'D', 'C', ...",3,254,19,4,258,,


In [454]:
studies_data = studies.data.sort_values(by=["label","name"])

In [455]:
import numpy as np

In [456]:
studies_data["group_all_count"]= studies_data["group_all_count"].replace(np.nan, 0)
studies_data["group_all_count"]= studies_data["group_all_count"].astype(int)

## 2. Build config files for circular plots

In [457]:
# for study data
create_config_files(studies_data, "circos/study_final")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data["start"] = range(0,len(data)*rows_per_study,rows_per_study)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data["end"] = range(rows_per_study,(len(data)+1)*rows_per_study,rows_per_study)


In [458]:
studies_data

Unnamed: 0,name,substances,creator,curators,groups_count,group_all_count,groups,individuals_count,individuals,interventions_count,outputs_count,outputs_calculated_count,timecourses_count,results_count,type,label
60,Abernethy1985,caffeine,M K,"M K, J G",3,18,"[{'count': 18, 'name': 'all'}, {'count': 9, 'n...",54,"['54', '53', '52', '51', '50', '49', '48', '47...",1,64,0,0,64,0.0,caffeine
57,Akinyinka2000,caffeine,M K,"M K, J G",4,28,"[{'count': 28, 'name': 'all'}, {'count': 10, '...",20,"['10M', '9M', '8M', '7M', '6M', '5M', '4M', '3...",1,160,28,6,166,0.0,caffeine
55,Backman2008,"caffeine, tizanidine",M K,"M K, J G",5,71,"[{'count': 71, 'name': 'all'}, {'count': 53, '...",70,"['70', '69', '68', '67', '66', '65', '64', '63...",1,140,0,0,140,0.0,caffeine
67,Balogh1992,caffeine,M K,"M K, J G",1,12,"[{'count': 12, 'name': 'all'}]",12,"['M', 'L', 'K', 'I', 'H', 'G', 'F', 'E', 'D', ...",1,651,504,72,723,0.0,caffeine
63,Balogh1995,caffeine,M K,"M K, J G",7,20,"[{'count': 20, 'name': 'all'}, {'count': 10, '...",40,"['18_oc', '15_oc', '14_oc', '13_oc', '8_oc', '...",4,150,0,0,150,0.0,caffeine
64,Barnet1990,"caffeine, quinolone, norfloxacin, pipemidic acid",M K,"M K, J G",1,6,"[{'count': 6, 'name': 'all'}]",6,"['6', '5', '4', '3', '2', '1']",3,120,0,0,120,0.0,caffeine
70,Begas2007,caffeine,M K,"M K, J G",9,44,"[{'count': 44, 'name': 'all'}, {'count': 21, '...",0,,1,18,0,0,18,0.0,caffeine
69,Benowitz1995,"caffeine, paraxanthine",M K,"M K, J G",3,12,"[{'count': 12, 'name': 'all'}, {'count': 5, 'n...",0,,4,33,27,8,41,0.0,caffeine
65,Benowitz2003,"caffeine, 6-hydroxychlorzoxazone, chlorzoxazone",M K,"M K, J G",1,12,"[{'count': 12, 'name': 'all'}]",20,"['A9', 'A8', 'A7', 'A6', 'A5', 'A4', 'A3', 'A2...",4,144,42,6,150,0.0,caffeine
68,Blanchard1983a,caffeine,M K,"M K, J G",1,10,"[{'count': 10, 'name': 'all'}]",0,,2,21,14,2,23,0.0,caffeine


In [459]:
for label, study_data_label in studies_data.groupby("label"):
    create_config_files(study_data_label, f"circos/study_{label}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data["start"] = range(0,len(data)*rows_per_study,rows_per_study)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data["end"] = range(rows_per_study,(len(data)+1)*rows_per_study,rows_per_study)


In [460]:
# testing for substances 

In [461]:
frames = []
for substance, data in substances.data.sort_values("label").groupby("label"):
    data["start"] = range(len(data)) 
    data["end"] = range(1,len(data)+1) 
    frames.append(data)
substances_data = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [462]:
#substances_data

In [463]:
len(substances_data)

64

In [464]:
unused_substances = substances_data[ ((substances_data["intervention_number"]==0) & (substances_data["timecourse_number"] == 0)  & (substances_data["output_number"] == 0))]["name"]

In [465]:
substances_data = substances_data[ ~((substances_data["intervention_number"]==0) & (substances_data["timecourse_number"] == 0)  & (substances_data["output_number"] == 0))]

In [466]:
len(substances_data)

52

In [467]:
unused_substances.to_list()

['ephedrine',
 'clozapine',
 'carbamazepine',
 'naringenin',
 'AAMU',
 '1U',
 '1X',
 'pseudoephedrine',
 'tizanidine',
 'venlafaxine',
 'quinolone',
 '4-hydroxydebrisoquine']

In [468]:
all_links = create_links(substances_data,studies_data,unused_substances.to_list())


In [469]:
length = pd.DataFrame()
length["substance_links"] = all_links["substance1_start"].append(all_links["substance2_start"])
length["label"] = all_links["substance1_label"].append(all_links["substance2_label"])

In [470]:
frames = []
for label, data_label in length.groupby(["label"]):
    start = 0 
    
    
    for substance_link, substance_linkpositions in data_label.groupby(["substance_links"]):    
        label_position = pd.Series()
        label_position["start"] = start
        label_position["end"] = start + len(substance_linkpositions)
        label_position["label"]  = label
        label_position["substance_number"]  = substance_link

        frames.append(label_position)
        start = label_position["end"] + 1
res = pd.concat(frames, axis=1).T    
                                                         
                                            

In [471]:
substance_data = pd.merge(res,substances_data[["name","label","start","study_number","timecourse_number","output_number","output_raw_number","output_calculated_number","intervention_number"]],left_on=["label","substance_number"], right_on = ["label","start"])
substance_data["start"] = substance_data["start_x"]

In [472]:
links_new = pd.merge(all_links, substance_data[["start","end","start_y","label"]], left_on=["substance1_label","substance1_start"], right_on = [ "label","start_y"])
links_new[["substance1_start","substance1_end"]] = links_new[["start","end"]]
links_new = pd.merge(links_new, substance_data[["start","end","start_y","label"]], left_on=["substance2_label","substance2_start"], right_on = [ "label","start_y"])
links_new[["substance2_start","substance2_end"]] = links_new[["start_y","end_y"]]


In [473]:
all_links = links_new[["substance1_label","substance1_start" ,"substance1_end", "substance2_label","substance2_start" ,"substance2_end"]]

In [474]:
frames = []
for (label,start), data in all_links.groupby(["substance1_label","substance1_start"]):
    data["substance1_start"] = start + range(len(data))
    data["substance1_end"] = data["substance1_start"] + 1
    data["substance1_label"] = label

    frames.append(data)
    
all_links = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [475]:
frames = []
for (label,end), data in all_links.groupby(["substance2_label","substance2_end"]):
    data["substance2_end"] = end - range(len(data)) 
    data["substance2_start"] = data["substance2_end"] - 1
    data["substance2_label"] = label

    frames.append(data)
    
all_links = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [476]:
all_links.to_csv("circos/substance_final/data/links.txt", sep=" ",header=False,index=False)

In [477]:
ideogram = pd.DataFrame(res.sort_values("label").groupby(["label"]).apply(lambda x:  x["end"].max()),columns = ["len"])
ideogram["label"] = ideogram.index

In [478]:
ideogram

Unnamed: 0_level_0,len,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1
caffeine,161,caffeine
codeine,273,codeine
glucose,246,glucose
paracetamol,60,paracetamol


In [479]:
ticks_dict = {}
for label, data in substance_data.groupby("label"):
    mystring = ""
    for x in list(data["start"]):
        mystring += f"{x}u, "
    data["end"].max()
    mystring += f"{data['end'].max()}u"
    ticks_dict[label]=mystring

In [480]:
from pprint import pprint
pprint(ticks_dict)

{'caffeine': '0u, 42u, 51u, 56u, 61u, 65u, 69u, 71u, 73u, 75u, 78u, 81u, 84u, '
             '93u, 102u, 112u, 135u, 140u, 153u, 161u',
 'codeine': '0u, 22u, 53u, 65u, 76u, 94u, 105u, 127u, 140u, 142u, 164u, 186u, '
            '208u, 230u, 252u, 273u',
 'glucose': '0u, 19u, 66u, 109u, 142u, 161u, 182u, 196u, 208u, 214u, 220u, '
            '228u, 246u',
 'paracetamol': '0u, 13u, 29u, 32u, 37u, 48u, 60u'}


In [481]:
directory = "circos/substance_final"
with open(f"{directory}/data/ideogram.txt","w") as f:
        number = 0
        for idx, substance in ideogram.iterrows():

            number = number + 1
            #color_mapping[substance.label] = f"fill_color=spectral-5-div-{number}"
            f.write(f"chr - {substance.label} {substance.label} 0 {substance.len} pastel2-6-qual-{number}\n")


In [482]:
#names 2d track
substance_data["name"] = substance_data["name"].apply(lambda x: x.replace(" ","&nbsp;"))
substance_data[["label","start","end","name"]].to_csv(f"{directory}/data/substance_names.txt", sep=" ",header=False,index=False)
# all subjects number for number track
substance_data[["label","start","end","study_number"]].to_csv(f"{directory}/data/study_number.txt", sep=" ",header=False,index=False)
substance_data[["label","start","end","timecourse_number"]].to_csv(f"{directory}/data/timecourse_number.txt", sep=" ",header=False,index=False)
substance_data[["label","start","end","output_number"]].to_csv(f"{directory}/data/output_number.txt", sep=" ",header=False,index=False)
substance_data[["label","start","end","intervention_number"]].to_csv(f"{directory}/data/intervention_number.txt", sep=" ",header=False,index=False)

bubbles_data_dict = bubbles_data(substance_data,25,"substance")
for name, data in bubbles_data_dict.items():
        data[["label","start","end","type","circle_type"]].to_csv(f"{directory}/data/{name}_bubble.txt", sep=" ",header=False,index=False)


In [483]:
#old stlye write upper part into utlis !

#create_config_files(substances_data, "circos/substance_final", "substance")
#create_links(substances_data,studies_data).to_csv("circos/substance_final/data/links.txt", sep=" ",header=False,index=False)

In [42]:
#create_links(substances_data,studies_data)