In [1]:
import pandas as pd

from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.cluster import AgglomerativeClustering

#my utils
from utils import  PkdbModel
from circos_utils import substance_cooccurrence_matrix \
    ,study_expand, bubbles_data, find_label, add_label_and_type, create_config_files, substance_combinations \
    ,create_ideogram, create_links

{'token': 'a94a309b0392aff5ed02b52a25ba44d7abd41aa6'}


In [2]:
%load_ext autoreload
%autoreload 2

## Configuration for Circular Plots

This notebook creates the configuration files for the circular plots.

Tasks performed by the notebook:

 1. Classifies substances by co-occurrence in studies. Derived substances are replaced but their parents already by the database. Label for a class is chosen to be most frequent substance within a class.
 2. Classifies studies into the same classes as substances. The most frequent class of substances used in a study is chosen to be the class of the study.
 3. Builds configuration files for circus plots for:
     1. Circular study plot with bars .  &rightarrow;  /circos/study/
     2. Circular study plot with bubbles.   &rightarrow;  /circos/study1/
     3. Circular plots with subset of study plots for each substance class. &rightarrow;  /circos/substance1/
     4. Circular substance plots with co-occurrence links and bubbles. &rightarrow;  /circos/study_caffeine/ ...
     

## 1. Classifies Substances and Studies 

In [3]:
# load data
studies = PkdbModel("studies", destination="1-preprocessed") 
substances = PkdbModel("substances", destination="1-preprocessed") 
substances.read()
studies.read()

In [4]:
coocurence_matrix = substance_cooccurrence_matrix(studies.data, substances.data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [5]:
norm_coocurence_matrix = Normalizer(norm="l2").fit_transform(coocurence_matrix)
norm_coocurence_matrix = pd.DataFrame(norm_coocurence_matrix, index=coocurence_matrix.index, columns=coocurence_matrix.columns)

In [6]:
classification_model = AgglomerativeClustering(linkage="single",n_clusters=4)
classes = classification_model.fit_predict(norm_coocurence_matrix.T.corr())

In [7]:
# Classfied substances into types
substance_to_type = pd.DataFrame(classes, index=norm_coocurence_matrix.index, columns=["type"]).to_dict()["type"]

studies.data, substances.data =  add_label_and_type(studies.data, substances.data, substance_to_type)

In [8]:
studies.data

Unnamed: 0,name,substances,creator,curators,groups_count,group_all_count,groups,individuals_count,individuals,interventions_count,outputs_count,outputs_calculated_count,timecourses_count,results_count,type,label
67,Loetsch2006,"normorphine, morphine-6-glucuronide, morphine ...",J G,"M K, J G, D E",1,11,"[{'count': 11, 'name': 'all'}]",11,"['11', '10', '9', '8', '7', '6', '5', '4', '3'...",2,691,691,132,823,2.0,codeine
157,Arias1988,"2-dehydrosparteine, sparteine, sparteine sulfa...",J G,"J G, M K",1,142,"[{'count': 142, 'name': 'all'}]",142,"['203', '202', '201', '200', '199', '198', '19...",1,568,0,0,568,2.0,codeine
54,IKP243,caffeine,M K,M K,1,102,"[{'count': 102, 'name': 'all'}]",102,"['IKP243-2079', 'IKP243-2075', 'IKP243-2074', ...",1,510,0,0,510,0.0,caffeine
33,Balogh1992,caffeine,M K,"M K, J G",1,12,"[{'count': 12, 'name': 'all'}]",12,"['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', ...",1,363,216,72,435,0.0,caffeine
58,Stille1987,"ciprofloxacin, ofloxacin, enoxacin, caffeine",M K,M K,1,12,"[{'count': 12, 'name': 'all'}]",12,"['N', 'M', 'L', 'K', 'I', 'H', 'G', 'F', 'E', ...",13,432,0,0,432,0.0,caffeine
76,Yue1991B,"normorphine, morphine-3-glucuronide, morphine-...",J G,"J G, M K",5,20,"[{'count': 20, 'name': 'all'}, {'count': 8, 'n...",14,"['14_PM', '13_PM', '12_PM', '11_PM', '10_PM', ...",11,413,83,11,424,2.0,codeine
77,Wu2014,"morphine-3-glucuronide, codeine, morphine, mor...",J G,"M K, J G, D E",4,29,"[{'count': 29, 'name': 'all'}, {'count': 10, '...",208,"['8_10/10_M6G', '7_10/10_M6G', '6_10/10_M6G', ...",1,379,96,12,391,2.0,codeine
143,Chen1991,"norcodeine-conjugates, morphine-3-glucuronide,...",J G,"M K, J G, D E",3,8,"[{'count': 8, 'name': 'all'}, {'count': 2, 'na...",8,"['8', '7', '6', '5', '4', '3', '2', '1']",9,360,28,4,364,2.0,codeine
75,Vree1992,"normorphine, morphine-3-glucuronide, morphine-...",J G,"J G, M K",1,8,"[{'count': 8, 'name': 'all'}]",8,"['8', '7', '6', '5', '4', '3', '2', '1']",1,306,40,14,320,2.0,codeine
81,Yue1989-2,"normorphine, morphine-3-glucuronide, morphine-...",J G,"J G, M K",3,132,"[{'count': 132, 'name': 'all'}, {'count': 18, ...",132,"['132_EM', '131_EM', '130_EM', '129_EM', '128_...",2,301,0,0,301,2.0,codeine


In [9]:
studies_data = studies.data.sort_values(by=["label","name"])

In [10]:
import numpy as np

In [11]:
studies_data["group_all_count"]= studies_data["group_all_count"].replace(np.nan, 0)
studies_data["group_all_count"]= studies_data["group_all_count"].astype(int)

## 2. Build config files for circular plots

In [12]:
# for study data
create_config_files(studies_data, "circos/study_final")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data["start"] = range(0,len(data)*rows_per_study,rows_per_study)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data["end"] = range(rows_per_study,(len(data)+1)*rows_per_study,rows_per_study)


In [13]:
studies_data

Unnamed: 0,name,substances,creator,curators,groups_count,group_all_count,groups,individuals_count,individuals,interventions_count,outputs_count,outputs_calculated_count,timecourses_count,results_count,type,label
34,Abernethy1985,caffeine,M K,"M K, J G",3,18,"[{'count': 18, 'name': 'all'}, {'count': 9, 'n...",54,"['1', '2', '3', '4', '5', '6', '7', '8', '9', ...",1,64,0,0,64,0.0,caffeine
32,Akinyinka2000,"theophylline, caffeine, theobromine, paraxanthine",M K,"M K, D E, J G",4,28,"[{'count': 28, 'name': 'all'}, {'count': 10, '...",20,"['1C', '2C', '3C', '4C', '5C', '6C', '7C', '8C...",1,164,32,6,170,0.0,caffeine
29,Amchin1999,"paraxanthine, 17U, 1X, AAMU, 1U, caffeine, ven...",M K,"M K, J G",1,15,"[{'count': 15, 'name': 'all'}]",16,"['1', '2', '3', '4', '5', '6', '7', '8', '9', ...",2,72,16,2,74,0.0,caffeine
30,Backman2008,"caffeine, paraxanthine",M K,"M K, J G",5,71,"[{'count': 71, 'name': 'all'}, {'count': 53, '...",70,"['1', '2', '3', '4', '5', '6', '7', '8', '9', ...",1,140,0,0,140,0.0,caffeine
33,Balogh1992,caffeine,M K,"M K, J G",1,12,"[{'count': 12, 'name': 'all'}]",12,"['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', ...",1,363,216,72,435,0.0,caffeine
28,Balogh1995,"levonorgestrel, ethinylestradiol, caffeine, ge...",M K,"M K, J G",7,20,"[{'count': 20, 'name': 'all'}, {'count': 10, '...",40,"['4', '5', '9', '10', '11', '12', '16', '17', ...",4,150,0,0,150,0.0,caffeine
113,Barnet1990,"pipemidic acid, caffeine, norfloxacin",M K,"M K, J G",1,6,"[{'count': 6, 'name': 'all'}]",6,"['1', '2', '3', '4', '5', '6']",3,120,0,0,120,0.0,caffeine
112,Beach1986,"caffeine, disulfiram",M K,M K,5,21,"[{'count': 21, 'name': 'all'}, {'count': 10, '...",21,"['K', 'J', 'I', 'H', 'G', 'F', 'E', 'D', 'C', ...",3,281,26,4,285,0.0,caffeine
116,Begas2007,"paraxanthine, 17U, AFMU, 1U, caffeine, 1X",M K,"M K, D E, J G",9,44,"[{'count': 44, 'name': 'all'}, {'count': 21, '...",0,,1,18,0,0,18,0.0,caffeine
117,Benowitz1995,"caffeine, paraxanthine",M K,"M K, J G",3,12,"[{'count': 12, 'name': 'all'}, {'count': 5, 'n...",0,,4,34,28,8,42,0.0,caffeine


In [14]:
for label, study_data_label in studies_data.groupby("label"):
    create_config_files(study_data_label, f"circos/study_{label}")


In [15]:
# testing for substances 

In [16]:
frames = []
for substance, data in substances.data.sort_values("label").groupby("label"):
    data["start"] = range(len(data)) 
    data["end"] = range(1,len(data)+1) 
    frames.append(data)
substances_data = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [17]:
#substances_data

In [18]:
len(substances_data)

106

In [19]:
unused_substances = substances_data[ (substances_data["study_number"]<2) | ((substances_data["intervention_number"]==0) & (substances_data["timecourse_number"] == 0)  & (substances_data["output_number"] == 0))]["name"]

In [20]:
substances_data = substances_data[ ~((substances_data["study_number"]<2) | ( (substances_data["intervention_number"]==0) & (substances_data["timecourse_number"] == 0)  & (substances_data["output_number"] == 0)))]

In [21]:
len(substances_data)

61

In [22]:
list(unused_substances)

['AFMU',
 'glycerol',
 '137MU',
 'silexan',
 'carbon monoxide',
 'chlorzoxazone',
 'digoxin',
 'disulfiram',
 'ethinylestradiol',
 'metropolol',
 'lomefloxacin',
 'levonorgestrel',
 'indocyanine green',
 'gestodene',
 'grapefruit juice',
 'L-ascorbic acid',
 'venlafaxine',
 'tolbutamide',
 '17MU',
 '6-hydroxychlorzoxazone',
 '4-hydroxydebrisoquine',
 'mephenytoin',
 'sulfapyridine',
 'sulfasalazine',
 '5-dehydrosparteine',
 'normorphine-glucuronide',
 '2-dehydrosparteine',
 'norcodeine-conjugates',
 'diclofenac',
 '[6-3H]glucose',
 '[U-13C]glucose',
 'exenatide',
 '[3-OMG]glucose',
 'lactate',
 '[2-3H]glucose',
 'GLP-1',
 'GIP',
 'nizatidine',
 'propanolol',
 'propacetamol',
 'sorbitol',
 'imatinib',
 'sulfinpyrazone',
 'panadol extend',
 'watercress']

In [23]:
all_links = create_links(substances_data,studies_data,list(unused_substances))


In [24]:
length = pd.DataFrame()
length["substance_links"] = all_links["substance1_start"].append(all_links["substance2_start"])
length["label"] = all_links["substance1_label"].append(all_links["substance2_label"])

In [25]:
frames = []
for label, data_label in length.groupby(["label"]):
    start = 0 
    
    
    for substance_link, substance_linkpositions in data_label.groupby(["substance_links"]):    
        label_position = pd.Series()
        label_position["start"] = start
        label_position["end"] = start + len(substance_linkpositions)
        label_position["label"]  = label
        label_position["substance_number"]  = substance_link

        frames.append(label_position)
        start = label_position["end"] + 1
res = pd.concat(frames, axis=1).T    
                                                         
                                            

In [36]:
substances_data["start"]

29      0
16      1
10      2
5       3
50      4
15      6
2       7
6       8
17      9
104    11
4      12
63     13
45     14
13     15
49     16
79     17
76     36
0      37
101    38
80     39
40     40
8      41
7      42
89     43
41     44
88     45
66     46
33     47
100    48
77      0
       ..
74      4
37      5
67      7
70      9
69     10
39     11
36     12
95     15
96     16
68     18
72     19
34     22
92     23
59      1
53      4
56      7
52      8
75      9
46     11
19     12
38     15
28     16
27      0
82      1
85      2
86      4
32      5
87      6
84     11
83     14
Name: start, Length: 61, dtype: int64

In [39]:
res["substance_number"] = res["substance_number"].astype(int)

In [40]:
substance_data = pd.merge(res,substances_data[["name","label","start","study_number","timecourse_number","output_number","output_raw_number","output_calculated_number","intervention_number"]],left_on=["label","substance_number"], right_on = ["label","start"])
substance_data["start"] = substance_data["start_x"]

In [41]:
links_new = pd.merge(all_links, substance_data[["start","end","start_y","label"]], left_on=["substance1_label","substance1_start"], right_on = [ "label","start_y"])
links_new[["substance1_start","substance1_end"]] = links_new[["start","end"]]
links_new = pd.merge(links_new, substance_data[["start","end","start_y","label"]], left_on=["substance2_label","substance2_start"], right_on = [ "label","start_y"])
links_new[["substance2_start","substance2_end"]] = links_new[["start_y","end_y"]]


In [42]:
all_links = links_new[["substance1_label","substance1_start" ,"substance1_end", "substance2_label","substance2_start" ,"substance2_end"]]

In [43]:
frames = []
for (label,start), data in all_links.groupby(["substance1_label","substance1_start"]):
    data["substance1_start"] = start + range(len(data))
    data["substance1_end"] = data["substance1_start"] + 1
    data["substance1_label"] = label

    frames.append(data)
    
all_links = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [44]:
frames = []
for (label,end), data in all_links.groupby(["substance2_label","substance2_end"]):
    data["substance2_end"] = end - range(len(data)) 
    data["substance2_start"] = data["substance2_end"] - 1
    data["substance2_label"] = label

    frames.append(data)
    
all_links = pd.concat(frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [45]:
all_links.to_csv("circos/substance_final/data/links.txt", sep=" ",header=False,index=False)

In [46]:
ideogram = pd.DataFrame(res.sort_values("label").groupby(["label"]).apply(lambda x:  x["end"].max()),columns = ["len"])
ideogram["label"] = ideogram.index

In [47]:
ideogram

Unnamed: 0_level_0,len,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1
caffeine,873,caffeine
codeine,1163,codeine
glucose,302,glucose
paracetamol,247,paracetamol


In [48]:
ticks_dict = {}
for label, data in substance_data.groupby("label"):
    mystring = ""
    for x in list(data["start"]):
        mystring += f"{x}u, "
    data["end"].max()
    mystring += f"{data['end'].max()}u"
    ticks_dict[label]=mystring

In [49]:
from pprint import pprint
pprint(ticks_dict)

{'caffeine': '0u, 134u, 153u, 172u, 191u, 200u, 219u, 238u, 257u, 267u, 276u, '
             '291u, 316u, 331u, 356u, 375u, 390u, 410u, 429u, 477u, 513u, '
             '560u, 584u, 602u, 622u, 656u, 765u, 803u, 820u, 873u',
 'codeine': '0u, 88u, 210u, 292u, 412u, 553u, 672u, 788u, 845u, 851u, 871u, '
            '887u, 895u, 1006u, 1149u, 1163u',
 'glucose': '0u, 55u, 114u, 139u, 185u, 203u, 228u, 245u, 270u, 302u',
 'paracetamol': '0u, 8u, 69u, 77u, 103u, 109u, 161u, 213u, 247u'}


In [50]:
directory = "circos/substance_final"
with open(f"{directory}/data/ideogram.txt","w") as f:
        number = 0
        for idx, substance in ideogram.iterrows():

            number = number + 1
            #color_mapping[substance.label] = f"fill_color=spectral-5-div-{number}"
            f.write(f"chr - {substance.label} {substance.label} 0 {substance.len} pastel2-6-qual-{number}\n")


In [51]:
#names 2d track
substance_data["name"] = substance_data["name"].apply(lambda x: x.replace(" ","&nbsp;"))
substance_data[["label","start","end","name"]].to_csv(f"{directory}/data/substance_names.txt", sep=" ",header=False,index=False)
# all subjects number for number track
substance_data[["label","start","end","study_number"]].to_csv(f"{directory}/data/study_number.txt", sep=" ",header=False,index=False)
substance_data[["label","start","end","timecourse_number"]].to_csv(f"{directory}/data/timecourse_number.txt", sep=" ",header=False,index=False)
substance_data[["label","start","end","output_number"]].to_csv(f"{directory}/data/output_number.txt", sep=" ",header=False,index=False)
substance_data[["label","start","end","intervention_number"]].to_csv(f"{directory}/data/intervention_number.txt", sep=" ",header=False,index=False)

bubbles_data_dict = bubbles_data(substance_data,25,"substance")
for name, data in bubbles_data_dict.items():
        data[["label","start","end","type","circle_type"]].to_csv(f"{directory}/data/{name}_bubble.txt", sep=" ",header=False,index=False)


In [None]:
#old stlye write upper part into utlis !

#create_config_files(substances_data, "circos/substance_final", "substance")
#create_links(substances_data,studies_data).to_csv("circos/substance_final/data/links.txt", sep=" ",header=False,index=False)

In [None]:
#create_links(substances_data,studies_data)