In [117]:
import pandas as pd

from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.cluster import AgglomerativeClustering

#my utils
from utils import  PkdbModel
from circos_utils import substance_cooccurrence_matrix \
    ,study_expand, bubbles_data, find_label, add_label_and_type, create_config_files

In [118]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Configuration for Circular Plots

This notebook creates the configuration files for the circular plots.

Tasks performed by the notebook:

 1. Classifies substances by co-occurrence in studies. Derived substances have already been replaced by their parents in the database. The label for a class is chosen to be the most frequent substance within that class.
 2. Classifies studies into the same classes as substances. The most frequent class of substances used in a study is chosen to be the class of the study.
 3. Builds configuration files for circos plots for:
     1. Circular study plot with bars.  &rightarrow;  /circos/study/
     2. Circular study plot with bubbles.   &rightarrow;  /circos/study1/
     3. Circular plots with subset of study plots for each substance class. &rightarrow;  /circos/study_caffeine/ ...
     4. Circular substance plots with co-occurrence links and bubbles. &rightarrow;  /circos/substance1/
     

## 1. Classifies Substances and Studies 

In [119]:
# Set up the accessors for the preprocessed study and substance tables.
studies = PkdbModel("studies", destination="1-preprocessed")
substances = PkdbModel("substances", destination="1-preprocessed")

# Read both tables (same order as before: substances first, then studies).
# The cells below access the loaded tables through `.data`.
for model in (substances, studies):
    model.read()

In [120]:
# Co-occurrence of substances across studies, built from both loaded tables.
coocurence_matrix = substance_cooccurrence_matrix(
    studies.data,
    substances.data,
)

In [121]:
# Scale every row to unit L2 norm. The transformer's result is wrapped back
# into a DataFrame carrying the original row and column labels.
l2_scaled_rows = Normalizer(norm="l2").fit_transform(coocurence_matrix)
norm_coocurence_matrix = pd.DataFrame(
    l2_scaled_rows,
    index=coocurence_matrix.index,
    columns=coocurence_matrix.columns,
)

In [122]:
# Number of clusters requested from the agglomerative clustering.
N_SUBSTANCE_CLASSES = 4

classification_model = AgglomerativeClustering(
    n_clusters=N_SUBSTANCE_CLASSES,
    linkage="single",
)

# .T.corr() yields the pairwise correlation between the rows of the
# normalised matrix; each row of that correlation matrix is then used as the
# feature vector for clustering.
row_correlations = norm_coocurence_matrix.T.corr()
classes = classification_model.fit_predict(row_correlations)

In [123]:
# Classified substances into types: map every row label of the normalised
# matrix to the cluster id assigned to it.
substance_to_type = pd.Series(
    classes,
    index=norm_coocurence_matrix.index,
    name="type",
).to_dict()

# The helper returns the updated study and substance tables, which replace
# the ones held by the two models.
studies.data, substances.data = add_label_and_type(
    studies.data,
    substances.data,
    substance_to_type,
)

In [124]:
# Preview only the first rows instead of rendering the whole table; this keeps
# the saved notebook small and limits how much curator data ends up in the output.
studies.data.head(10)

Unnamed: 0,name,substances,creator,curators,groups_count,group_all_count,groups,individuals_count,individuals,interventions_count,outputs_count,outputs_calculated_count,timecourses_count,results_count,type,label
10,Balogh1992,caffeine,M K,"M K, J G",1,12.0,"[{'count': 12, 'name': 'all'}]",12,"['M', 'L', 'K', 'I', 'H', 'G', 'F', 'E', 'D', ...",1,651,504,72,723,0.0,caffeine
38,Loetsch2006,"morphine, morphine-6-glucuronide, normorphine,...",J G,"M K, J G",1,11.0,"[{'count': 11, 'name': 'all'}]",11,"['11', '10', '9', '8', '7', '6', '5', '4', '3'...",2,546,546,132,678,2.0,codeine
17,Test243,caffeine,M K,M K,1,102.0,"[{'count': 102, 'name': 'all'}]",102,"['IKP243-2079', 'IKP243-2075', 'IKP243-2074', ...",1,510,0,0,510,0.0,caffeine
69,Stille1987,"caffeine, ciprofloxacin, ofloxacin, enoxacin",M K,M K,1,12.0,"[{'count': 12, 'name': 'all'}]",12,"['N', 'M', 'L', 'K', 'I', 'H', 'G', 'F', 'E', ...",13,432,0,0,432,0.0,caffeine
55,Yue1991B,"morphine, morphine-3-glucuronide, morphine-6-g...",J G,J G,5,20.0,"[{'count': 20, 'name': 'all'}, {'count': 8, 'n...",14,"['14_PM', '13_PM', '12_PM', '11_PM', '10_PM', ...",11,398,72,11,409,2.0,codeine
42,Chen1991,"morphine, morphine-3-glucuronide, morphine-6-g...",J G,"M K, J G",3,8.0,"[{'count': 8, 'name': 'all'}, {'count': 2, 'na...",8,"['8', '7', '6', '5', '4', '3', '2', '1']",8,344,24,4,348,2.0,codeine
61,Vree1992,"morphine, morphine-3-glucuronide, morphine-6-g...",J G,J G,1,8.0,"[{'count': 8, 'name': 'all'}]",8,"['8', '7', '6', '5', '4', '3', '2', '1']",1,301,35,14,315,2.0,codeine
54,Yue1989-2,"morphine, morphine-3-glucuronide, morphine-6-g...",J G,J G,3,132.0,"[{'count': 132, 'name': 'all'}, {'count': 18, ...",132,"['132_EM', '131_EM', '130_EM', '129_EM', '128_...",2,299,0,0,299,2.0,codeine
32,Beach1986,"caffeine, disulfiram",M K,M K,3,21.0,"[{'count': 21, 'name': 'all'}, {'count': 10, '...",21,"['K', 'J', 'I', 'H', 'G', 'F', 'E', 'D', 'C', ...",3,254,19,4,258,0.0,caffeine
21,McLean2002,"caffeine, paraxanthine, theophylline, theobromine",M K,M K,3,14.0,"[{'count': 14, 'name': 'all'}, {'count': 8, 'n...",0,,1,190,134,48,238,0.0,caffeine


In [127]:
# Order the studies by class label first and by study name second.
# sort_values returns a new frame, so studies.data itself is left untouched.
sort_keys = ["label", "name"]
studies_data = studies.data.sort_values(by=sort_keys)

## 2. Build config files for circular plots

In [130]:
# Config files for the plot that covers all studies.
all_studies_dir = "circos/study_final"
create_config_files(studies_data, all_studies_dir)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  studies_data[["label","start","end","name"]].to_csv(f"{directory}/data/study_names.txt", sep=" ",header=False,index=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # all subjects number for number track


In [129]:
# One set of config files per class label, written to circos/study_<label>.
for label, study_data_label in studies_data.groupby("label"):
    # Pass an independent copy of the group: groupby yields sub-frames derived
    # from studies_data, and the helper emits SettingWithCopyWarning when it
    # assigns columns. A copy guarantees such assignments can never reach the
    # shared frame.
    # NOTE(review): the helper is not visible here; warnings raised on slices
    # it creates internally must be fixed inside create_config_files itself.
    create_config_files(study_data_label.copy(), f"circos/study_{label}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  #names 2d track
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  studies_data[["label","start","end","name"]].to_csv(f"{directory}/data/study_names.txt", sep=" ",header=False,index=False)
