# Case-Control Finder  
## Finds cases and controls for a given condition within the Sequence Read Archive

__Import dependencies and load data__

In [None]:
# Load the rpy2 IPython extension so that the %%R cells in this notebook can run.
%load_ext rpy2.ipython

In [None]:
%%bash
# Install the pinned rjson 0.2.20 source package into the local R library.
# Abort on the first failing command so a failed download is never followed
# by an attempt to install a missing or truncated tarball.
set -euo pipefail

pkg=rjson_0.2.20.tar.gz
cran=https://cran.r-project.org/src/contrib

# Skip the download when a non-empty tarball is already present (re-running
# the cell would otherwise save duplicate copies next to it).
if [ ! -s "$pkg" ]; then
    # CRAN keeps only the current release of a package in src/contrib and
    # moves superseded releases to Archive/<package>/, so try both locations.
    wget -O "$pkg" "$cran/$pkg" \
        || wget -O "$pkg" "$cran/Archive/rjson/$pkg" \
        || { rm -f "$pkg"; exit 1; }
fi

R CMD INSTALL "$pkg"

In [None]:
%%R
# Attach the rjson package in the embedded R session.
# NOTE(review): presumably required by the Metadata_*.R scripts sourced later - confirm.
library(rjson)

In [None]:
import json
import pandas as pd
from functions import *

# Locations of the JSON metadata files, all relative to ./data.
experiment_to_terms_f_json = './data/experiment_to_terms.json'
term_name_to_id_f = './data/term_name_to_id.json'
experiments_in_hackathon_data_f = './data/experiments_in_hackathon_data.json'
experiment_to_type_f = './data/experiment_to_type.json'
experiment_to_study_f = './data/experiment_to_study.json'
experiment_to_real_value_terms_f = './data/experiment_to_real_value_terms.json'
experiment_to_runs_f = './data/experiment_to_runs.json'


def _load_json(path):
    """Open `path` for reading and return the decoded JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)


# One lookup structure per metadata file.
sample_to_terms = _load_json(experiment_to_terms_f_json)
term_name_to_id = _load_json(term_name_to_id_f)
# Converted to a set so the membership test in the filter below is cheap.
available = set(_load_json(experiments_in_hackathon_data_f))
sample_to_type = _load_json(experiment_to_type_f)
sample_to_study = _load_json(experiment_to_study_f)
sample_to_real_val = _load_json(experiment_to_real_value_terms_f)
sample_to_runs = _load_json(experiment_to_runs_f)

# When enabled, keep only the entries of sample_to_terms whose key is listed
# in `available`; the other lookups are left unfiltered.
filter_available = True
if filter_available:
    sample_to_terms = {
        experiment: terms
        for experiment, terms in sample_to_terms.items()
        if experiment in available
    }

In [None]:
%%R
# Read the tab-separated experiment-to-terms table into the R session.
# header = FALSE: the first line of the file is treated as data, not column names.
# NOTE(review): presumably consumed by the Metadata_*.R scripts sourced later - confirm.
metadata_file_tsv <- read.table(file = "./data/experiment_to_terms.tsv", header = FALSE, sep = "\t")

__1. Type in the term you are looking for (in place of `'glioblastoma multiforme'`)__

In [None]:
# Condition term to search for; it is passed to term_to_run and
# match_case_to_controls when the cases and controls are built.
term = 'glioblastoma multiforme' ## <-- INPUT HERE

__2. List the terms to remove from the control set__  


In the example below, `'disease', 'disease of cellular proliferation'` will be removed from the controls

In [None]:
# Terms handed to match_case_to_controls as its blacklist argument.
blacklist_terms = {
    'disease',
    'disease of cellular proliferation',
} ## <-- INPUT HERE

__3. Create cases and controls__

In [None]:
# Split the samples for the chosen term; term_to_run returns a pair that is
# unpacked here as (case, control).
case, control = term_to_run(sample_to_terms, term)

# Match cases to controls using the metadata lookups loaded earlier.
ret = match_case_to_controls(
    term,
    control,
    case,
    sample_to_terms,
    sample_to_study,
    blacklist_terms,
    term_name_to_id,
    sample_to_type,
    filter_poor=True,
    filter_cell_line=True,
    filter_differentiated=True,
    sample_to_runs=sample_to_runs,
    by_run=False,
)

# Give the first four elements of the result descriptive names.
df, control_confound, case_confound, tissue_intersections = (
    ret[0], ret[1], ret[2], ret[3]
)

These are the tissues and cell types we could match between cases and controls:

In [None]:
# Sort the matched tissue / cell-type terms and wrap them in a Series for display.
pd.Series(sorted(tissue_intersections))

Here's a sample of some of the cases:

In [None]:
# First rows of df whose 'condition' column equals 'case'.
df[df['condition'].eq('case')].head()

Here's a sample of some of the controls:

In [None]:
# First rows of df whose 'condition' column equals 'control'.
df[df['condition'].eq('control')].head()

__4. Browse other metadata terms that are associated with cases and controls__

Choose whether to view cases or controls: set the following variable to `True` to view cases or to `False` to view controls:

In [None]:
# True selects the 'case' condition, False the 'control' condition, for the subset browsed below.
view_cases = False ## <-- INPUT HERE

Enter the tissue or cell type on which to subset your samples:

In [None]:
# Tissue or cell type to subset on. It is stored under its own name so that
# the condition held in `term` (used to build the cases and controls) is not
# overwritten; reusing `term` here would make a re-run of the case/control
# cell silently search for the tissue instead of the condition.
tissue_term = 'blood' ## <-- INPUT HERE

# Translate the boolean switch into the label used in df's 'condition' column.
condition = 'case' if view_cases else 'control'
view_exps = select_case_control_experiment_set(df, condition, tissue_term)

# Persist the selection as JSON.
# NOTE(review): presumably read by the Metadata_*.R scripts sourced below - confirm.
with open('./data/term-in.json', 'w') as f:
    json.dump(view_exps, f)

The following plots the proportion of metadata terms for those terms that appear in at least 10% of the samples in the current subset:

In [None]:
%%R
# Run the plotting script, then evaluate `bp` so that it is displayed.
# NOTE(review): `bp` is presumably the plot object created by Metadata_plot.R - confirm.
source("./Metadata_plot.R")
bp

In [None]:
%%R
# Run the table script, then evaluate its result so that it is displayed.
# NOTE(review): `query_disease_metadata_top10_table` is presumably defined by Metadata_table.R - confirm.
source("./Metadata_table.R")
query_disease_metadata_top10_table

In [None]:
%%R
# Run the pie-chart script, then evaluate `query_cell_line` so that it is displayed.
# NOTE(review): `query_cell_line` is presumably defined by Metadata_piecharts.R - confirm.
source("./Metadata_piecharts.R")
query_cell_line

__5. Create output file__

Enter the name of the file to which the cases and controls should be written:

In [None]:
# Destination path of the CSV export.
output_file = 'cases_vs_controls.csv' ## <- OUTPUT FILE HERE

# Writes the whole of df (case and control rows alike); with to_csv's
# defaults the DataFrame index is written as the first column.
df.to_csv(output_file)