In [1]:
import sqlalchemy as sa
import pandas as pd
import numpy as np

In [69]:
DATABASE_URL = 'postgresql://meditreats:meditreats@df-treats-db.cs6hxh6ocizm.us-west-2.rds.amazonaws.com:5432'

engine = sa.create_engine(DATABASE_URL)
conn = engine.connect()





In [3]:
measures = pd.read_sql_query("""
    SELECT * from measures
""", conn)

groups = pd.read_sql_query("""
    SELECT groups.id as group_id, groups.title as title, groups.study as study,
           string_agg(cast(treatments.id as varchar), ', ') as treatments
    FROM groups
    JOIN administrations ON groups.id = administrations.group
    JOIN treatments ON treatments.id = administrations.treatment
    GROUP BY groups.id
""", conn)


In [4]:
from itertools import combinations

diff_df = {
    'study': [],
    'bases': [],
    'diffs': [],
    'group_pairs': []
}

def pairwise_comparison(group):
    row_pairs = list(combinations(group.itertuples(), 2))
    for row1, row2 in row_pairs:
        # Perform the comparison here
        treatment1 = row1[11]
        treatment2 = row2[11]
        
        study = row1[2]
        
        group1 = row1[9]
        group2 = row2[9]
        
        diff = list(set(treatment1.split(',')) - set(treatment2.split(','))) + list(set(treatment2.split(',')) - set(treatment1.split(',')))
        base = list(set(treatment1.split(',')) & set(treatment2.split(',')))
        
        diff_df['bases'].append(base)
        diff_df['diffs'].append(diff)
        diff_df['group_pairs'].append((group1, group2))
        diff_df['study'].append(study)
        

measures.merge(groups, on='study').groupby("id").apply(pairwise_comparison)

In [5]:
diffs = pd.DataFrame.from_dict(diff_df)

In [6]:
diffs

Unnamed: 0,study,bases,diffs,group_pairs
0,NCT03266419,[266],[],"(35418, 35419)"
1,NCT03266419,[266],[],"(35418, 35419)"
2,NCT03260894,[],"[5787, 65, 2722]","(35421, 35422)"
3,NCT03260894,[],"[5787, 65, 2722]","(35421, 35422)"
4,NCT03260894,[],"[5787, 65, 2722]","(35421, 35422)"
...,...,...,...,...
7301594,NCT00352144,[],"[2182, 817]","(98187, 98188)"
7301595,NCT00352144,[],"[2182, 817]","(98187, 98188)"
7301596,NCT00352144,[],"[2182, 817]","(98187, 98188)"
7301597,NCT00352144,[],"[2182, 817]","(98187, 98188)"


In [9]:
# Get the conditions to merge them in
study_conditions = pd.read_sql_query("""
    SELECT * from study_conditions
""", conn)

study_conditions

Unnamed: 0,id,study,condition
0,0,NCT03266419,0
1,1,NCT03262441,1
2,2,NCT03260894,2
3,3,NCT03267940,3
4,4,NCT03267940,4
...,...,...,...
48276,48276,NCT03030989,8836
48277,48277,NCT03038880,1313
48278,48278,NCT03038880,3414
48279,48279,NCT03031496,282


In [11]:
diffs = diffs.merge(study_conditions[['study', 'condition']], on='study')

In [17]:
diffs['bases'] = diffs['bases'].astype(str)
diffs['diffs'] = diffs['diffs'].astype(str)
diffs['group_pairs'] = diffs['group_pairs'].astype(str)
diffs

Unnamed: 0,study,bases,diffs,group_pairs,condition
0,NCT03266419,['266'],[],"(35418, 35419)",0
1,NCT03266419,['266'],[],"(35418, 35419)",0
2,NCT03260894,[],"['5787', '65', ' 2722']","(35421, 35422)",2
3,NCT03260894,[],"['5787', '65', ' 2722']","(35421, 35422)",2
4,NCT03260894,[],"['5787', '65', ' 2722']","(35421, 35422)",2
...,...,...,...,...,...
11107661,NCT00352144,[],"['2182', '817']","(98187, 98188)",817
11107662,NCT00352144,[],"['2182', '817']","(98187, 98188)",817
11107663,NCT00352144,[],"['2182', '817']","(98187, 98188)",817
11107664,NCT00352144,[],"['2182', '817']","(98187, 98188)",817


In [19]:
diffs = diffs.drop_duplicates()

In [25]:
# Parse back to objects
import ast

diffs['bases'] = diffs['bases'].apply(lambda x: [int(y) for y in ast.literal_eval(x)])
diffs['diffs'] = diffs['diffs'].apply(lambda x: [int(y) for y in ast.literal_eval(x)])
diffs['group_pairs'] = diffs['group_pairs'].apply(ast.literal_eval)
diffs


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diffs['bases'] = diffs['bases'].apply(lambda x: [int(y) for y in ast.literal_eval(x)])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diffs['diffs'] = diffs['diffs'].apply(lambda x: [int(y) for y in ast.literal_eval(x)])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diffs['group_pairs'] = diffs['g

Unnamed: 0,study,bases,diffs,group_pairs,condition
0,NCT03266419,[266],[],"(35418, 35419)",0
2,NCT03260894,[],"[5787, 65, 2722]","(35421, 35422)",2
5,NCT03267940,"[2534, 3289]","[3282, 3922, 5967, 5968]","(35423, 35424)",3
6,NCT03267940,"[2534, 3289]","[3282, 3922, 5967, 5968]","(35423, 35424)",4
7,NCT03267940,"[2534, 3289]","[3282, 3922, 5967, 5968]","(35423, 35424)",5
...,...,...,...,...,...
11107530,NCT03038880,[],"[7424, 2182]","(98182, 98184)",3414
11107531,NCT03038880,[],"[10187, 2954, 7424, 2182]","(98183, 98184)",1313
11107532,NCT03038880,[],"[10187, 2954, 7424, 2182]","(98183, 98184)",3414
11107641,NCT03031496,"[149, 5033]",[],"(98185, 98186)",282


In [33]:
# Make the group pairs table

group_pairs = diffs['group_pairs'].drop_duplicates().apply(lambda x: pd.Series(x)).rename(columns ={
    0: 'group_a',
    1: 'group_b'
})
group_pairs

Unnamed: 0,group_a,group_b
0,35418,35419
2,35421,35422
5,35423,35424
9,35423,35425
13,35423,35426
...,...,...
11107527,98182,98183
11107529,98182,98184
11107531,98183,98184
11107641,98185,98186


In [36]:
# Make group pairs diff table
diffs[['group_a', 'group_b']] = diffs['group_pairs'].apply(lambda x: pd.Series(x)).rename(columns ={
    0: 'group_a',
    1: 'group_b'
})
diffs.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diffs[['group_a', 'group_b']] = diffs['group_pairs'].apply(lambda x: pd.Series(x)).rename(columns ={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diffs[['group_a', 'group_b']] = diffs['group_pairs'].apply(lambda x: pd.Series(x)).rename(columns ={


Unnamed: 0,study,bases,diffs,group_pairs,condition,group_a,group_b
0,NCT03266419,[266],[],"(35418, 35419)",0,35418,35419
2,NCT03260894,[],"[5787, 65, 2722]","(35421, 35422)",2,35421,35422
5,NCT03267940,"[2534, 3289]","[3282, 3922, 5967, 5968]","(35423, 35424)",3,35423,35424
6,NCT03267940,"[2534, 3289]","[3282, 3922, 5967, 5968]","(35423, 35424)",4,35423,35424
7,NCT03267940,"[2534, 3289]","[3282, 3922, 5967, 5968]","(35423, 35424)",5,35423,35424


In [38]:
# Reset group pairs id

group_pairs.insert(0, 'group_pairs_id', range(0, len(group_pairs)))

In [40]:
# Add treatment diff id

diffs.insert(0, 'diff_id', range(0, len(diffs)))

In [39]:
group_pairs.head()

Unnamed: 0,group_pairs_id,group_a,group_b
0,0,35418,35419
2,1,35421,35422
5,2,35423,35424
9,3,35423,35425
13,4,35423,35426


In [41]:
diffs.head()

Unnamed: 0,diff_id,study,bases,diffs,group_pairs,condition,group_a,group_b
0,0,NCT03266419,[266],[],"(35418, 35419)",0,35418,35419
2,1,NCT03260894,[],"[5787, 65, 2722]","(35421, 35422)",2,35421,35422
5,2,NCT03267940,"[2534, 3289]","[3282, 3922, 5967, 5968]","(35423, 35424)",3,35423,35424
6,3,NCT03267940,"[2534, 3289]","[3282, 3922, 5967, 5968]","(35423, 35424)",4,35423,35424
7,4,NCT03267940,"[2534, 3289]","[3282, 3922, 5967, 5968]","(35423, 35424)",5,35423,35424


In [43]:
group_pairs_diff = diffs.merge(group_pairs, on=['group_a', 'group_b'])[['diff_id', 'group_pairs_id']]
group_pairs_diff.head()

Unnamed: 0,diff_id,group_pairs_id
0,0,0
1,1,1
2,2,2
3,3,2
4,4,2


In [50]:
# Diff treatments diff

diffs_treatments_diff = diffs[['diff_id', 'diffs']].explode('diffs').dropna()

In [51]:
# baseline treatments diff

base_treatments_diff = diffs[['diff_id', 'bases']].explode('bases').dropna()

In [52]:
# true diffs

diffs = diffs[['diff_id', 'condition']]

### Upload all tables to database

In [56]:
# group_pairs

group_pairs.rename(columns = {
    'group_pairs_id': 'id',
}).to_sql('group_pairs', conn, index=False, if_exists='append')

282

In [59]:
# diffs

diffs.rename(columns={
    'diff_id': 'id'
}).to_sql('treatment_diffs', conn, index=False, if_exists='append')

982

In [61]:
diffs_treatments_diff.insert(0, 'id', range(0, len(diffs_treatments_diff)))

In [64]:
diffs_treatments_diff.rename(columns = {
    'diff_id': 'treatment_diff',
    'diffs': 'treatment'
}).to_sql('diff_treatments_diffs', conn, index=False, if_exists='append')

693

In [66]:
base_treatments_diff.insert(0, 'id', range(0, len(base_treatments_diff)))

In [70]:
base_treatments_diff.rename(columns ={
    'diff_id': 'treatment_diff',
    'bases': 'treatment'
}).to_sql('base_treatments_diffs', conn, index=False, if_exists='append')

267

In [79]:
# group pairs diffs

group_pairs_diff.insert(0, 'id', range(0, len(group_pairs_diff)))

In [82]:
group_pairs_diff.rename(columns ={
    'diff_id': 'treatment_diff',
    'group_pairs_id': 'group_pair'
}).to_sql('group_pair_diffs', conn, index=False, if_exists='append')

982