In [15]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Get All Possible Mutations
Using the matrix data, grab all possible mutations

In [16]:
# The matrix file is tab-separated despite its .csv extension; index_col=False
# tells pandas not to use the first column as the index.
matrix_df = pd.read_csv(
    '/Users/nathanielblack/Downloads/MatrixGenomDt_Mt_T.csv',
    index_col=False,
    sep='\t',
)
# Every column header except the 'Mut' label column is a mutation identifier.
all_mutations = list(filter(lambda name: name != 'Mut', matrix_df.columns))

# Get the IDs We Use in Viz2
- Load the data used in Viz2
- Pivot the data so we have a column for each *Sample* with the *Clinical_Significance* as the value

In [17]:
# Load the genomic table; later cells read its Mut, Sample, Population and
# Clinical_Significance columns.
genomic_df = pd.read_csv(
    filepath_or_buffer='/Users/nathanielblack/Downloads/genomicDtMt.csv',
)

In [18]:
# Turn the list of mutation names into a one-column frame ('Mut') so it can be
# used as the left side of a merge. Note: this rebinds `all_mutations` from a
# list to a DataFrame.
all_mutations = pd.DataFrame(data=all_mutations, columns=['Mut'])

In [19]:
# Clinical_Significance code -> severity score: codes 0-3 score 0, code 4
# scores -1 and code 5 scores -2. Codes missing from the map become NaN.
severity_map = {
    0: 0,
    1: 0,
    2: 0,
    3: 0,
    4: -1,
    5: -2,
}
genomic_df['Severity'] = genomic_df['Clinical_Significance'].map(severity_map)

# One row per mutation, one column per sample, severity as the cell value
# (pivot_table aggregates with its default, the mean, if a pair repeats).
# reset_index turns 'Mut' back into an ordinary column for the later merge.
people_columns = genomic_df.pivot_table(
    index='Mut',
    columns='Sample',
    values='Severity',
).reset_index()

# Combine All Mutations and Viz2 Data
- Left join the Viz2 data onto the full list of mutations
- Replace NA with 0

Result will be a matrix of mutation by person with severity as the values

In [20]:
# Left join keeps every known mutation, including those no sample carries;
# the resulting gaps are filled with 0, and 'Mut' becomes the index.
mut_by_person_df = (
    all_mutations
    .merge(people_columns, how='left', on='Mut')
    .fillna(0)
    .set_index('Mut')
)

# Add Population Info
- Create an ID-to-population map
- Append the population info to the mutation by person matrix

In [21]:
# Distinct (Sample, Population) pairs act as the sample -> population lookup.
population_info = genomic_df[['Sample', 'Population']].drop_duplicates()

# Transpose so each row is a sample, expose the sample id as a column to join
# on, attach its population, then index the rows by (Population, Sample).
pop_df = (
    mut_by_person_df.T
    .assign(Sample=lambda frame: frame.index)
    .merge(population_info, how='left', on='Sample')
    .set_index(['Population', 'Sample'])
)

# Heatmap
We now have a dataframe that can be used as a heatmap

In [22]:
# NOTE: disabled legacy cell, kept for reference only. It draws one seaborn
# heatmap per population, for every value of genomic_df.Population.unique()
# except the last. As written it uses the Python 2 `print` statement and the
# `.ix` indexer (removed in pandas 1.0); port to print(...) and `.loc[pop]`
# before re-enabling.
#for pop in genomic_df.Population.unique().tolist()[:-1]:
#    print "*"*20 + pop + "*"*20
#    ax = sns.heatmap(pop_df.ix[pop])
#    ax.xaxis.set_visible(False)
#    ax.yaxis.set_visible(False)
#    plt.ylabel('Person')
#    plt.xlabel('Mutation')
#    plt.show();

In [23]:
# Preview the first five rows of the sample-by-mutation severity matrix.
pop_df.head(n=5)

Unnamed: 0_level_0,Mut,chrMT-827-G,chrMT-951-A,chrMT-961-CG,chrMT-980-C,chrMT-990-C,chrMT-1005-C,chrMT-1007-A,chrMT-1008-G,chrMT-1018-A,chrMT-1027-G,...,chrMT-15553-A,chrMT-15607-G,chrMT-15637-T,chrMT-15649-G,chrMT-15670-C,chrMT-15682-G,chrMT-15758-G,chrMT-15784-C,chrMT-15812-A,chrMT-15927-A
Population,Sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
GBR,HG00096,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GBR,HG00097,0,0,0,0,0,0,0,0,0,0,...,0,-1,0,0,0,0,0,0,0,0
GBR,HG00099,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
GBR,HG00100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
GBR,HG00101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Move Population/Sample out of the index so they can serve as id columns.
pop_df = pop_df.reset_index()
# Every remaining column is a mutation and gets unpivoted.
melt_vars = [
    column for column in pop_df.columns
    if column not in ['Population', 'Sample']
]
# Wide -> long: one row per (Population, Sample, mutation) with its severity.
pop_df = pd.melt(
    pop_df,
    id_vars=['Population', 'Sample'],
    value_vars=melt_vars,
    value_name='Severity',
)

In [33]:
# Work on an independent copy so the melted `pop_df` is left untouched.
out = pop_df.copy(deep=True)
# The former `out.sort(['Population'])` call was removed: its sorted result was
# never assigned, so it had no effect, and `DataFrame.sort` no longer exists in
# pandas >= 0.20 (the call raises AttributeError there). Row order does not
# matter here because category codes depend only on the set of values.
# Integer code per distinct mutation name, stored in 'col'.
out['col'] = pd.Categorical(out.Mut).codes

In [34]:
# Preview the first five rows of the long-format frame with its 'col' codes.
out.head(n=5)

Unnamed: 0,Population,Sample,Mut,Severity,col
0,GBR,HG00096,chrMT-827-G,0,102
1,GBR,HG00097,chrMT-827-G,0,102
2,GBR,HG00099,chrMT-827-G,0,102
3,GBR,HG00100,chrMT-827-G,0,102
4,GBR,HG00101,chrMT-827-G,0,102


In [38]:
# sort_values returns a new frame, so `out` itself is never modified.
# (`DataFrame.sort` was removed in pandas 0.20; sort_values is its replacement.)
df = out.sort_values('Population')
# Integer code per distinct sample id, stored in 'row'.
df['row'] = pd.Categorical(df.Sample).codes
# Largest row code, i.e. number of distinct samples minus one. The call form
# of print works on both Python 2 and Python 3.
print(df.row.max())
# Keep only the non-zero severities, then order the remaining cells by
# population, row and column. Chaining avoids an in-place sort on a filtered
# slice of the frame.
df = df[df.Severity != 0].sort_values(['Population', 'row', 'col'])
df.to_csv('/Users/nathanielblack/Dropbox/d3/heatmaps/overall_heatmap.csv', index=False)

1271


In [30]:
# Distinct populations in order of first appearance, minus the last one.
# NOTE(review): the reason for dropping the final value is not visible here
# (possibly a missing/NaN population) - confirm before relying on it.
genomic_df['Population'].unique()[:-1].tolist()

['GBR',
 'FIN',
 'CHS',
 'PUR',
 'CDX',
 'CLM',
 'IBS',
 'PEL',
 'PJL',
 'KHV',
 'ACB',
 'GWD',
 'ESN',
 'BEB',
 'MSL']

In [40]:
# Spot check: all genomic records for a single mutation.
genomic_df.loc[genomic_df['Mut'] == 'chrMT-10398-G']

Unnamed: 0.1,Unnamed: 0,CHROM_x,POS_x,REF_x,ALT_x,INFO_x,Sample,value,Mut,CHROM_y,...,Family ID,Population,Population Description,Gender,SnpDB,RSnum,Clinical_Significance,Disease,gene,Severity
20,20,MT,10398,A,G,VT=S;AC=1397,HG00101,1,chrMT-10398-G,MT,...,HG00101,GBR,British in England and Scotland,male,http://www.ncbi.nlm.nih.gov/snp/2853826,2853826,0,Parkinson_disease\x2c_resistance_to,ND3,0
27,27,MT,10398,A,G,VT=S;AC=1397,HG00103,1,chrMT-10398-G,MT,...,HG00103,GBR,British in England and Scotland,male,http://www.ncbi.nlm.nih.gov/snp/2853826,2853826,0,Parkinson_disease\x2c_resistance_to,ND3,0
35,35,MT,10398,A,G,VT=S;AC=1397,HG00106,1,chrMT-10398-G,MT,...,HG00106,GBR,British in England and Scotland,female,http://www.ncbi.nlm.nih.gov/snp/2853826,2853826,0,Parkinson_disease\x2c_resistance_to,ND3,0
44,44,MT,10398,A,G,VT=S;AC=1397,HG00107,1,chrMT-10398-G,MT,...,HG00107,GBR,British in England and Scotland,male,http://www.ncbi.nlm.nih.gov/snp/2853826,2853826,0,Parkinson_disease\x2c_resistance_to,ND3,0
79,79,MT,10398,A,G,VT=S;AC=1397,HG00117,1,chrMT-10398-G,MT,...,HG00117,GBR,British in England and Scotland,male,http://www.ncbi.nlm.nih.gov/snp/2853826,2853826,0,Parkinson_disease\x2c_resistance_to,ND3,0
94,94,MT,10398,A,G,VT=S;AC=1397,HG00120,1,chrMT-10398-G,MT,...,HG00120,GBR,British in England and Scotland,female,http://www.ncbi.nlm.nih.gov/snp/2853826,2853826,0,Parkinson_disease\x2c_resistance_to,ND3,0
97,97,MT,10398,A,G,VT=S;AC=1397,HG00121,1,chrMT-10398-G,MT,...,HG00121,GBR,British in England and Scotland,female,http://www.ncbi.nlm.nih.gov/snp/2853826,2853826,0,Parkinson_disease\x2c_resistance_to,ND3,0
104,104,MT,10398,A,G,VT=S;AC=1397,HG00124,1,chrMT-10398-G,MT,...,HG00124,GBR,British in England and Scotland,female,http://www.ncbi.nlm.nih.gov/snp/2853826,2853826,0,Parkinson_disease\x2c_resistance_to,ND3,0
163,163,MT,10398,A,G,VT=S;AC=1397,HG00154,1,chrMT-10398-G,MT,...,HG00154,GBR,British in England and Scotland,female,http://www.ncbi.nlm.nih.gov/snp/2853826,2853826,0,Parkinson_disease\x2c_resistance_to,ND3,0
184,184,MT,10398,A,G,VT=S;AC=1397,HG00177,1,chrMT-10398-G,MT,...,HG00177,FIN,Finnish in Finland,female,http://www.ncbi.nlm.nih.gov/snp/2853826,2853826,0,Parkinson_disease\x2c_resistance_to,ND3,0
