1. Import required modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from qmplot import manhattanplot
import re
from numpy.linalg import inv
from pathlib import Path 
plt.rcdefaults()

In [None]:
# Early load of the GCN summary-statistics table (tab-separated).
# NOTE(review): the same file is read into df_GCN again in section 2, so this cell looks redundant.
df_GCN = pd.read_table("../dataset/LocalDatasets/Stinson_GCN_hg19.tsv", low_memory=False)

2. Reading all data sets.

In [None]:
# Load the six GWAS summary-statistics tables (tab-separated files), one DataFrame per trait.
# low_memory=False makes pandas parse each file in a single pass instead of in chunks,
# which avoids mixed-dtype columns at the price of more memory while reading.
df_fasting_glucose = pd.read_table("../dataset/FastingGlucose/34059833-GCST90002232-EFO_0004468-Build37.f.tsv", low_memory=False)
df_fasting_insulin = pd.read_table("../dataset/FastingInsulin/34059833-GCST90002238-EFO_0004467-Build37.f.tsv", low_memory=False)
df_HbA1c = pd.read_table("../dataset/HbA1c/34059833-GCST90002244-EFO_0004541-Build37.f.tsv", low_memory=False)
df_BMI = pd.read_table("../dataset/LocalDatasets/Loh_BodyMassIndex_hg19.tsv", low_memory=False)
# NOTE(review): the file name says "AST" while the variable is df_T2D — confirm this is the intended T2D table.
df_T2D = pd.read_table("../dataset/LocalDatasets/Mahajan_AST_hg19.tsv", low_memory=False)
df_GCN = pd.read_table("../dataset/LocalDatasets/Stinson_GCN_hg19.tsv", low_memory=False)

3. Reformatting BMI because its column names and allele letter case are not harmonised like the rest.

In [None]:
# Rename the BMI table's columns to the harmonised names shared by the other tables.
df_BMI = df_BMI.rename(
    columns={
        "chr": "chromosome",
        "pos": "base_pair_location",
        "Allele1": "effect_allele",
        "Allele2": "other_allele",
        "Freq1": "effect_allele_frequency",
        "Effect": "beta",
        "P-value": "p_value",
        "StdErr": "standard_error",
    }
)

In [None]:
df_BMI

In [None]:
df_BMI['effect_allele'].value_counts()

In [None]:
# Upper-case the lower-case 'a' effect alleles in the BMI table.
df_BMI.loc[df_BMI['effect_allele'].eq('a'), 'effect_allele'] = 'A'

In [None]:
# Upper-case the lower-case 't' effect alleles in the BMI table.
df_BMI.loc[df_BMI['effect_allele'].eq('t'), 'effect_allele'] = 'T'

In [None]:
# Upper-case the lower-case 'c' effect alleles in the BMI table.
df_BMI.loc[df_BMI['effect_allele'].eq('c'), 'effect_allele'] = 'C'

In [None]:
df_BMI

In [None]:
df_BMI['other_allele'].value_counts()

In [None]:
# Upper-case the lower-case 'g' other alleles in the BMI table.
df_BMI.loc[df_BMI['other_allele'].eq('g'), 'other_allele'] = 'G'

In [None]:
# Upper-case the lower-case 'c' other alleles in the BMI table.
df_BMI.loc[df_BMI['other_allele'].eq('c'), 'other_allele'] = 'C'

In [None]:
# Upper-case the lower-case 't' other alleles in the BMI table.
df_BMI.loc[df_BMI['other_allele'].eq('t'), 'other_allele'] = 'T'

In [None]:
df_T2D

In [None]:
# Rename the T2D table's columns to the harmonised names shared by the other tables.
df_T2D = df_T2D.rename(
    columns={
        "CHROM": "chromosome",
        "POS": "base_pair_location",
        "EA": "effect_allele",
        "OA": "other_allele",
        "EAF": "effect_allele_frequency",
        "ES": "beta",
        "P": "p_value",
        "SE": "standard_error",
    }
)

In [None]:
df_GCN

4. Reformatting Dataframe

In [None]:
def reformat_df(df):
    """Harmonise one GWAS summary-statistics table for downstream merging.

    Steps, in order:
      1. recode the non-numeric chromosome labels X, XY, MT and Y as
         23, 24, 25 and 26;
      2. drop rows missing any of chromosome, p_value, effect_allele,
         other_allele, beta or standard_error;
      3. cast chromosome to int;
      4. drop duplicated (chromosome, base_pair_location, effect_allele,
         other_allele) rows, keeping the last occurrence;
      5. sort by chromosome and renumber the index from 0.

    The input frame is left untouched and no helper ``index`` column is added
    to the result, so the function can be applied more than once to the same
    table without accumulating ``index`` / ``level_0`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    chromosome_codes = {"X": 23, "XY": 24, "MT": 25, "Y": 26}
    required_columns = ['chromosome', 'p_value', 'effect_allele', 'other_allele',
                        'beta', 'standard_error']
    snp_key = ['chromosome', 'base_pair_location', 'effect_allele', 'other_allele']

    # Recode on an object-typed copy of the column so the caller's frame is never
    # modified and integer codes can sit next to string labels until the cast below.
    chromosome = df['chromosome'].astype(object)
    for label, code in chromosome_codes.items():
        chromosome.loc[chromosome == label] = code
    cleaned = df.assign(chromosome=chromosome)

    # Drop incomplete rows BEFORE casting: a missing chromosome cannot become an int.
    cleaned = cleaned.dropna(subset=required_columns)
    cleaned = cleaned.astype({'chromosome': 'int'})

    cleaned = cleaned.drop_duplicates(subset=snp_key, keep='last')
    # drop=True: discard the old row labels instead of storing them in a new column.
    return cleaned.sort_values(by=['chromosome']).reset_index(drop=True)

In [None]:
df_T2D

In [None]:
df_GCN

In [None]:
# Rename the GCN table's columns to the harmonised names shared by the other tables.
df_GCN = df_GCN.rename(
    columns={
        "CHROM": "chromosome",
        "POS": "base_pair_location",
        "EA": "effect_allele",
        "OA": "other_allele",
        "EAF": "effect_allele_frequency",
        "ES": "beta",
        "P": "p_value",
        "SE": "standard_error",
    }
)

In [None]:
df_GCN

In [None]:
df_T2D

In [None]:
df_fasting_insulin

In [None]:
df_GCN = reformat_df(df_GCN)

In [None]:
# Clean every trait table: recode chromosomes, drop incomplete rows and duplicated SNPs.
# df_GCN is not repeated here because the preceding cell already reformats it; running
# reformat_df on it a second time stacks an extra index column onto the frame.
df_fasting_insulin = reformat_df(df_fasting_insulin)
df_fasting_glucose = reformat_df(df_fasting_glucose)
df_HbA1c = reformat_df(df_HbA1c)
df_BMI = reformat_df(df_BMI)
df_T2D = reformat_df(df_T2D)

In [None]:
df_fasting_insulin.shape

In [None]:
# Number of SNP rows left in each cleaned table (used by the bar chart further down).
count_fi, count_fg, count_hba1c, count_bmi, count_t2d, count_gcn = (
    frame.shape[0]
    for frame in (df_fasting_insulin, df_fasting_glucose, df_HbA1c, df_BMI, df_T2D, df_GCN)
)

In [None]:
def calculate_z(df):
    """Add a ``z_score`` column (``beta`` divided by ``standard_error``) to ``df``.

    The column is written onto the frame that was passed in, and that same
    frame is returned.
    """
    effect_sizes = df['beta']
    df['z_score'] = effect_sizes.div(df['standard_error'])
    return df

def sub_set(df, chromosome):
    """Return the summary-statistic columns of ``df`` for a single chromosome.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding at least the eight columns listed in ``wanted`` below.
    chromosome
        Value compared for equality against the ``chromosome`` column.

    Returns
    -------
    pandas.DataFrame
        An independent copy restricted to the matching rows and the eight
        columns, so callers may add columns to it without touching ``df``
        and without pandas raising ``SettingWithCopyWarning``.
    """
    wanted = ['chromosome', 'base_pair_location', 'effect_allele',
              'other_allele', 'effect_allele_frequency', 'beta',
              'standard_error', 'p_value']
    # One .loc call selects rows and columns together; .copy() detaches the result.
    return df.loc[df['chromosome'] == chromosome, wanted].copy()
    
def merge_(df1, df2, df1z_short = '', df2z_short = ''):
    """Outer-join two per-SNP tables and optionally label their statistic columns.

    The join key is (chromosome, base_pair_location, effect_allele,
    other_allele); the merged columns are returned in alphabetical order.
    When both short names are non-empty, the ``_x`` / ``_y`` suffixed
    z_score, beta, p_value and standard_error columns are renamed to
    ``<short>_z``, ``<short>_beta``, ``<short>_p_value`` and
    ``<short>_standard_error`` (the rename is applied after the sort).
    """
    join_key = ['chromosome', 'base_pair_location', 'effect_allele', 'other_allele']
    merged = df1.merge(df2, on=join_key, how='outer')
    merged = merged.sort_index(axis=1)

    left_len, right_len = len(df1z_short), len(df2z_short)
    if left_len > 0 and right_len > 0:
        relabel = {}
        for suffix, short in (("x", df1z_short), ("y", df2z_short)):
            relabel[f"z_score_{suffix}"] = f"{short}_z"
            for stat in ("beta", "p_value", "standard_error"):
                relabel[f"{stat}_{suffix}"] = f"{short}_{stat}"
        merged = merged.rename(columns=relabel)
    return merged

def chromosome_(df, chrom):
    """Return the rows of ``df`` on chromosome ``chrom`` with a ``z_score`` column added.

    Thin composition of ``sub_set`` (row and column selection) and
    ``calculate_z`` (beta / standard_error).
    """
    return calculate_z(sub_set(df, chrom))
    

#all 6 traits   
def get_chromosome(chrm):
    """Build the combined z-score table of all six traits for one chromosome.

    Reads the module-level frames df_fasting_insulin, df_fasting_glucose,
    df_HbA1c, df_BMI, df_T2D and df_GCN. Each is reduced to chromosome
    ``chrm`` with a z_score column, the traits are outer-merged in pairs
    (fi+fg, hba1c+bmi, t2d+gcn), and the three pair tables are merged into one
    frame whose z-score columns are fi_z, fg_z, hba1c_z, bmi_z, t2d_z and gcn_z.
    """
    kept_columns = ['chromosome', 'base_pair_location', 'effect_allele',
                    'other_allele', 'effect_allele_frequency', 'z_score']
    trait_frames = (df_fasting_insulin, df_fasting_glucose, df_HbA1c,
                    df_BMI, df_T2D, df_GCN)

    # One reduced table per trait, produced in the same order as trait_frames.
    fi, fg, hba1c, bmi, t2d, gcn = (
        chromosome_(trait, chrm)[kept_columns] for trait in trait_frames
    )

    fi_fg = merge_(fi, fg, 'fi', 'fg')
    hba1c_bmi = merge_(hba1c, bmi, 'hba1c', 'bmi')
    t2d_gcn = merge_(t2d, gcn, 't2d', 'gcn')

    # The pair tables are merged without short names, so no further renaming happens.
    glycaemic_and_bmi = merge_(fi_fg, hba1c_bmi)
    return merge_(t2d_gcn, glycaemic_and_bmi)

In [None]:
df_candidate = get_chromosome(1)

In [None]:
df_candidate

In [None]:
df_candidate = df_candidate[['chromosome','base_pair_location', 'effect_allele', 
                                            'other_allele', 'fi_z', 'fg_z', 'hba1c_z', 't2d_z', 'bmi_z', 'gcn_z']]

In [None]:
df_candidate

In [None]:
df_candidate[df_candidate.chromosome == 1]

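Note: the cell below should build the table with `pd.concat` rather than `DataFrame.append`, which was removed in pandas 2.0. Example:
pd.concat([df7, new_row.to_frame().T], ignore_index=True)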

In [None]:
# Extend df_candidate (chromosome 1 so far) with chromosomes 2-26.
# DataFrame.append was removed in pandas 2.0, so the per-chromosome tables are
# collected in a list and concatenated once, which also avoids re-copying the
# growing table on every iteration. Row labels are kept as they are, as before.
candidate_columns = ['chromosome', 'base_pair_location','effect_allele', 'other_allele', 
             'fi_z', 'fg_z', 'hba1c_z', 't2d_z','bmi_z', 'gcn_z']
chromosome_tables = [df_candidate]
for i in range (2, 27):
    chromosome_tables.append(get_chromosome(i)[candidate_columns])
df_candidate = pd.concat(chromosome_tables)

df_candidate

In [None]:
df_candidate.dropna(subset=["t2d_z", "bmi_z"])

In [None]:
df_BMI.shape

In [None]:
# Checkpoint: write the merged per-SNP z-score table to ../output_csv/df_candidate_1.csv,
# creating the output directory first if it does not exist yet.
filepath = Path('../output_csv/df_candidate_1.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True) 
# NOTE(review): the earlier comment here mentioned "omni p values", but none are computed
# before this point in the visible cells — confirm what this checkpoint is meant to capture.
# The file is semicolon-separated and written without the index column.
df_candidate.to_csv(filepath, index=False, sep = ';') 

In [None]:
df_candidate[df_candidate.duplicated(subset=['chromosome', 'base_pair_location'])]

In [None]:
df_duplicates = df_candidate[df_candidate.duplicated(subset=['chromosome', 'base_pair_location'], keep = False)]

In [None]:
df_duplicates

In [None]:
df_duplicate_snps = df_duplicates[df_duplicates.duplicated(subset=['chromosome', 'base_pair_location'], keep='last')]

In [None]:
df_duplicate_snps

In [None]:
df_duplicate_snps.index

In [None]:
df_compare_snps = df_duplicates.drop(df_duplicate_snps.index)

In [None]:
df_compare_snps

In [None]:
df_combine = df_duplicate_snps.merge(df_compare_snps, on=['chromosome','base_pair_location'], how = 'outer').sort_index(axis=1)

In [None]:
df_combine   

In [None]:
df_combine[(df_combine.effect_allele_x == df_combine.other_allele_y)]

In [None]:
df_combine[(df_combine.effect_allele_x == df_combine.other_allele_y)& (df_combine.other_allele_x == df_combine.effect_allele_y)]

In [None]:
df_candidate

In [None]:
df_candidate = df_candidate[['chromosome', 'base_pair_location','effect_allele', 'other_allele', 
             'fi_z', 'fg_z', 'hba1c_z', 't2d_z', 'bmi_z', 'gcn_z']]

In [None]:
df_candidate

In [None]:
filepath = Path('../output_csv/df_candidate.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True) 

In [None]:
# Checkpoint: write df_candidate to `filepath` as a semicolon-separated file without the index column.
# NOTE(review): the earlier comment mentioned "omni p values", but none are computed before
# this point in the visible cells — confirm what this checkpoint is meant to capture.
df_candidate.to_csv(filepath, index=False, sep = ';') 

In [None]:
df_candidate.dropna().shape[0]

In [None]:
count_all = df_candidate.dropna().shape[0]

In [None]:
# Two 10-step colour ramps; `colors` is also read later by plot_manhatten_plots.
colors = plt.cm.PuBuGn(np.linspace(0, 0.7, 10))

colors_red = plt.cm.YlOrRd(np.linspace(0, 0.7, 10))

# Bar chart: SNP rows per study, plus the number of SNPs that have a z-score for every trait.
objects = ('FI', 'FG', 'HbA1c', 'T2D', 'BMI', 'GCN', 'Common')
y_pos = np.arange(len(objects))
performance = [count_fi,count_fg,count_hba1c,count_t2d,count_bmi,count_gcn, count_all]

# Create the figure directory first (as the CSV cells do for theirs); savefig does not create it.
fig_path = Path('../output_figs/fig1_num_snps.png')
fig_path.parent.mkdir(parents=True, exist_ok=True)

plt.bar(y_pos, performance, align='center', color = colors[9])
plt.xticks(y_pos, objects)
plt.ylabel('Number of snps')
plt.xlabel('GWAS Study')
plt.title('Number of SNPs from each GWAS study', pad=20)
plt.savefig(fig_path)
plt.show()

In [None]:
def format_plotdf(df):
    """Prepare a summary-statistics table for the qmplot Manhattan plot.

    Renames chromosome / base_pair_location / p_value to ``#CHROM`` / ``POS`` /
    ``P``, drops rows where ``#CHROM`` or ``P`` is missing, sorts by
    chromosome and position, and renumbers the index from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with chromosome, base_pair_location and p_value columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    plot_df = df.rename(columns={"chromosome": "#CHROM", "base_pair_location": "POS", "p_value": "P"})
    # dropna returns a new frame; the result has to be kept or the call does nothing.
    plot_df = plot_df.dropna(subset=['#CHROM', 'P'])

    # drop=True: old row labels are discarded rather than stored as an extra column,
    # which would clash with an 'index' column already present in the input.
    return plot_df.sort_values(by=['#CHROM', "POS"]).reset_index(drop=True)

In [None]:
def plot_manhatten_plots(df, figname):
    """Draw, save and show a Manhattan plot with qmplot's ``manhattanplot``.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed to ``manhattanplot`` as ``data``; ``POS`` is used as the SNP
        label column. Presumably the output of ``format_plotdf`` (#CHROM, POS,
        P columns) — TODO confirm against the caller.
    figname : str
        Plot title. With every non-alphanumeric character removed it is also
        the file name of the JPEG written under ``../output_figs/`` (400 dpi).

    Notes
    -----
    The significant-marker colour comes from the module-level ``colors``
    palette created in the SNP-count bar-chart cell, so that cell must have
    run first.
    NOTE(review): the original body built a local YlOrRd palette and an
    ``xtick`` label set that were never used; confirm whether ``colors[7]``
    or the red palette was intended.
    """
    # Build the target path up front and make sure its directory exists; savefig does not create it.
    out_path = Path(f'../output_figs/{re.sub(r"[^a-zA-Z0-9]","",figname)}.jpg')
    out_path.parent.mkdir(parents=True, exist_ok=True)

    f, ax = plt.subplots(figsize=(12, 4), facecolor='w', edgecolor='k')
    manhattanplot(data=df,
                  marker=".",
                  sign_marker_p=5e-8,  # Genome wide significant p-value
                  sign_marker_color=colors[7],
                  snp="POS",

                  title=figname,
                  xlabel="Chromosome",
                  ylabel=r"$-log_{10}{(P)}$",

                  sign_line_cols=["#D62728", "#2CA02C"],
                  hline_kws={"linestyle": "--", "lw": 1.3},
                  ld_block_size=500000,  # 500000 bp
                  text_kws={"fontsize": 12,  # The fontsize of annotate text
                            "arrowprops": dict(arrowstyle="-", color="k", alpha=0.6)},
                  ax=ax)
    plt.title(figname)
    plt.savefig(out_path, dpi=400, bbox_inches='tight')
    plt.show()

In [None]:
df_plot_fasting_insulin = format_plotdf(df_fasting_insulin[df_fasting_insulin.p_value < 1e-4])

In [None]:
df_fasting_insulin

In [None]:
df_plot_fasting_insulin

In [None]:
def formatdf_plot(df):
    """Prepare a summary-statistics table for ``dash_bio.ManhattanPlot``.

    Renames chromosome / base_pair_location / p_value to ``CHR`` / ``BP`` /
    ``P``, drops rows where ``CHR`` or ``P`` is missing, adds ``SNP`` (a copy
    of ``BP``) plus constant ``GENE`` and ``DISTANCE`` columns set to
    "Not Given", sorts by chromosome and position, and renumbers the index
    from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with chromosome, base_pair_location and p_value columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    plot_df = df.rename(columns={"chromosome": "CHR", "base_pair_location": "BP" ,"p_value": "P"})
    # dropna returns a new frame; the result has to be kept or the call does nothing.
    plot_df = plot_df.dropna(subset=['CHR', 'P'])

    plot_df = plot_df.assign(SNP=plot_df["BP"], GENE="Not Given", DISTANCE="Not Given")

    # drop=True renumbers without creating a temporary 'index' column, so inputs that
    # already carry 'index' / 'level_0' columns no longer make reset_index raise.
    return plot_df.sort_values(by=['CHR', "BP"]).reset_index(drop=True)

In [None]:
df_fasting_insulin_plot = formatdf_plot(df_fasting_insulin)

In [None]:
df_fasting_insulin_plot

In [None]:
import dash_bio
# Interactive Manhattan plot of the fasting-insulin SNPs with P < 1e-5.
# reset_index(drop=True) renumbers the filtered rows without creating a temporary
# 'index' column that could clash with columns already present in the frame.
dash_bio.ManhattanPlot(
    dataframe=df_fasting_insulin_plot[df_fasting_insulin_plot.P < 1e-5].reset_index(drop=True),
    highlight_color='#00FFAA',
    suggestiveline_color='#AA00AA',
    genomewideline_color='#AA5500'
)

In [None]:
# Interactive Manhattan plot of the fasting-glucose SNPs with P < 1e-5.
# reset_index(drop=True) renumbers the filtered rows without creating a temporary
# 'index' column that could clash with columns already present in the frame.
df_fasting_glucose_plot = formatdf_plot(df_fasting_glucose)
dash_bio.ManhattanPlot(
    dataframe=df_fasting_glucose_plot[df_fasting_glucose_plot.P < 1e-5].reset_index(drop=True),
    highlight_color='#00FFAA',
    suggestiveline_color='#AA00AA',
    genomewideline_color='#AA5500'
)

In [None]:
# Interactive Manhattan plot of the HbA1c SNPs with P < 1e-5.
# reset_index(drop=True) renumbers the filtered rows without creating a temporary
# 'index' column that could clash with columns already present in the frame.
df_HbA1c_plot = formatdf_plot(df_HbA1c)
dash_bio.ManhattanPlot(
    dataframe=df_HbA1c_plot[df_HbA1c_plot.P < 1e-5].reset_index(drop=True),
    highlight_color='#00FFAA',
    suggestiveline_color='#AA00AA',
    genomewideline_color='#AA5500'
)

In [None]:
# Interactive Manhattan plot of the T2D SNPs with P < 1e-5.
# reset_index(drop=True) renumbers the filtered rows without creating a temporary
# 'index' column that could clash with columns already present in the frame.
df_T2D_plot = formatdf_plot(df_T2D)
dash_bio.ManhattanPlot(
    dataframe=df_T2D_plot[df_T2D_plot.P < 1e-5].reset_index(drop=True),
    highlight_color='#00FFAA',
    suggestiveline_color='#AA00AA',
    genomewideline_color='#AA5500'
)

In [None]:
# Interactive Manhattan plot of the BMI SNPs with P < 1e-5.
# reset_index(drop=True) renumbers the filtered rows without creating a temporary
# 'index' column that could clash with columns already present in the frame.
df_BMI_plot = formatdf_plot(df_BMI)
dash_bio.ManhattanPlot(
    dataframe=df_BMI_plot[df_BMI_plot.P < 1e-5].reset_index(drop=True),
    highlight_color='#00FFAA',
    suggestiveline_color='#AA00AA',
    genomewideline_color='#AA5500'
)

In [None]:
# Interactive Manhattan plot of the GCN SNPs with P < 1e-5.
# reset_index(drop=True) renumbers the filtered rows without creating a temporary
# 'index' column that could clash with columns already present in the frame.
df_GCN_plot = formatdf_plot(df_GCN)
dash_bio.ManhattanPlot(
    dataframe=df_GCN_plot[df_GCN_plot.P < 1e-5].reset_index(drop=True),
    highlight_color='#00FFAA',
    suggestiveline_color='#AA00AA',
    genomewideline_color='#AA5500'
)

In [None]:
def significant_loci_counts(df, p_threshold=5e-8, window=1000000):
    """Group significant SNPs into loci per chromosome and count the loci.

    Within each chromosome the significant SNPs (``p_value < p_threshold``)
    are walked in order of ``base_pair_location``. A locus starts at the first
    SNP not yet assigned and takes every following SNP lying less than
    ``window`` base pairs from that starting SNP; the first SNP at or beyond
    that distance starts the next locus.

    Compared with the earlier implementation, every significant SNP now
    belongs to exactly one locus: the last locus of a chromosome is recorded,
    the SNP that opens a new locus is kept, and ``sig_count`` always equals
    ``len(bp_pos)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with chromosome, base_pair_location and p_value columns.
    p_threshold : float, optional
        Significance cut-off; defaults to 5e-8.
    window : int, optional
        Maximum span of a locus in base pairs, measured from its first SNP;
        defaults to 1,000,000.

    Returns
    -------
    list_chromosomes : list of dict
        One entry per chromosome that has at least one significant SNP:
        ``{'chromosome': ..., 'list_significant_loci': [...]}``, where each
        locus is ``{'start_pos', 'end_pos', 'bp_pos', 'sig_count'}``.
    total_sig : int
        Total number of loci over all chromosomes.
    """
    significant = df[df['p_value'] < p_threshold]

    list_chromosomes = []
    for chromosome in significant['chromosome'].unique().tolist():
        on_chromosome = significant['chromosome'] == chromosome
        positions = significant.loc[on_chromosome, 'base_pair_location'].sort_values().tolist()
        if not positions:
            # e.g. a missing chromosome label, which never compares equal to itself
            continue

        # Split the sorted positions into runs anchored on each run's first position.
        runs = []
        current = []
        for bp in positions:
            if current and bp - current[0] >= window:
                runs.append(current)
                current = []
            current.append(bp)
        runs.append(current)  # flush the final, still-open locus

        list_loci = [{"start_pos": run[0], "end_pos": run[-1], "bp_pos": run, "sig_count": len(run)}
                     for run in runs]
        list_chromosomes.append({'chromosome': chromosome, 'list_significant_loci': list_loci})

    total_sig = sum(len(row['list_significant_loci']) for row in list_chromosomes)
    return list_chromosomes, total_sig

In [None]:
hba1c_sig_loci, hba1c_total_sig = significant_loci_counts(df_HbA1c)

In [None]:
hba1c_total_sig