In [46]:
import pandas as pd
import numpy as np

from keras.models import Sequential, load_model
from scipy.stats import pearsonr, spearmanr

from scipy.special import logit


import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib
rcParams.update({'figure.autolayout': True})
rcParams['svg.fonttype'] = 'none'
rcParams['font.size']=10


import py2bit

In [47]:
snp_list_and_info = pd.read_csv("/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ad_variants_processing/common_variants/snigdha_snp_list_unique_haploreg_hg19_positions.txt",
                                sep='\t'
                               )

In [48]:
leftWindow = 499
rightWindow = 500

In [49]:
snp_list_and_info.head()

Unnamed: 0,CHR,haploreg_hg19_pos,LD_RSID,REF,ALT
0,chr10,11707563,rs34388456,GCCT,G
1,chr10,11707915,rs11257227,A,G
2,chr10,11714507,rs74347557,C,T
3,chr10,11714686,rs77892763,C,T
4,chr10,11717397,rs11257238,T,C


In [50]:
def oneHotEncodeSequence(sequence):
    oneHotDimension = (len(sequence), 4)
    dnaAlphabet = {"A":0, "G":1, "C":2, "T":3}    
    one_hot_encoded_sequence = np.zeros(oneHotDimension, dtype=np.int)
    for i, nucleotide in enumerate(sequence):
        if nucleotide.upper() in dnaAlphabet:
            index = dnaAlphabet[nucleotide.upper()]
            one_hot_encoded_sequence[i][index] = 1
    return one_hot_encoded_sequence


def getUpdatedPaddings(allele,left,right):
    alleleLength = len(allele)
    deductable = "right"
    for i in range(alleleLength-1):
        if deductable=="right":
            right-=1
            deductable="left"
        elif deductable=="left":
            left-=1
            deductable="right"

    return left,right

In [51]:
genome_object = py2bit.open("/home/eramamur/resources/genomes/hg19/hg19.2bit")

In [52]:
ref_sequences_list = []
alt_sequences_list = []

In [53]:
for i,row in snp_list_and_info.iterrows():
    referenceLeftWindow, referenceRightWindow = getUpdatedPaddings(row["REF"], leftWindow, rightWindow)
    alternateLeftWindow, alternateRightWindow = getUpdatedPaddings(row["ALT"], leftWindow, rightWindow)

    referenceLeftSequence = genome_object.sequence(row["CHR"],
                                                   row["haploreg_hg19_pos"]-referenceLeftWindow,
                                                   row["haploreg_hg19_pos"]
                                                  )
    referenceRightSequence = genome_object.sequence(row["CHR"],
                                                    row["haploreg_hg19_pos"]+len(row["REF"]),
                                                    row["haploreg_hg19_pos"]+len(row["REF"])+referenceRightWindow
                                                   )
    referenceSequence = referenceLeftSequence.lower() + row["REF"].upper() + referenceRightSequence.lower()

    alternateLeftSequence = genome_object.sequence(row["CHR"],
                                                   row["haploreg_hg19_pos"]-alternateLeftWindow,
                                                   row["haploreg_hg19_pos"]
                                                  )
    alternateRightSequence = genome_object.sequence(row["CHR"],
                                                    row["haploreg_hg19_pos"]+len(row["REF"]),
                                                    row["haploreg_hg19_pos"]+len(row["REF"])+alternateRightWindow
                                                   )
    alternateSequence = alternateLeftSequence.lower() + row["ALT"].upper() + alternateRightSequence.lower()
    ref_sequences_list.append(referenceSequence)
    alt_sequences_list.append(alternateSequence)

In [33]:
snp_list_and_info["REF_sequence_1kb"] = ref_sequences_list
snp_list_and_info["ALT_sequence_1kb"] = alt_sequences_list

In [42]:
indel_info = snp_list_and_info.loc[(snp_list_and_info["REF"].str.len()>1) | (snp_list_and_info["ALT"].str.len()>1)]

In [43]:
for i,row in indel_info.iterrows():
    print(row)

CHR                                 chr10
haploreg_hg19_pos                11707563
LD_RSID                        rs34388456
REF                                  GCCT
ALT                                     G
REF_sequence_1kb     atgcccggGCCTctcctcct
ALT_sequence_1kb     catgcccggGctcctcctcc
Name: 0, dtype: object
CHR                                 chr11
haploreg_hg19_pos                59874475
LD_RSID                       rs143391047
REF                                  TTTG
ALT                                     T
REF_sequence_1kb     ccttttttTTTGtgttgttg
ALT_sequence_1kb     accttttttTtgttgttgtt
Name: 38, dtype: object
CHR                                 chr11
haploreg_hg19_pos                59874817
LD_RSID                       rs143028487
REF                                     T
ALT                               TTTTCTC
REF_sequence_1kb     gcctctcttTttctctgtct
ALT_sequence_1kb     tctcttTTTTCTCttctctg
Name: 39, dtype: object
CHR                                 chr11
haplo