In [4]:
import gzip
import os

import pandas as pd

In [22]:
def prep_IBD_PC_for_plink(filepath):
    """
    Prepare an IBD (inflammatory bowel disease) or PC (prostate cancer) file for Plink.

    Only the useful columns are loaded, and they are renamed so Plink will
    recognize them (CHR, POS, SNP, BETA, SE, P). The result is written as a
    tab-separated file in the same directory as the input, named
    "<name>_prepped.txt", where <name> is the input file name up to its
    first dot (so "study.h.tsv.gz" becomes "study_prepped.txt").

    Args:
        filepath (str): path to a whitespace-delimited table with a header
            row; may be gzip-compressed (".gz" suffix).

    Returns:
        None. The prepared table is written to disk.

    Raises:
        ValueError: raised by pandas if one of the required columns is missing.
    """
    plink_names = {
        "chromosome": "CHR",
        "base_pair_location": "POS",
        "variant_id": "SNP",
        "beta": "BETA",
        "standard_error": "SE",
        "p_value": "P",
    }

    # compression="infer" lets pandas pick gzip from the ".gz" suffix, so plain
    # and compressed inputs share one code path (and a path without any dot no
    # longer crashes on the extension check).
    data = pd.read_csv(
        filepath,
        sep=r"\s+",
        usecols=list(plink_names),
        compression="infer",
    )
    data = data.rename(columns=plink_names)

    # Strip the extensions from the file name only: dots in directory names
    # (e.g. "./data" or "run.v1/") must not truncate the output path.
    folder, name = os.path.split(filepath)
    stem = name.split(".", 1)[0]
    data.to_csv(os.path.join(folder, stem + "_prepped.txt"), sep="\t", index=False)

In [15]:
# Gzip-compressed input tables for IBD and PC, handed to prep_IBD_PC_for_plink.
# NOTE(review): these are absolute paths on one specific machine — adjust them
# before running the notebook anywhere else.
IBD = "C:/Users/Pin/Desktop/Execute_methods/editsnpfiles/28067908-GCST004131-EFO_0003767.h.tsv.gz"
PC = "C:/Users/Pin/Desktop/Execute_methods/editsnpfiles/29892016-GCST006085-EFO_0001663.h.tsv.gz"

In [None]:
# Write a "<name>_prepped.txt" file next to each input table, IBD first, then PC.
for gwas_table in (IBD, PC):
    prep_IBD_PC_for_plink(gwas_table)

The IBD and PC files are run through PLINK with the following flags:

plink
--bfile 1000G/1000G.EUR
--clump filepath
--clump-p1 5e-6
--clump-r2 0.10
--clump-kb 1000
--out (file_name)_5e6_1000kb_r2_01

The Height file is run through PLINK with the following flags:

plink
--bfile 1000G/1000G.EUR
--clump filepath
--clump-p1 5e-8
--clump-r2 0.10
--clump-kb 1000
--out (file_name)_5e8_1000kb_r2_01

In [2]:
def prep_for_depict(filepath):
    """
    Reduce a whitespace-delimited table to its SNP column, as needed for Depict.

    Run for the IBD, PC, and Height files. The SNP values are written one per
    line, without a header, to "<name>_prepped4depict.txt" in the same
    directory as the input, where <name> is the input file name up to its
    first dot.

    Args:
        filepath (str): path to a whitespace-delimited table whose header row
            contains a "SNP" column.

    Returns:
        None. The SNP list is written to disk.

    Raises:
        ValueError: raised by pandas if the table has no "SNP" column.
    """
    # Only the SNP column is kept, so do not parse the other columns at all.
    snps = pd.read_csv(filepath, sep=r"\s+", usecols=["SNP"])["SNP"]

    # Strip the extension from the file name only: dots in directory names
    # (e.g. "./results" or "run.v1/") must not truncate the output path.
    folder, name = os.path.split(filepath)
    stem = name.split(".", 1)[0]
    snps.to_csv(
        os.path.join(folder, stem + "_prepped4depict.txt"),
        sep="\t",
        index=False,
        header=False,
    )

In [38]:
def resample_Height(filepath, n=200, random_state=None):
    """
    Draw a random subset of SNPs from the Height SNP list for use in Depict.

    The Height SNP file is too large for Depict; it was found that it could
    not have more than 200 SNPs. This takes up to `n` random SNPs (without
    replacement) from the file and saves them to "Height_200.txt" in the same
    directory as the input (the working directory for a bare file name).

    The input is expected to be the header-less, one-SNP-per-line file written
    by prep_for_depict, so the first line is treated as data, not as a header.

    Args:
        filepath (str): path to the header-less SNP list.
        n (int): maximum number of SNPs to keep. Defaults to 200. The output
            file name stays "Height_200.txt" regardless of this value.
        random_state (int | None): seed passed to pandas for a reproducible
            draw. Defaults to None (a different subset on every run).

    Returns:
        None. The subset is written to disk.
    """
    # header=None: the file has no header row; the default would consume the
    # first SNP as a column name and silently drop it.
    data = pd.read_csv(filepath, sep="\t", header=None)

    # Cap the sample size so a list that is already small enough is kept whole
    # instead of making DataFrame.sample raise.
    data = data.sample(n=min(n, len(data)), axis=0, random_state=random_state)

    # dirname is "" for a bare file name; os.path.join then yields a relative
    # path rather than one at the filesystem root.
    out_path = os.path.join(os.path.dirname(filepath), "Height_200.txt")
    data.to_csv(out_path, sep="\t", index=False, header=False)

In [1]:
# Clumped SNP tables (".clumped") for IBD, PC and Height, handed to
# prep_for_depict. These are bare file names, so they are resolved against the
# notebook's working directory.
IBD2 = "IBD_5e6_1000kb_r2_01.clumped"
PC2 = "PC_5e6_1000kb_r2_01.clumped"
Height = "Height_5e8_1000kb_r2_01.clumped"

In [5]:
# Write a "<name>_prepped4depict.txt" SNP list for each clumped table, in the
# order IBD, PC, Height.
for clumped_table in (IBD2, PC2, Height):
    prep_for_depict(clumped_table)

In [19]:
# SNP list that prep_for_depict writes for the Height clumped table; it is the
# input to resample_Height.
Height2 = "Height_5e8_1000kb_r2_01_prepped4depict.txt"

In [None]:
# Draw the random SNP subset for Depict; the result is saved as Height_200.txt.
resample_Height(Height2)