# Work with iHS data from Johnson et al., 2018

[Link](https://www.nature.com/articles/s41559-018-0478-6)

In [2]:
import pandas as pd
import numpy as np
import os

##### Step 1. Preparing raw data

In [None]:
# Step 1: for every tab-separated iHS table in D:\iHS, keep the CHR / POS /
# RSNUM columns plus the GBR entries of the "DAF" and "stdIHS" columns, and
# write the result as a CSV file into out_folder.

# Labels assigned, in this order, to the "|"-separated values packed inside
# the "DAF" and "stdIHS" columns (each field is expected to hold 26 values;
# set_axis raises if the count differs).
populations = [
    "ESN", "GWD", "LWK", "MSL", "YRI",
    "ACB", "ASW", "CLM", "MXL", "PEL", "PUR",
    "CDX", "CHB", "CHS", "JPT", "KHV",
    "CEU", "FIN", "GBR", "IBS", "TSI",
    "BEB", "GIH", "ITU", "PJL", "STU",
]
daf_cols_names = [f"daf_{pop}" for pop in populations]
stdIHS_cols_names = [f"stdIHS_{pop}" for pop in populations]

ihs_folder = "D:\\iHS"
out_folder = "D:\\iHS\\iHS_max_for_GBR_population"
os.makedirs(out_folder, exist_ok=True)

# os.listdir() also returns sub-directories. The output folders are created
# inside ihs_folder, so without this filter a re-run of the cell would hand a
# directory to pd.read_csv and crash.
ihs_files = [
    name
    for name in os.listdir(ihs_folder)
    if os.path.isfile(os.path.join(ihs_folder, name))
]


def _split_per_population(column, col_names):
    """Expand a "|"-separated string column into one float column per label.

    Parameters
    ----------
    column : pandas.Series
        String column whose values look like "v1|v2|...".
    col_names : list of str
        Names for the resulting columns; must match the number of fields.

    Returns
    -------
    pandas.DataFrame
        float64 frame with the literal string "NA" converted to NaN.
    """
    values = column.str.split("|", expand=True)
    values = values.set_axis(col_names, axis=1)
    return values.replace({"NA": np.nan}).astype(dtype="float64")


for file in ihs_files:
    file_path = os.path.join(ihs_folder, file)
    df = pd.read_csv(file_path, sep="\t", comment="#", low_memory=False)

    daf = _split_per_population(df["DAF"], daf_cols_names)
    stdIHS = _split_per_population(df["stdIHS"], stdIHS_cols_names)

    # Keep the SNP identifiers together with the GBR columns only.
    iHS_data = pd.concat(
        [df[["CHR", "POS", "RSNUM"]], daf[["daf_GBR"]], stdIHS[["stdIHS_GBR"]]],
        axis=1,
    )

    # Output name: everything before the first "." of the input file name.
    prefix = file.split(".")[0]
    output = os.path.join(out_folder, f"{prefix}_iHS_GBR.csv")

    iHS_data.to_csv(output, index=False)

##### Step 2. Make .bed files from the data

In [14]:
# Step 2: turn every CSV written by Step 1 into a headerless, tab-separated
# 4-column file (chr, start, end, stdIHS_GBR) with a .bed extension.
in_folder = "D:\\iHS\\iHS_max_for_GBR_population"
ihs_files = os.listdir(in_folder)
out_folder = "D:\\iHS\\iHS_max_GBR_formatted_bed"
os.makedirs(out_folder, exist_ok=True)

bed_columns = ['chr', 'start', 'end', 'stdIHS_GBR']

for file in ihs_files:
    df = pd.read_csv(os.path.join(in_folder, file), sep=',', low_memory=False)

    # start = POS - 1 and end = POS; missing stdIHS_GBR values are written
    # as 0.
    df = df.assign(
        chr='chr' + df['CHR'].astype(str),
        start=df['POS'] - 1,
        end=df['POS'],
        stdIHS_GBR=df['stdIHS_GBR'].fillna(0),
    )
    final_data = df[bed_columns]

    # Output name: the part of the input file name before its first ".".
    prefix, _, _ = os.path.splitext(file)[0].partition(".")
    output = os.path.join(out_folder, f"{prefix}_stdiHS_metrics_GBR.bed")
    final_data.to_csv(output, index=False, sep='\t', header=False)