In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
os.chdir("/work/users/minhnth/projects/GIP/")

In [3]:
# Read a vcf file
def read_vcf(vcf_path):
    with open(vcf_path, "rt") as ifile:
          for line in ifile:
            if line.startswith("#CHROM"):
                  vcf_names = [x for x in line.split('\t')]
                  break
    ifile.close()
    data = pd.read_csv(vcf_path, comment='#', sep="\s+", header=None, names=vcf_names)
    return data
def get_data(X, miss_rate):
    # Parameters
    no, dim = X.shape

    # Introduce missing data
    data_m = binary_sampler(1-miss_rate, no, dim)
    miss_data_x = X.copy()
    miss_data_x[data_m == 0] = ".|."
    return X, miss_data_x, data_m
def binary_sampler(p, rows, cols):
  '''Sample binary random variables.
  
  Args:
    - p: probability of 1
    - rows: the number of rows
    - cols: the number of columns
    
  Returns:
    - binary_random_matrix: generated binary random matrix.
  '''
  np.random.seed(7)
  unif_random_matrix = np.random.uniform(0., 1., size = [rows, cols])
  binary_random_matrix = 1*(unif_random_matrix < p)
  return binary_random_matrix

def save_vcf(data, output_VCF, header="""##fileformat=VCFv4.1\n"""):
  with open(output_VCF, 'w') as vcf:
    vcf.write(header)
  data.to_csv(output_VCF, sep="\t", mode='a', index=False)

def save_hap(data, output_path):
   cols = list(range(data.shape[1]))
   df = pd.DataFrame(data = data, columns = cols)
   df.to_csv(output_path, header = False, index = False, sep ="\t", mode='a')

In [4]:
# Load original data
vcf_path =  "data/HLA_3_alleles/HLA1_chr6.ori.vcf"
rate = 0.2
data = read_vcf(vcf_path)
geno = data.iloc[:, 9::]
geno.rename(columns={'NA21144\n':'NA21144'}, inplace=True)

In [5]:
geno.head()

Unnamed: 0,HG00096,HG00097,HG00099,HG00100,HG00101,HG00102,HG00103,HG00105,HG00106,HG00107,...,NA21128,NA21129,NA21130,NA21133,NA21135,NA21137,NA21141,NA21142,NA21143,NA21144
0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
3,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
4,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|1,0|0,0|0,0|0


In [6]:
def split_columns(data):    
# Define the columns to split and the separator
    columns_to_split = data.columns
    df = pd.DataFrame()
    value = []
    # Split the specified columns
    for col in columns_to_split:
        try:
            # Split the column into multiple columns
            split_cols = data[col].str.split('|', expand=True)
        except:
            split_cols = data[col].str.split('/', expand=True)

        # Rename the split columns to avoid name clashes
        split_cols.columns = [f"{col}_{i+1}" for i in range(split_cols.shape[1])]
        
        # Concatenate the split columns back to the original DataFrame
        df = pd.concat([df, split_cols], axis=1)
        
    return(df)

In [7]:
aa = split_columns(geno)

In [8]:
aa

Unnamed: 0,HG00096_1,HG00096_2,HG00097_1,HG00097_2,HG00099_1,HG00099_2,HG00100_1,HG00100_2,HG00101_1,HG00101_2,...,NA21137_1,NA21137_2,NA21141_1,NA21141_2,NA21142_1,NA21142_2,NA21143_1,NA21143_2,NA21144_1,NA21144_2
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2386,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2387,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2388,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
def haploid_to_diploid(df):    
    # Function to merge two columns with a separator
    def merge_columns(df, col1, col2):
        return df[col1].astype(str) + "|" + df[col2].astype(str)

    # List to hold the merged columns
    merged_columns = []
    l = df.shape[1] - 1

    # Loop through pairs of adjacent columns
    for i in range(0, l, 2):
        merged_col = np.core.defchararray.add(df[:, i].astype(str), '|' + df[:, i + 1].astype(str))
        merged_columns.append(merged_col)

    # Stack the merged columns into a new array and convert to DataFrame
    merged_array = np.stack(merged_columns, axis=1)
    merged_df = pd.DataFrame(merged_array)

    return merged_df

In [10]:
bb = haploid_to_diploid(aa.to_numpy())

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U1'), dtype('<U1')) -> None

In [None]:
bb

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2494,2495,2496,2497,2498,2499,2500,2501,2502,2503
0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
3,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
4,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|1,0|0,0|0,0|0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2386,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2387,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2388,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [None]:
with open('data/HLA_3_alleles/header.txt') as f:
    # Read the contents of the file into a variable
    header = f.read()

In [None]:
dt = data.iloc[:, 0:9]

In [None]:
vcf = pd.concat([dt, geno], axis = 1)
a = vcf.copy()

In [None]:
save_vcf(a, "data/HLA_3_alleles/test.vcf", header)

In [None]:
bb = merge(aa)

AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
meta_data = pd.read_csv("data/HLA/igsr-1000 genomes on grch38.tsv", sep = "\t")

In [None]:
np.unique(list(meta_data["Superpopulation code"]))

array(['AFR', 'AMR', 'EAS', 'EUR', 'EUR,AFR', 'SAS'], dtype='<U7')

In [None]:
AFR = meta_data["Sample name"][meta_data["Superpopulation code"] == 'AFR']
AMR = meta_data["Sample name"][meta_data["Superpopulation code"] == 'AMR']
EAS = meta_data["Sample name"][meta_data["Superpopulation code"] == 'EAS']
EUR = meta_data["Sample name"][meta_data["Superpopulation code"] == 'EUR']
SAS = meta_data["Sample name"][meta_data["Superpopulation code"] == 'SAS']

In [None]:
pop = geno.columns
AFR_inter = set(AFR).intersection(set(pop)) 
geno_afr = geno.loc[:, list(AFR_inter)]

In [None]:
geno_afr

Unnamed: 0,HG03241,HG03133,NA19438,HG02582,NA19916,NA19395,NA19223,NA19443,HG03511,HG01912,...,NA19023,HG01956,HG02976,HG03559,HG02445,HG03246,HG02323,HG03198,NA19401,HG03557
0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
3,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
4,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2386,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2387,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2388,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [None]:
#geno, geno_miss, m = get_data(geno, rate)
#gn = pd.concat([data.iloc[:, 0:9], geno_miss], axis = 1)