In [1]:
## Import needed mods
import pandas as pd, glob

In [2]:
## Directory holding the per-chromosome genotype and allelic read depth ratio dataframes
dataframe_path = '../DATA/GENOTYPE/CDX_illumina_vcf/DFRAME'

## Glob patterns (the * wildcard follows "Chr" in each file name)
gts_files = '{}/DNX-Chr*-xl280genome-127-gt-df.csv.gz'.format(dataframe_path)
ads_files = '{}/DNX-Chr*-xl280genome-127-ar-df.csv.gz'.format(dataframe_path)

In [3]:
## Output file for the combined genotype dataframe
gtsavepath = '{}/GT/CDx-ill-gvs.csv.gz'.format(dataframe_path)

## Output file for the combined allelic read depth dataframe
adsavepath = '{}/AD/CDx-ill-ads.csv.gz'.format(dataframe_path)

In [4]:
## Find the genotype files; sorted so the load order is reproducible
## (glob returns matches in arbitrary order)
gts_paths = sorted(glob.glob(gts_files))

## Fail early with a clear message instead of pd.concat's generic
## "No objects to concatenate" ValueError
if not gts_paths:
    raise ValueError('No genotype files match pattern: ' + gts_files)

## Bring in genotype dataframes and combine them row-wise
gts_dfs = pd.concat([pd.read_csv(path, compression='gzip',
                                 header=0, index_col=0)
                     for path in gts_paths], axis=0)

## Duplicate original VCF index (the first CSV column) into its own column
gts_dfs['Vcfix'] = gts_dfs.index.tolist()

## Sort by chromosome and position, then replace the index with a fresh
## 0..n-1 range (the old index values remain available in Vcfix)
gts_dfs = gts_dfs.sort_values(['Chrom', 'Pos']).reset_index(drop=True)

## Check that we have 14 chromosomes before anything is written to disk
gts_nchroms = len(gts_dfs['Chrom'].unique())
assert gts_nchroms == 14, 'Expected 14 chromosomes, found %d' % gts_nchroms

## Save the new dataframe; compression is stated explicitly to match the
## reads above rather than relying on inference from the .gz extension
gts_dfs.to_csv(gtsavepath, compression='gzip')

## Check shape
gts_dfs.shape

(599833, 137)

In [5]:
## Find the allelic read depth files; sorted so the load order is
## reproducible (glob returns matches in arbitrary order)
ads_paths = sorted(glob.glob(ads_files))

## Fail early with a clear message instead of pd.concat's generic
## "No objects to concatenate" ValueError
if not ads_paths:
    raise ValueError('No allelic read depth files match pattern: ' + ads_files)

## Bring in allelic read depth dataframes and combine them row-wise
ads_dfs = pd.concat([pd.read_csv(path, compression='gzip',
                                 header=0, index_col=0)
                     for path in ads_paths], axis=0)

## Copy the loaded index (the first CSV column) into a Vcfix column
ads_dfs['Vcfix'] = ads_dfs.index.tolist()

## Sort by chromosome and position, then replace the index with a fresh
## 0..n-1 range (the old index values remain available in Vcfix)
ads_dfs = ads_dfs.sort_values(['Chrom', 'Pos']).reset_index(drop=True)

## Check that we have 14 chromosomes before anything is written to disk
ads_nchroms = len(ads_dfs['Chrom'].unique())
assert ads_nchroms == 14, 'Expected 14 chromosomes, found %d' % ads_nchroms

## Save the file; compression is stated explicitly to match the reads
## above rather than relying on inference from the .gz extension
ads_dfs.to_csv(adsavepath, compression='gzip')

## Check shape
ads_dfs.shape

(599833, 137)