# 43.9: Reformat covariate files for GCTA. 

In [1]:
import pandas as pd
import os

In [2]:
# Switch the working directory first: the code below lists '.' to find the
# input CSVs and writes its outputs using relative file names.
os.chdir("/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/")

In [3]:
import pandas as pd
import os

def _categorical_codes_or_na(column):
    """Return integer category codes for *column*, with 'NA' for missing entries."""
    codes = pd.Series(pd.Categorical(column).codes, index=column.index)
    # pandas encodes a missing category as -1; left as-is it would be written
    # out as if it were a real covariate level, so replace it with 'NA'.
    return codes.astype(object).where(codes >= 0, 'NA')


def prepare_gwas_covariates(input_csv, discrete_output, quantitative_output):
    """
    Split one covariate CSV into a discrete and a quantitative covariate file.

    Both outputs are tab-separated, have no header row and no index column,
    and start with an FID column (constant 0) and an IID column (copied from
    the input's 'ID' column). Missing values are written as 'NA'.

    Parameters:
    - input_csv (str): Path to the comma-separated input file. It must contain
      the columns 'ID', 'Dx', 'Sex', 'Age', 'genoPC1'-'genoPC3' and
      'methPC1'-'methPC10'.
    - discrete_output (str): Path to the output file for discrete covariates
      (FID, IID, Dx, Sex). 'Dx' and 'Sex' are written as integer category codes.
    - quantitative_output (str): Path to the output file for quantitative
      covariates (FID, IID, genoPC1-3, methPC1-10, Age).

    Raises:
    - KeyError: If any required column is missing from the input file.
    """
    # Read the input CSV
    df = pd.read_csv(input_csv, sep=',')

    # Print the head of the DataFrame and the column names
    print(f"Reading {input_csv}:")
    print("DataFrame Head:")
    print(df.head())
    print("\nColumn Names:")
    print(df.columns.tolist())

    # Validate every required column before anything is written, so a bad
    # input never leaves a partial pair of output files behind.
    if 'ID' not in df.columns:
        raise KeyError("Column 'ID' not found in the file.")
    if 'Dx' not in df.columns or 'Sex' not in df.columns:
        raise KeyError("Necessary columns 'Dx' or 'Sex' not found in the file.")

    quantitative_names = (
        ['genoPC1', 'genoPC2', 'genoPC3']
        + [f'methPC{i}' for i in range(1, 11)]
        + ['Age']
    )
    absent = [name for name in quantitative_names if name not in df.columns]
    if absent:
        raise KeyError(f"Quantitative covariate columns not found in the file: {absent}")

    # 'ID' is interpreted as the individual ID; the family ID is a constant 0.
    ids = df[['ID']].rename(columns={'ID': 'IID'})
    ids.insert(0, 'FID', 0)

    # Discrete covariates: categorical values converted to integer codes.
    discrete_df = ids.assign(
        Dx=_categorical_codes_or_na(df['Dx']),
        Sex=_categorical_codes_or_na(df['Sex']),
    )

    # Quantitative covariates, in a fixed column order.
    quantitative_df = pd.concat([ids, df[quantitative_names]], axis=1)

    # Save without header and index. na_rep='NA' keeps missing values from
    # becoming empty fields, which would shift columns in a whitespace-
    # delimited file.
    # NOTE(review): 'NA' is assumed to be the missing-value token GCTA
    # expects in covariate files -- confirm against the GCTA version in use.
    discrete_df.to_csv(discrete_output, sep='\t', index=False, header=False, na_rep='NA')
    quantitative_df.to_csv(quantitative_output, sep='\t', index=False, header=False, na_rep='NA')
    print(f"Processed {input_csv}:")
    print(f"- Discrete covariates saved to {discrete_output}")
    print(f"- Quantitative covariates saved to {quantitative_output}")

# Convert every CSV in the current directory into a .covar / .qcovar pair.
# The listing is sorted so files are processed in a reproducible order.
for filename in sorted(os.listdir('.')):
    if not filename.endswith('.csv'):
        continue
    # Strip only the extension: splitting on the first '.' would map
    # 'EA.hippo.csv' and 'EA.caudate.csv' to the same 'EA' prefix, so one
    # file's outputs would overwrite the other's.
    prefix = os.path.splitext(filename)[0]
    discrete_output = f'{prefix}.covar'
    quantitative_output = f'{prefix}.qcovar'
    prepare_gwas_covariates(filename, discrete_output, quantitative_output)

Reading EA_hippo.csv:
DataFrame Head:
       ID   genoPC1   genoPC2   genoPC3     methPC1     methPC2     methPC3  \
0  Br1016 -0.102730 -0.007874  0.002259 -582.250410 -195.066376   14.540472   
1  Br1023 -0.090669 -0.331012  0.001278  182.960586  -84.161823  111.237437   
2  Br1034 -0.101899  0.013660  0.005231 -548.376263 -191.801205 -114.255203   
3  Br1092 -0.100973 -0.000949 -0.000945  -41.944428  568.769599  198.333064   
4  Br1093 -0.101100  0.034861  0.029725  101.058254  355.618952   64.023480   

      methPC4     methPC5     methPC6     methPC7     methPC8    methPC9  \
0 -155.757748   88.609704 -259.841967   -5.449045   66.233979  -9.111428   
1 -121.846469 -108.675703    9.201356  114.092966  149.898635 -74.053124   
2  197.384645  -32.900962  -49.010551    9.324397  -35.902978 -76.886305   
3  -22.722300 -234.328217   -3.249619   35.845048   34.789159  77.225962   
4  161.421514 -132.173168   78.876031   -1.594505  -20.586247 -11.694494   

     methPC10       Dx    Age 

In [4]:
# NOTE(review): `df` is only assigned inside prepare_gwas_covariates, so it is
# not defined at notebook scope and this cell raises NameError.
df

NameError: name 'df' is not defined