In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt

In [None]:
# Release of interest
rel = 10

# Create folders to mount
! mkdir releases release working

# Read Access only
! gcsfuse --dir-mode 555 --file-mode 444 --implicit-dirs gp2tier2_vwb releases
! gcsfuse --dir-mode 555 --file-mode 444 --implicit-dirs gp2_release{rel} release

# Read/Write Access
! gcsfuse --dir-mode 777 --file-mode 777 --implicit-dirs gp2_working_eu working

In [None]:
# Base directory that holds the `releases/`, `release/` and `working/` mount folders
# created in the mount cell. Every later path is built as f'{wd}/...', and later cells
# call os.chdir('/dev/shm'), so this must be an absolute path (replace the placeholder).
wd = '/YOUR/WORKING/DIR'

### PLINK File Prep

In [None]:
# Normalize variant names

# ! wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
user = 'nicole'
label = 'AJ'
rel = 10
region = 'non_PAR'

if region == 'non_PAR': 
    chrom = 'X'
else:
    chrom = 'PAR1, PAR2'

# Split females into regions for separate PAR and non-PAR analyses
female = f'{wd}/working/{user}/x_chrom_new/imputation/imputation_out_dir/{label}/female/chrX_{label}_rel{rel}_female'
female_out_path = f'{wd}/working/{user}/x_chrom_new/imputation/imputation_out_dir/{label}/female/{region}/chrX_{label}_rel{rel}_female_{region}'

os.chdir('/dev/shm')
! plink2 --pfile {female} --chr {chrom} --make-pgen --out {female_out_path}

In [None]:
# Run normalization
sex = 'female'
input = f'{wd}/working/{user}/x_chrom_new/imputation/imputation_out_dir/{label}/{sex}/{region}/chrX_{label}_rel{rel}_{sex}'

os.chdir('/dev/shm')
! plink2 --pfile {female_out_path} --normalize --fa {wd}/working/{user}/XWAS/hg38.fa.gz \
    --set-all-var-ids "chr@:#:\$r:\$a" \
    --new-id-max-allele-len 999 --sort-vars \
    --make-pgen --out {wd}/working/{user}/XWAS/{label}/{label}_{region}_{sex}_normalized

### PCs Prep

In [None]:
# Load the eigenvalues of the saved release-wide autosomal PCA.
# TODO (from the original author): ADD PCA CALCULATION instead of reusing saved results.
rel = 10
label = 'AJ'

# Male + Female results
eigenval_file = f"{wd}/releases/release{rel}/raw_genotypes/{label}/{label}_release{rel}_vwb.eigenval"

# The file is read as a single header-less column named "eigenvalue"
eigenvals = pd.read_csv(
    eigenval_file,
    names=["eigenvalue"],
    header=None,
)

In [None]:
# Scree plot: per-PC and cumulative share of the summed eigenvalues.
# NOTE(review): percentages are relative to the eigenvalues present in the file; if the
# file only lists the leading PCs they are not shares of total genetic variance -- confirm.
eigen_total = eigenvals["eigenvalue"].sum()
eigenvals["variance_explained"] = eigenvals["eigenvalue"] / eigen_total * 100
eigenvals["cumulative_variance"] = eigenvals["variance_explained"].cumsum()

# First PC (1-based) at which the running total reaches 90 %
reached_90 = eigenvals["cumulative_variance"] >= 90
pc90 = reached_90.idxmax() + 1
var90 = eigenvals.loc[pc90 - 1, "cumulative_variance"]

# Two panels sharing the PC axis
pc_numbers = range(1, len(eigenvals) + 1)
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharex=True)
per_pc_ax, cumulative_ax = axes

# Left panel: variance explained by each PC
per_pc_ax.plot(pc_numbers, eigenvals["variance_explained"], marker="o")
per_pc_ax.set_ylabel("Variance Explained (%)")
per_pc_ax.set_title("Variance Explained per PC")

# Right panel: cumulative variance with the 90 % crossing highlighted
cumulative_ax.plot(pc_numbers, eigenvals["cumulative_variance"], marker="s", linestyle="--")
cumulative_ax.axhline(y=90, color="red", linestyle=":", label="90% Threshold")
cumulative_ax.scatter(pc90, var90, color="red", zorder=5)
cumulative_ax.text(pc90 + 0.5, var90 - 5, f"PC{pc90} ~ {var90:.1f}%", color="red")
cumulative_ax.set_ylabel("Cumulative Variance (%)")
cumulative_ax.set_title("Cumulative Variance Explained")
cumulative_ax.legend()

# Decorations common to both panels
for panel in axes:
    panel.set_xlabel("Principal Component")
    panel.grid(True, linestyle="--", alpha=0.6)

fig.suptitle(f"Scree Plot: {label} R{rel}", fontsize=14, y=1.05)

plt.tight_layout()
plt.show()

### Covar Prep

In [None]:
# Load the release-level tables used to build phenotype and covariate files.

# Sample table from the chr1 imputed psam; its '#IID' column is exposed as 'IID'
total_pheno = pd.read_csv(
    f'{wd}/releases/release{rel}/imputed_genotypes/{label}/chr1_{label}_release{rel}_vwb.psam',
    sep='\t',
).rename(columns={'#IID': 'IID'})
display(total_pheno.head())

# Release-wide principal components
pcs = pd.read_csv(
    f"{wd}/releases/release{rel}/raw_genotypes/{label}/{label}_release{rel}_vwb.eigenvec",
    sep='\t',
)
display(pcs.head())

# Extended clinical data, reduced to the columns used for the diagnosis-change check
clinical_cols = ['GP2ID', 'visit_month', 'age_at_baseline', 'primary_diagnosis', 'last_diagnosis']
change = pd.read_csv(f'{wd}/releases/release{rel}/clinical_data/r{rel}_extended_clinical_data_vwb.csv')
change = change[clinical_cols]
display(change.head())

# Related-sample list (its 'IID1' column is used later for exclusion)
related = pd.read_csv(f'{wd}/releases/release{rel}/meta_data/related_samples/{label}_release{rel}_vwb.related')
display(related.head())

# Master key
master = pd.read_csv(f'{wd}/releases/release{rel}/clinical_data/master_key_release{rel}_final_vwb.csv')
display(master.head())

In [None]:
# Load the sample table of the normalized XWAS pfile and attach release phenotypes.
# NOTE(review): the normalization cell in this notebook writes the 'female' file only;
# the 'male' file read here presumably comes from a separate run -- confirm.
region = 'non_PAR'
sex = 'male'

input = f'{wd}/working/{user}/XWAS/{label}/{label}_{region}_{sex}_normalized'
psam = pd.read_csv(f'{input}.psam', sep='\t')

# 'IID' is '#IID' with any leading "0_" removed, so it can be matched to release IDs
psam['IID'] = psam["#IID"].str.replace("^0_", "", regex=True)

# Keep only samples present in the release psam, carrying over their PHENO1
with_pheno = psam.merge(total_pheno[['IID', 'PHENO1']], on='IID', how='inner')
pheno = with_pheno[['IID', '#IID', 'SEX', 'PHENO1']]
display(pheno)

In [None]:
# Attach PCs, then drop samples whose diagnosis changed and one member of each related pair.
merge1 = pd.merge(pheno, pcs, on='IID')
display(merge1)

# Clinical rows with a recorded last diagnosis. Work on an explicit copy so adding a
# column does not write into a view of `change` (SettingWithCopyWarning).
no_dup = change.dropna(subset=['last_diagnosis']).copy()

# Vectorized form of the former row-wise apply: 'Yes' when the two diagnoses differ
# (a missing primary_diagnosis counts as different, as before).
no_dup['diagnosis_change'] = (
    no_dup['primary_diagnosis']
    .ne(no_dup['last_diagnosis'])
    .map({True: 'Yes', False: 'No'})
)
d_change = no_dup[no_dup['diagnosis_change'] == 'Yes']
d_change.info()

# Match on the prefix-stripped 'IID'. '#IID' can carry a leading "0_" (removed when the
# psam was loaded), so joining GP2ID to '#IID' would miss those samples and they would
# never be excluded. The filter below already uses 'IID'.
anc_d = pd.merge(d_change, merge1, left_on='GP2ID', right_on='IID')
anc_d.info()

diag_remove = anc_d['GP2ID']
anc_updated = merge1[~merge1['IID'].isin(diag_remove)]
anc_updated.info()

# Drop the samples listed in the related file's IID1 column.
# .copy() gives later cells an independent frame they can safely modify.
remove = related['IID1']
merge_norel = anc_updated[~anc_updated['IID'].isin(remove)].copy()
merge_norel.info()

In [None]:
# Recode population controls to PHENO1 = 1, then drop samples without a phenotype.
pop_control = master[master['baseline_GP2_phenotype']=='Population Control']
display(pop_control.head())

pop_change = pop_control['GP2ID']

# Use a single .loc assignment on an explicit copy. The former chained form
# `merge_norel['PHENO1'][mask] = 1` may write to a temporary object and is a
# silent no-op under pandas Copy-on-Write.
merge_norel = merge_norel.copy()
merge_norel.loc[merge_norel['IID'].isin(pop_change), 'PHENO1'] = 1
merge_norel.info()

# Keep only samples with a non-missing phenotype
merge_pheno = merge_norel[~merge_norel['PHENO1'].isnull()]
merge_pheno.info()

In [None]:
# Join the master key onto the retained samples (GP2ID in master == IID in merge_pheno)
test = master.merge(merge_pheno, left_on='GP2ID', right_on='IID')
test.info()

# Samples from study 'BCM' with PHENO1 == 1 are excluded
# NOTE(review): the reason for excluding these is not stated in the notebook -- confirm.
bcm_pheno1 = (test['study'] == 'BCM') & (test['PHENO1'] == 1)
bad = test[bcm_pheno1]
bad.head()

keep_rows = ~test['GP2ID'].isin(bad['GP2ID'])
test = test[keep_rows]
test.info()

In [None]:
# Build the final phenotype table ('#IID', 'PHENO') and covariate table ('#IID', PC1-PC10).
# Written as non-mutating chains: the former inplace fillna/rename/drop calls operated on
# frames derived from selections, which triggers SettingWithCopyWarning and makes the
# intermediate state hard to follow. The resulting tables are the same.
pc_cols = [f'PC{i}' for i in range(1, 11)]

merge1_final = (
    test[['IID', '#IID', 'PHENO1', 'SEX', 'age_of_onset'] + pc_cols]
    .rename(columns={'age_of_onset': 'AGE'})
    .fillna('-9')  # missing values are written as the string '-9'
)

# Phenotype file: sample ID plus phenotype, with PHENO1 renamed to PHENO
pheno_final = merge1_final[['#IID', 'PHENO1']].rename(columns={'PHENO1': 'PHENO'})

# Covariate file: only '#IID' and the ten PCs remain (IID, SEX, AGE and PHENO1 are dropped)
merge1_final = merge1_final.drop(columns=['IID', 'SEX', 'AGE', 'PHENO1'])

display(pheno_final)
display(merge1_final)

In [None]:
# Write the phenotype and covariate tables as tab-separated text files.
# Create the output folder first: to_csv does not create missing directories and would
# otherwise fail on a fresh working directory.
os.makedirs(f'{wd}/prep_files', exist_ok=True)
pheno_final.to_csv(f'{wd}/prep_files/{sex}_{label}_pheno.txt', sep='\t', index=False)
merge1_final.to_csv(f'{wd}/prep_files/{sex}_{label}_covar.txt', sep='\t', index=False)