In [1]:
import pandas as pd
import pickle

In [2]:
import statsmodels.api as sm
from statsmodels.formula.api import glm

In [3]:
from sklearn.preprocessing import StandardScaler


In [4]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [95]:
#finalgen_samples = pd.read_csv('../final_gen.csv')['sample_name']
first_gen = pd.read_csv('../key_files/generation_1_sample_names.txt',header=None)[0]
samples = first_gen.to_list()

#clim_sites_during_exp = pd.read_csv('/carnegie/nobackup/scratch/tbellagio/grene/data/bio')
clim_sites_during_exp = pd.read_csv('../key_files/bioclimvars_sites_era5_year_2018.csv')

sites_af = pd.Series(samples).str.split('_').str[0].astype(int)

sites_af.name = 'site'

In [96]:
unseen_sites = [7, 50, 56, 21, 41, 47]

# Convert the list to a Series and concatenate with the existing Series
new_series = pd.Series(unseen_sites, name='site')

In [97]:
sites_af = pd.concat([sites_af, new_series], ignore_index=True)

In [98]:
sites_af

0       1
1       1
2       1
3       1
4       1
       ..
327    50
328    56
329    21
330    41
331    47
Name: site, Length: 332, dtype: int64

In [99]:
env = sites_af.reset_index().merge(clim_sites_during_exp).drop(['index'],axis=1)

In [100]:
env = env.drop_duplicates()

In [103]:
env_variable = env['bio1']

In [104]:
# Standardize the environmental variable
scaler = StandardScaler()
env_variable_scaled = scaler.fit_transform(env_variable.values.reshape(-1, 1))

In [105]:
env_variable_scaled

array([[-0.63301035],
       [ 0.62998849],
       [ 0.9607133 ],
       [ 0.5065032 ],
       [ 0.490795  ],
       [-0.01764487],
       [ 1.89285253],
       [ 0.37374395],
       [ 0.10925582],
       [ 1.85806904],
       [-1.25761681],
       [-1.66067502],
       [-0.37155704],
       [ 2.35949902],
       [-1.61316253],
       [ 0.89775384],
       [ 1.19492958],
       [-0.26362888],
       [-1.28094926],
       [-0.68553582],
       [ 1.31430792],
       [ 0.63711906],
       [-0.38243202],
       [-1.02783177],
       [-0.2965911 ],
       [-0.20215664],
       [-0.13045425],
       [-0.08370172],
       [-0.39572256],
       [-0.15300913],
       [ 1.54619886],
       [-0.48562275],
       [-0.96105472],
       [-1.65026104],
       [-0.56820639],
       [-0.49354349],
       [-0.15736145]])

In [106]:
pd.DataFrame(env_variable_scaled).to_csv('env.csv',index=None)

In [108]:
env_site_scaled = pd.concat([env['site'],pd.Series(env_variable_scaled.flatten())],axis=1)

In [109]:
env_site_scaled.columns = ['site', 'env_scaled']

In [110]:
env_site_scaled.to_csv('env_site_scaled.csv',index=None)

In [112]:
import os

In [113]:
files = os.listdir('../baypass_first_gen/individual_gfiles/')

In [114]:
partitions = [int(file.split('_')[1].replace('.txt', '')) for file in files if '.txt' in file]

In [115]:
partitions.sort()

In [116]:
partitions[-1]

201

In [117]:
len(partitions)

202

In [118]:
pwd -P

'/carnegie/nobackup/scratch/tbellagio/gea_grene-net/binomial_regression_firstgen_go'

In [119]:
import random
import subprocess

In [154]:
samples = pd.read_csv('../key_files/merged_sample_table.csv')

In [155]:
samples = samples[samples['generation'] ==1]

In [156]:
samples = samples['sample_name'].to_list()

In [158]:
import pickle

def create_fake_cv(samples, unseen_sites):
    """
    Creates a fake cross-validation file where all samples are used for training
    and a single fake sample name is used for each unseen site in the test set.

    Parameters:
    samples (list): Original list of cross-validation splits.
    unseen_sites (list): List of unseen site numbers.

    Returns:
    list: New list of cross-validation splits with fake samples for unseen sites.
    """
    # Flatten the original sample list to get all unique samples
    #all_samples = sorted(set([sample for split in samples for sample in split[0]]))
    
    # Create the new fake splits
    fake_splits = []
    for site in unseen_sites:
        train = samples  # Use all samples for training
        # Create one fake sample name for each unseen site
        test = [f"{site}_1_1"]
        # Append the new split as a tuple of (train, test)
        fake_splits.append((train, test))
    
    return fake_splits


In [160]:
# Unseen site numbers
unseen_sites = [7, 50, 56, 21, 41, 47]

# Create fake cross-validation splits
fake_cv_splits = create_fake_cv(samples, unseen_sites)

# Save the fake splits to a new pickle file
output_path = '../jacknife_first_gen/fake_splits_samples_for_unseen_sites.pkl'
with open(output_path, 'wb') as file:
    pickle.dump(fake_cv_splits, file)

print(f"Fake cross-validation splits saved to {output_path}")

Fake cross-validation splits saved to ../jacknife_first_gen/fake_splits_samples_for_unseen_sites.pkl


In [161]:
file_path = '../jacknife_first_gen/fake_splits_samples_for_unseen_sites.pkl'

# Open and load the .pkl file
with open(file_path, 'rb') as file:
    samples_fake = pickle.load(file)

In [169]:
len(samples_fake)

6

In [165]:
len(samples_fake[0][0])

326

In [167]:
samples_fake[0][1]

['7_1_1']

In [168]:
samples_fake[1][1]

['50_1_1']

In [170]:
# Loop through splits 0 to 31
for split in range(len(samples)):
    # Define the folder path
    folder_path = f'results_sites_unseen/split_{split}'
    # Create the directory (makedirs allows creating intermediate directories if they don't exist)
    os.makedirs(folder_path, exist_ok=True)

In [171]:
pwd

'/carnegie/nobackup/scratch/tbellagio/gea_grene-net/binomial_regression_firstgen_go'

In [91]:
# create sbatch files to submit on cedar server
shfiles = []
for split in range(len(samples_fake)):
    for partition in partitions:
        seed = random.randint(1,100000000)
        file = f'shfiles/partition_{partition}_{split}.sh'
        cmd = f'python run_partition_binomial_reg_first_gen_unseen_sites.py {partition} {split}'
        text = f'''#!/bin/bash
#SBATCH --job-name=run_partition_binomial_reg{partition}_{split}
#SBATCH --time=1:00:00  # Time limit set to 4 hours
#SBATCH --ntasks=1
#SBATCH --mem-per-cpu=30gb
#SBATCH --output=run_partition_binomial_reg_{partition}_{split}_%j.out
#SBATCH --mail-user=tbellagio@carnegiescience.edu
#SBATCH --mail-type=FAIL

module load python/3.11_conda
conda activate /home/tbellagio/miniforge3/envs/pipeline_snakemake
export LD_LIBRARY_PATH="/home/tbellagio/miniforge3/envs/run_baypass/lib:$LD_LIBRARY_PATH"
cd /carnegie/nobackup/scratch/tbellagio/gea_grene-net/binomial_regression_firstgen_go
{cmd}

'''
        with open(file, 'w') as o:
            o.write("%s" % text)
        shfiles.append(file)

In [92]:
subprocess.run(["sbatch", shfiles[0]], check=True)

Submitted batch job 43078


CompletedProcess(args=['sbatch', 'shfiles/partition_0_0.sh'], returncode=0)

In [283]:
len(shfiles)

630

In [284]:
results/split_13/partition199
results/split_28/partition31.csv
results/split_29/partition0.csv

NameError: name 'split_13' is not defined

In [271]:
[i for i in shfiles if 'partition_199_13' in i]

['shfiles/partition_199_13.sh']

In [None]:
## now run the shfiles
for shfile in shfiles:
    # Submit each sbatch script to the SLURM scheduler
    subprocess.run(["sbatch", shfile], check=True)

In [238]:
files = os.listdir('../baypass_lastgen/individual_gfiles_last_gen/')
loci_names = [file for file in files if 'loci' in file]

In [95]:
for split in range(len(samples)):
    print(split)
    print(len(os.listdir(f'results/split_{split}')))

0
203
1
202
2
202
3
202
4
202
5
202
6
202
7
202
8
202
9
202
10
202
11
202
12
202
13
202
14
202
15
202
16
202
17
202
18
202
19
202
20
202
21
202
22
202
23
202
24
202
25
202
26
202
27
202
28
202
29
202
30
202


In [97]:
for split in range(len(samples)):
    print(split)
    partitions_r = {}
    for i in range(len(partitions)):  # /baypass_first_gen/individual_gfiles/
        pickle_file_path = f'../baypass_first_gen/individual_gfiles/loci_partition_{i}'
        with open(pickle_file_path, 'rb') as file:
            loci_f = pickle.load(file)
        results = pd.read_csv(f'results/split_{split}/partition{i}.csv')
        results['snp_id'] = loci_f
        partitions_r[i] = results
    results = pd.concat(partitions_r).reset_index(drop=True)
    results.to_csv(f'results/split_{split}/binomial_reg_results_last_gen.csv',index=None)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [66]:
all_loci = []
for i in loci_names:
    pickle_file_path = f'../baypass_lastgen/individual_gfiles_last_gen/{i}'
    with open(pickle_file_path, 'rb') as file:
        loci_f = pickle.load(file)
    all_loci.append(loci_f)

In [67]:
flattened_loci = [item for sublist in all_loci for item in sublist]

In [68]:
len(flattened_loci)

1048635

In [74]:
1054574 - 1055248

-674