In [15]:
import subprocess
from rpy2.robjects import r
from rc_data_class import RcData
from rc_folds_class  import rcFolds
from rc_pred_ann_model import PredAnnModel
from rc_individual_fold import RcFoldForANN
import numpy as np
import random
import pandas as pd
import random
import matplotlib.pyplot as plt
import pickle
import time

def get_genes_list(p_thresh, split_train):
    # Define the R script path
    r_script = "rc_get_diff_genes.r"
    
    # Build the command to run the R script
    command = ["Rscript", r_script, str(p_thresh), str(split_train)]
    
    result = subprocess.run(command, capture_output=True, text=True)
    
    # Check if the R script ran successfully
    if result.returncode == 0:
        print("R script executed successfully.")
    
        # Read the generated file
        rds_path = '/tmp/work/RCproject_code/sean_ann_python/ann_gene_set.rds'
        current_genes = r.readRDS(rds_path)
        print(len(current_genes))
        
    else:
        print("Error in R script execution:")
        print(result.stderr)

    return(current_genes.tolist())

def get_first_gen(n_pop,input_genes):
    first_gen = [];
    gene_list_length = len(input_genes)  # Length of the gene list
    for i in range(n_pop):
        binary_list = [random.choice([0, 1]) for _ in range(gene_list_length)]  # Create a binary list of the same length
        binary_array = np.array(binary_list)  # Convert binary list to a NumPy array
        first_gen.append(binary_array)

    return(first_gen)

def evaluate_individuals(input_gen,gene_list,folds,input_data,num_epochs):
    auc_averages = []
    current_folds = rcFolds(input_data,folds)
    current_gen_member = 1
    for gen_member in input_gen:
        filtered_list = np.array(genes_list)[gen_member == 1] # Use binary array to filter the genes list
        current_aucs = []
        for i in range(folds):
            current_fold = RcFoldForANN(current_folds,i)
            fold_count = i + 1
            print(f"Currently training, population member {current_gen_member}, with fold {fold_count}.")
            current_model = PredAnnModel(current_fold,current_genes, num_epochs = num_epochs)
            test_auc_list = current_model.test_auc_list
            current_auc = max(test_auc_list)
            current_aucs.append(current_auc)
        current_average_auc = np.mean(current_aucs)
        auc_averages.append(current_average_auc)
        current_gen_member += 1
    ga_df.loc[len(ga_df)] = auc_averages
    return auc_averages, input_gen

# Crossover function
def crossover_and_mutate(current_gen, current_aucs, mutation_rate):

    indices = np.argsort(current_aucs)[-2:][::-1]  # Sort, pick the last two, and reverse to get highest first
    parent1 = current_gen[indices[0]]
    parent2 = current_gen[indices[1]]
    
    # Ensure parents are the same size
    if len(parent1) != len(parent2):
        raise ValueError("Parents must have the same length.")

    # Randomly choose a crossover point
    crossover_point = random.randint(1, len(parent1) - 1)

    # Create children by combining parts of each parent
    child = np.concatenate((parent1[:crossover_point], parent2[crossover_point:]))
    child = bit_flip_mutation(child, mutation_rate)

    return child

def bit_flip_mutation(individual, mutation_rate=0.01):
    # Go through each gene in the individual
    for i in range(len(individual)):
        if random.random() < mutation_rate:  # If random number is below mutation rate, flip the gene
            individual[i] = 1 - individual[i]  # Flip 0 to 1, or 1 to 0
    return individual

def plot_row_averages(df):
    """
    Plots the average of each row in the given DataFrame.
    
    Parameters:
    df (pd.DataFrame): Input DataFrame containing numerical values.
    """
    row_averages = df.mean(axis=1)  # Compute the average across each row
    
    plt.figure(figsize=(10, 5))
    plt.plot(row_averages, marker='o', linestyle='-', color='b', label='Row Averages')
    
    plt.xlabel("Generation")
    plt.ylabel("Average Value")
    plt.title("Average of Each Row in DataFrame")
    plt.legend()
    plt.grid(True)
    
    plt.show()
    
ga_df = pd.DataFrame(columns = ['n_1','test_auc'])

population_size = 5
p_value = 0.1
split_train = True
folds = 5
num_epochs = 10
mutation_rate = 0.01
n_generations = 5

# making a data frame to keep track of GA progress
column_names = [f'auc_{i+1}' for i in range(population_size)]
# Initialize an empty DataFrame with columns
ga_df = pd.DataFrame(columns=column_names)


current_data = RcData()
genes_list = get_genes_list(p_value, split_train)
first_gen = get_first_gen(population_size,genes_list)

Data successfully loaded.
R script executed successfully.
910


In [16]:
current_folds = rcFolds(current_data,folds)
current_fold = RcFoldForANN(current_folds,1)

In [21]:
start_time = time.time()
current_model = PredAnnModel(current_fold,genes_list, num_epochs = num_epochs)
end_time = time.time()

print(end_time-start_time)

Epoch 0, Average Outcome Loss: 0.7035590277777778, Average Accuracy: 0.4791666666666667, Test AUC: 0.5000, Test Accuracy: 0.6486
8.72707486152649


In [38]:
import time
import threading

start_time = time.time()

# Shared dictionary to store results
results = {}

def initiate_fold(current_folds,genes_list,fold,fold_name):
    current_fold = RcFoldForANN(current_folds,0)
    current_model = PredAnnModel(current_fold, genes_list, num_epochs=num_epochs)
    results[fold_name] = current_model.test_auc_list  # Store result

t1 = threading.Thread(target=initiate_fold(current_folds,0,'first'))
t2 = threading.Thread(target=initiate_fold(current_folds,1,'second'))
t3 = threading.Thread(target=initiate_fold(current_folds,2,'third'))
t4 = threading.Thread(target=initiate_fold(current_folds,3,'fourth'))
t5 = threading.Thread(target=initiate_fold(current_folds,4,'fifth'))

t1.start()
t2.start()
t3.start()
t4.start()
t5.start()

t1.join()
t2.join()
t3.join()
t4.join()
t5.join()

# Retrieve results
print(max(results['first']))
print(max(results['second']))
print(max(results['third']))
print(max(results['fourth']))
print(max(results['fifth']))

end_time = time.time()

Epoch 0, Average Outcome Loss: 0.6848415798611112, Average Accuracy: 0.5486111111111112, Test AUC: 0.5000, Test Accuracy: 0.6622
Epoch 0, Average Outcome Loss: 0.7014973958333334, Average Accuracy: 0.5555555555555556, Test AUC: 0.5000, Test Accuracy: 0.3378
Epoch 0, Average Outcome Loss: 0.7181532118055556, Average Accuracy: 0.4861111111111111, Test AUC: 0.5000, Test Accuracy: 0.3378
Epoch 0, Average Outcome Loss: 0.7392035590277778, Average Accuracy: 0.4930555555555556, Test AUC: 0.5000, Test Accuracy: 0.6622
Epoch 0, Average Outcome Loss: 0.7769097222222222, Average Accuracy: 0.5, Test AUC: 0.5000, Test Accuracy: 0.3378
0.5
0.693061224489796
0.5640816326530612
0.5
0.5


In [41]:
np.mean([
    max(results['first']),
    max(results['second']),
    max(results['third']),
    max(results['fourth']),
    max(results['fifth'])
])

0.5514285714285714

In [28]:
import time
import threading

start_time = time.time()

# Shared dictionary to store results
results = {}

# Function that runs a fold with a given fold number
def run_fold(fold_number):
    current_folds = rcFolds(current_data,fold_number)
    current_model = PredAnnModel(current_fold, genes_list, num_epochs=num_epochs)
    results[fold_number] = current_model.test_auc_list  # Store result

# Number of parallel folds
num_folds = 5  

# Create and start threads
threads = []
for i in range(num_folds):
    t = threading.Thread(target=run_fold, args=(i,))
    threads.append(t)
    t.start()

# Wait for all threads to finish
for t in threads:
    t.join()

# Retrieve results
for i in range(1, num_folds + 1):
    print(f"Max test AUC for fold {i}: {max(results[i])}")

end_time = time.time()
print(f"Total time: {end_time - start_time} seconds")

Exception in thread Thread-35 (run_fold):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 761, in run_closure
Exception in thread Thread-36 (run_fold):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    _threading_Thread_run(self)
  File "/opt/conda/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_335146/3105610512.py", line 11, in run_fold
    self.run()
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 761, in run_closure
    _threading_Thread_run(self)
  File "/opt/conda/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_335146/3105610512.py", line 11, in run_fold
  File "/tmp/work/RCproject_code/sean_an

Epoch 0, Average Outcome Loss: 0.7706163194444444, Average Accuracy: 0.4652777777777778, Test AUC: 0.5000, Test Accuracy: 0.3514
Epoch 0, Average Outcome Loss: 0.7281901041666666, Average Accuracy: 0.5, Test AUC: 0.5000, Test Accuracy: 0.3514
Epoch 0, Average Outcome Loss: 0.7246636284722222, Average Accuracy: 0.5, Test AUC: 0.5000, Test Accuracy: 0.6486


KeyError: 1

In [26]:
print('test')

test


In [35]:
current_fold = RcFoldForANN(current_folds,4)

In [31]:
dir(current_folds)

['X',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'folds_count',
 'genes_list',
 'get_folds',
 'rc_data',
 'x_test_folds',
 'x_train_folds',
 'y',
 'y_stratify',
 'y_test_folds',
 'y_train_folds']

In [42]:
pool_size = multiprocessing.cpu_count() 

In [44]:
from multiprocessing import Pool

def f(x):
    return x*x

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(f, [1, 2, 3]))

  self.pid = os.fork()


[1, 4, 9]


In [45]:
from multiprocessing import Process

def f(name):
    print('hello', name)

if __name__ == '__main__':
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()


hello bob


In [49]:
from multiprocessing import Process

def f(name):
    print('hello', name)


def t():
    if __name__ == '__main__':
        p1 = Process(target=f, args=('bob',))
        p2 = Process(target=f, args=('jim',))
        p1.start()
        p2.start()
        p1.join()
        p2.join()

In [50]:
t()

hello hellobob
 jim


In [51]:
 print('hello', 'bob')
 print('hello', 'jim')

hello bob
hello jim


In [58]:
!top

[?1h=[H[2J[mtop - 18:00:55 up 89 days,  4:56,  0 users,  load average: 40.29, 32.70, 29.78[m[m[m[m[K
Tasks:[m[m[1m 159 [m[mtotal,[m[m[1m   4 [m[mrunning,[m[m[1m  16 [m[msleeping,[m[m[1m   0 [m[mstopped,[m[m[1m 139 [m[mzombie[m[m[m[m[K
%Cpu(s):[m[m[1m 39.9 [m[mus,[m[m[1m  9.6 [m[msy,[m[m[1m  0.0 [m[mni,[m[m[1m 46.9 [m[mid,[m[m[1m  0.9 [m[mwa,[m[m[1m  2.4 [m[mhi,[m[m[1m  0.4 [m[msi,[m[m[1m  0.0 [m[mst[m[m[m[m[K
MiB Mem :[m[m[1m 257534.6 [m[mtotal,[m[m[1m 189290.9 [m[mfree,[m[m[1m  27508.6 [m[mused,[m[m[1m  40735.1 [m[mbuff/cache[m[m[m[m[K
MiB Swap:[m[m[1m  73728.0 [m[mtotal,[m[m[1m  39217.8 [m[mfree,[m[m[1m  34510.2 [m[mused.[m[m[1m 227776.8 [m[mavail Mem [m[m[m[m[K
[K
[7m    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND  [m[m[K
[m 408474 root      20   0   23.5g   1.6g  55808 S 846.7   0.6   3:59.88 python   [m[m[K
