In [2]:
import sys
import numpy as np
import pandas as pd
from io import *
from sklearn import linear_model


#path to the formulation library (CSV, one formulation per row)
allforms_path = "allforms_data_clean.csv"

#########################################################
###     column indices into the library array      ###
#identities
col_lib, col_phos, col_ion, col_peg, col_dna = range(5)

#formulation (percent comp)
col_per_phos, col_per_chol, col_per_ion, col_per_peg = range(5, 9)

#results (col_size and col_stdev are in nm)
col_encap, col_size, col_stdev = range(9, 12)

###########################################################

#input vars. Users will be prompted to enter a number corresponding to the
#reagent provided in the prompt. remove_irrelevant applies no filter for an
#input that is 0.
phospholipid, ionizable_lipid, lipid_PEG, surface_DNA = 1, 2, 2, 0
user_size = 0

###########################################################






In [1]:
#keep only the entries that match the specified reagents (an input of 0 means that reagent is unspecified)

def remove_irrelevant(input_array, phos_input, ion_input, peg_input, dna_input):
    """Return the rows of ``input_array`` that match every specified reagent.

    Each ``*_input`` is a numeric reagent code. A value of 0 means
    "unspecified": no filtering is done on that reagent's column.

    Parameters
    ----------
    input_array : 2-D numpy array
        One formulation per row, indexed by the notebook-level ``col_*``
        constants.
    phos_input, ion_input, peg_input, dna_input : number
        Codes to keep in ``col_phos``, ``col_ion``, ``col_peg`` and
        ``col_dna`` respectively.

    Returns
    -------
    numpy array
        ``input_array`` itself when no reagent is specified, otherwise the
        matching subset. The subset may be empty, in which case a caution is
        printed.
    """
    requested = (
        (col_phos, phos_input),
        (col_ion, ion_input),
        (col_peg, peg_input),
        (col_dna, dna_input),
    )

    # The previous `a == 0 & b == 0 & ...` test only ever checked the first
    # input, because `&` binds tighter than `==`. Check every input explicitly.
    if all(code == 0 for _, code in requested):
        return input_array

    # Compare against the arguments (not notebook globals) so the function
    # honours whatever the caller actually passed in.
    keep = np.ones(input_array.shape[0], dtype=bool)
    for column, code in requested:
        if code != 0:
            keep &= input_array[:, column] == code
    query_array = input_array[keep]

    #Give a heads up
    if query_array.size == 0:
        print("CAUTION: No training data for the combination of inputted reagents\n")

    return query_array
 


In [27]:
#training one or two models

def train_model(total, feature_start, feature_end, result_col):
    """Fit a single liblinear LogisticRegression on ``total`` and return it.

    Features are the columns strictly after ``feature_start`` and before
    ``feature_end`` (i.e. ``feature_start + 1 .. feature_end - 1``).
    The target is column ``result_col``, flattened to 1-D.

    NOTE(review): LogisticRegression is a classifier, so the target values
    are handled as class labels. Confirm this is intended if ``result_col``
    holds a continuous measurement.
    """
    feature_cols = slice(feature_start + 1, feature_end)
    target_cols = slice(result_col, result_col + 1)

    features = total[:, feature_cols]           # model inputs
    targets = total[:, target_cols].flatten()   # values to predict

    # 'liblinear' used due to small dataset sizes
    classifier = linear_model.LogisticRegression(solver='liblinear')
    return classifier.fit(features, targets)

def train_two_models(refined, total, feature_start, feature_end, result_col):
    """Fit two independent liblinear LogisticRegression models.

    One model is fitted on ``refined`` (the reagent-specific subset) and one
    on ``total`` (every formulation). Both use the same column layout:
    features are columns ``feature_start + 1 .. feature_end - 1`` and the
    target is column ``result_col`` flattened to 1-D.

    Returns
    -------
    tuple
        ``(focus, broad)``: the model fitted on ``refined`` followed by the
        model fitted on ``total``.
    """
    feature_cols = slice(feature_start + 1, feature_end)
    target_cols = slice(result_col, result_col + 1)

    def fit_on(dataset):
        # 'liblinear' used due to small dataset sizes
        estimator = linear_model.LogisticRegression(solver='liblinear')
        return estimator.fit(dataset[:, feature_cols],
                             dataset[:, target_cols].flatten())

    focus = fit_on(refined)
    broad = fit_on(total)
    return (focus, broad)

In [34]:
#main controller cell

#convert raw lib into numpy array
allforms = np.genfromtxt(allforms_path, delimiter=",", skip_header=1)

#isolate relevant data
query_array = remove_irrelevant(allforms, phospholipid, ionizable_lipid, lipid_PEG, surface_DNA)

if query_array.shape == allforms.shape or query_array.shape[0] == 0:
    #Either nothing was filtered out, or no formulation matched the query.
    #In both cases a single model on the full library is all that can be
    #trained, so it serves as both predictors. train_model needs the same
    #column arguments as train_two_models.
    allforms_predictor = train_model(allforms, col_dna, col_encap, col_size)
    query_predictor = allforms_predictor

else:
    query_predictor, allforms_predictor = train_two_models(query_array, allforms, col_dna, col_encap, col_size)


#example formulation, in feature-column order: percent phos, chol, ion, peg
testarray = np.array([20, 50, 2.5, 2.5]).reshape(1,-1)
print(allforms_predictor.predict(testarray))

[277.]
