In [1]:
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.linear_model import LinearRegression as Lin_R
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

pd.options.mode.chained_assignment = None
tqdm.pandas()

In [2]:
def EI(x0, max_mean_y):
    """Expected improvement of a candidate over the current best value.

    Computes ``(mu - best) * Phi(z) + sigma * phi(z)`` with
    ``z = (mu - best) / sigma``, where ``Phi``/``phi`` are the standard normal
    CDF/PDF.

    Parameters
    ----------
    x0 : pair ``(mean_y_new, sigma_y_new)``
        Predicted mean and predicted standard deviation. Each element may be
        a scalar or an array-like; arrays are evaluated element-wise.
    max_mean_y : float
        The incumbent (best value so far) that the candidate must beat.

    Returns
    -------
    NumPy scalar for scalar inputs, ``ndarray`` for array inputs.

    Notes
    -----
    When ``sigma_y_new`` is exactly 0 the prediction is deterministic and the
    formula's limit is ``max(mean_y_new - max_mean_y, 0)``. That limit is
    returned directly instead of dividing by zero (which raised
    ``ZeroDivisionError`` for Python floats and produced NaN for NumPy values
    when the mean equalled the incumbent).
    """
    mean_y_new, sigma_y_new = x0

    improvement = np.asarray(mean_y_new, dtype=float) - max_mean_y
    sigma = np.asarray(sigma_y_new, dtype=float)

    # Replace zero sigmas by a dummy 1.0 for the division only; those entries
    # are overwritten with the closed-form limit below.
    degenerate = sigma == 0
    safe_sigma = np.where(degenerate, 1.0, sigma)

    z = improvement / safe_sigma
    exp_imp = improvement * norm.cdf(z) + sigma * norm.pdf(z)
    exp_imp = np.where(degenerate, np.maximum(improvement, 0.0), exp_imp)

    # Indexing with () unwraps a 0-d array to a NumPy scalar and leaves
    # n-d arrays unchanged, so scalar callers still get a scalar back.
    return exp_imp[()]



def prediction_interval(inputs, nbootstraps=1000, random_state=None):
    """Bootstrap the model to get a per-candidate predictive mean and spread.

    The model is refit ``nbootstraps`` times on rows of the training data
    resampled with replacement; each refit predicts every row of ``X``. The
    row-wise mean and standard deviation across those predictions are
    returned.

    Parameters
    ----------
    inputs : sequence ``(model, X_train, y_train, X)``
        ``model`` must expose ``fit(X, y)`` and ``predict(X)``. ``X_train`` and
        ``y_train`` are the training features/targets (array-like; pandas
        objects are accepted). ``X`` is the array of candidates to predict and
        must expose ``.shape``.
    nbootstraps : int, default 1000
        Number of bootstrap refits.
    random_state : int or None, default None
        Seed for the resampling. ``None`` keeps drawing from NumPy's global
        random state, exactly as before.

    Returns
    -------
    (mean, sd) : pair of lazy dask Series
        One entry per row of ``X``. Call ``.compute()`` to materialise them.
        ``sd`` uses dask's default ``std`` settings.

    Raises
    ------
    ValueError
        If the training set is empty or ``nbootstraps`` is less than 1.

    Notes
    -----
    ``model`` is refit in place, so after this call it holds the fit from the
    last bootstrap sample, not the fit on the full training data.
    """
    model, X_train, y_train, X = inputs

    # Work on plain arrays so that indexing below is positional. Indexing a
    # pandas Series with an integer array is a *label* lookup, which selects
    # the wrong rows (or raises) unless the index happens to be 0..n-1.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    # Number of training samples
    n = X_train.shape[0]
    if n == 0:
        raise ValueError("prediction_interval needs at least one training sample")
    if nbootstraps < 1:
        raise ValueError("nbootstraps must be at least 1")

    # The legacy global generator is kept as the default so unseeded (or
    # globally seeded) callers see the same sampling behaviour as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    bootstrap_preds = np.empty((X.shape[0], nbootstraps))

    for b in tqdm(range(nbootstraps)):
        train_idxs = rng.choice(n, size=n, replace=True)
        model.fit(X_train[train_idxs], y_train[train_idxs])
        bootstrap_preds[:, b] = model.predict(X)

    # One column per bootstrap replicate; reduce across columns per candidate.
    ddf = dd.from_array(bootstrap_preds)
    del bootstrap_preds
    mean = ddf.mean(axis=1)
    sd = ddf.std(axis=1)
    return mean, sd


    
def get_best_alloy_EI(df_test,max_mean_y):
    """Rank candidates by expected improvement and return the best one.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Candidate table with columns ``'Tmean'`` (bootstrap mean), ``'Tsd'``
        (bootstrap standard deviation) and ``'Tpred'`` (point prediction).
        It is modified in place: an ``'EI'`` column is added and the rows are
        sorted by it in descending order.
    max_mean_y : float
        Incumbent value passed to ``EI`` as the target to beat.

    Returns
    -------
    (index_label, t_pred)
        Index label of the row with the highest expected improvement and that
        row's ``'Tpred'`` value.

    Raises
    ------
    ValueError
        If ``df_test`` has no rows.
    """
    if df_test.empty:
        raise ValueError("df_test is empty: there are no candidates to rank")

    # EI is pure element-wise arithmetic, so evaluate it once on whole columns
    # instead of row-by-row through dask worker processes.
    candidate_mean = df_test['Tmean'].to_numpy(dtype=float)
    candidate_sd = df_test['Tsd'].to_numpy(dtype=float)
    df_test['EI'] = EI((candidate_mean, candidate_sd), max_mean_y)

    # A stable sort makes the winner deterministic when several rows tie.
    df_test.sort_values(by=['EI'], ascending=False, inplace=True, kind='mergesort')

    # Read the top row positionally: a label lookup would return several rows
    # if the index contained duplicate labels.
    return df_test.index[0], df_test['Tpred'].iloc[0]



def model_fit(df, df_test):
    """Fit a standardised linear regression and predict over the search space.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. Column ``'Tp'`` is the target; every other column is
        used as a feature.
    df_test : pandas.DataFrame
        Search space. Columns ``'en'``, ``'ven'`` and ``'dor'`` are read and a
        ``'dor2'`` feature (``dor`` squared) is derived from them. ``df_test``
        itself is not modified.

    Returns
    -------
    (X, y, model, X_searchspace, y_pred, max_mean_y)
        Scaled training features, training target, fitted model, scaled
        search-space features, point predictions for the search space, and
        the largest observed target value.
    """
    X = df.drop(['Tp'], axis=1)
    y = df['Tp']

    # .assign builds a new frame instead of writing a column into a slice of
    # df_test (the original pattern triggers pandas' chained-assignment check).
    searchspace = df_test[['en','ven','dor']].assign(
        dor2=lambda frame: frame['dor'] ** 2
    )

    # The training column order comes from the input frame, not from this
    # function, so put the same features in the same order before scaling.
    # Otherwise the scaler would standardise mismatched columns or reject them.
    # If the feature sets differ, leave the frame alone and let the scaler
    # report the mismatch.
    if set(searchspace.columns) == set(X.columns):
        searchspace = searchspace[list(X.columns)]

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_searchspace = scaler.transform(searchspace)

    model = Lin_R(fit_intercept = True, n_jobs = -1)
    model.fit(X, y)

    y_pred = model.predict(X_searchspace)
    # Best target observed so far; used downstream as the incumbent.
    max_mean_y = np.max(y)

    return X, y, model, X_searchspace, y_pred, max_mean_y


    
def train_loop(df ,df_test):
    """Run one fit / bootstrap / rank pass and return the best candidate.

    Fits the model via ``model_fit``, bootstraps predictive statistics via
    ``prediction_interval``, stores the results on ``df_test`` as the columns
    ``'Tpred'``, ``'Tmean'`` and ``'Tsd'`` (``df_test`` is modified in place),
    and hands the table to ``get_best_alloy_EI``.

    Returns
    -------
    (best_alloy_index, t_pred)
        Exactly what ``get_best_alloy_EI`` returns for the augmented table.
    """
    X_scaled, targets, estimator, X_candidates, point_preds, incumbent = model_fit(df, df_test)

    # Materialise the lazy bootstrap statistics (mean first, then sd) into
    # plain arrays.
    boot_mean, boot_sd = (
        stat.compute().values
        for stat in prediction_interval([estimator, X_scaled, targets, X_candidates])
    )

    # Drop local references to the large inputs before the ranking step.
    del df, X_scaled, X_candidates

    for column, values in (('Tpred', point_preds), ('Tmean', boot_mean), ('Tsd', boot_sd)):
        df_test[column] = values

    best_alloy_index, t_pred = get_best_alloy_EI(df_test, incumbent)
    return best_alloy_index, t_pred


In [3]:
##if __name__ == "__main__":
#
#def main_function(df, df_test):
#    #df = pd.read_csv('training_data.csv', usecols=['en','ven','dor','Tp']) 
#    #df_test = pd.read_csv('search_space_2.csv')
#    df['dor2'] = df['dor'] ** 2
#    
#    best_alloy_index, optimal_temp = train_loop(df.copy(), df_test.copy())
#    
#    #get the best alloy composition.
#    df_test['predicted_target_variable'] = np.round(optimal_temp, 2)
#    optimal_alloy = df_test.iloc[best_alloy_index]
#    
#    return optimal_alloy

In [4]:
#train_df = pd.read_csv('training_data.csv', usecols=['en','ven','dor','Tp']) 
#test_df = pd.read_csv('search_space_2.csv')
#
#output = main_function(train_df, test_df)
#print(output)

In [5]:
#if __name__ == "__main__":
# Load the training data and add the squared 'dor' feature used by the model.
df = pd.read_csv('training_data.csv',usecols=['en','ven','dor','Tp'])
df['dor2'] = df['dor'] ** 2 

df_test = pd.read_csv('search_space_2.csv')

# train_loop returns a pair: (index label of the best candidate, its predicted
# target). Copies are passed because train_loop adds columns to and reorders
# the frame it is given.
best_alloy_index, optimal_temp = train_loop(df.copy(),df_test.copy())

# Get the best alloy composition.
# NOTE: this assigns the same rounded value to every row of df_test; it is
# only meaningful on the row selected below.
df_test['predicted_target_variable'] = np.round(optimal_temp, 2)
# best_alloy_index is an index *label*, so select with .loc rather than .iloc.
optimal_alloy = df_test.loc[best_alloy_index]
print(optimal_alloy)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:06<00:00,  5.37it/s]


KeyboardInterrupt: 