In [1]:
import numpy as np

In [2]:
def preprocess(entry1, entry2, entry3, entry4, entry5, entry6, entry7, entry8, entry9):
    """Preprocess the values entered by the user in the flask app.

    The user fills in the form of the flask app. The raw values are read and must be
    preprocessed before being used for prediction.
    The model was fit using scikit-learn, so categorical variables need to be represented
    as dummies for the user input to be usable for prediction.
    The output of this function contains all necessary dummies and is ready to be fed
    into the model.

    Layout of the returned row (18 columns):
        0-4   : the 5 numerical variables
        5-6   : the 2 binary variables
        7-15  : the 9 "department" dummies (Accounting is the reference category)
        16-17 : the 2 "salary" dummies (High is the reference category)

    Args:
        entry1, entry2, entry3, entry4, entry5: The numerical variables. Each one is
            converted with ``float``, so numbers and numeric strings are both accepted.
        entry6, entry7: The binary variables. Each one is converted with ``int``.
        entry8: Column (7-15) of the "department" dummy to set to 1, given as an int or
            a numeric string, or ``"drop"`` for the reference category.
        entry9: Column (16-17) of the "salary" dummy to set to 1, given as an int or
            a numeric string, or ``"drop"`` for the reference category.

    Returns:
        numpy.ndarray: A float array of shape (1, 18) holding the processed data.

    Raises:
        ValueError: If a value cannot be converted to a number, or if a dummy column
            lies outside the range reserved for its variable.
    """
    # 5 numerical variables + 2 binary variables. Every numerical entry is converted
    # explicitly: form values arrive as strings, and a single leftover string would
    # turn the whole array into a string array.
    features = [float(entry1), float(entry2), float(entry3), float(entry4),
                float(entry5), int(entry6), int(entry7)]
    # there are 10 possible categories (9 dummies) for the "department" variable
    # there are 3 possible categories (2 dummies) for the "salary" variable
    # in total, 11 dummies ==> add 11 zeros, then switch the relevant dummies to 1
    features += [0] * 11

    dummy_specs = (
        (entry8, 7, 15, "department"),
        (entry9, 16, 17, "salary"),
    )
    for entry, first, last, label in dummy_specs:
        # "drop" marks the reference category, which has no dummy
        # (to avoid perfect multicollinearity)
        if str(entry) == "drop":
            continue
        column = int(entry)
        # An out-of-range column would silently overwrite another feature.
        if not first <= column <= last:
            raise ValueError(
                "%s dummy column must be between %d and %d, got %r"
                % (label, first, last, entry)
            )
        features[column] = 1

    return np.array([features], dtype=float)

In [9]:
def give_promotion(data, inplace=True):
    """Mark the employee in the first row as promoted and raise the salary one level.

    Column 6 of the first row is set to 1. The salary dummies in columns 16 and 17
    are then moved up one level:
        [1, 0] (low)    -> [0, 1] (medium)
        [0, 1] (medium) -> [0, 0] (high)
    Any other combination, including [0, 0] (high), is left unchanged.

    Args:
        data: A 2-D array (or list of lists) whose first row has at least 18 columns.
        inplace (bool): If True (default, the historical behaviour), ``data`` itself
            is modified and returned. If False, ``data`` is left untouched and a
            modified deep copy is returned.

    Returns:
        The modified data: the same object as ``data`` when ``inplace`` is True,
        otherwise a new copy.
    """
    if not inplace:
        # Work on a private copy so the caller keeps the "before promotion" row.
        import copy
        data = copy.deepcopy(data)

    row = data[0]  # for an ndarray this is a view, so writes go through to data
    row[6] = 1

    salary = list(row[16:18])
    if salary == [1, 0]:
        row[16:18] = [0, 1]
    elif salary == [0, 1]:
        row[16:18] = [0, 0]
    return data

In [16]:
# Example input: department dummy in column 13, salary left at the reference category.
x = preprocess(
    entry1=0.97, entry2=0.6, entry3=4, entry4=262, entry5=3,
    entry6=True, entry7=False,
    entry8='13', entry9='drop',
)
print(x)
# give_promotion modifies x itself, so from here on x holds the promoted row.
print(give_promotion(x))

[[   0.97    0.6     4.    262.      3.      1.      0.      0.      0.
     0.      0.      0.      0.      1.      0.      0.      0.      0.  ]]
[[   0.97    0.6     4.    262.      3.      1.      1.      0.      0.
     0.      0.      0.      0.      1.      0.      0.      0.      0.  ]]


Recall how the salary dummies are encoded:
- x[0][16] = 1 means low salary
- x[0][17] = 1 means medium salary
- both equal to 0 means high salary

In [19]:
# Show the two salary dummies of x (columns 16 and 17) as a plain list,
# e.g. to compare them against [0, 1].
list(x[0][16:18])

[0.0, 1.0]

## Old version of `preprocess` (kept for reference; running the cell below redefines `preprocess`)

In [2]:
def preprocess(a_orig):
    """Turn the raw form input into a flat list of numeric features.

    Args:
        a_orig (list): The user input, 9 items long:
            items 0-4 are kept as they are,
            items 5-6 are "Yes"/"No" answers,
            item 7 is the department name,
            item 8 is the salary level ("Low", "Medium" or "High").

    Returns:
        list: A new list with 20 items: the first 7 inputs (the "Yes"/"No" answers
        mapped to 1/0), 11 dummies for department and salary (columns 7-17), and
        two products: item 6 times item 3, and item 6 times item 4.
        ``a_orig`` itself is not modified.

    Raises:
        KeyError: If a "Yes"/"No" answer, a department or a salary level is not
            one of the known labels.
    """
    yes_no = {"Yes": 1, "No": 0}
    # Department dummies occupy columns 7-15; "Accounting" is the category without a dummy.
    department_column = {
        label: column
        for column, label in enumerate(
            ("HR", "IT", "Management", "Marketing", "Product management",
             "R&D", "Sales", "Support", "Technical"),
            start=7,
        )
    }
    # Salary dummies occupy columns 16-17; "High" is the category without a dummy.
    salary_column = {"Low": 16, "Medium": 17}

    # Slicing copies, so the caller's list stays untouched.
    features = a_orig[0:7]
    for position in (5, 6):
        features[position] = yes_no[features[position]]
    features = features + [0] * 11

    department = a_orig[7]
    if department != "Accounting":
        features[department_column[department]] = 1

    salary = a_orig[8]
    if salary != "High":
        features[salary_column[salary]] = 1

    # Two extra columns: the second binary flag multiplied by items 3 and 4.
    flag = features[6]
    features.extend([flag * features[3], flag * features[4]])
    return features

In [3]:
# Display the raw input list, presumably the argument given to the old preprocess.
# NOTE(review): a_orig is not assigned in any cell visible here; confirm where it is defined.
a_orig

[12.0, 4.0, 4.0, 1.0, 5.0, 'Yes', 'Yes', 'Technical', 'Medium']