#### First, importing libraries and loading the model.

In [28]:
# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd
pd.set_option('display.max_columns', 100)

# Pickle for reading model files
import pickle

# Scikit-Learn's train_test_split function
from sklearn.model_selection import train_test_split

# Area Under ROC Curve
from sklearn.metrics import roc_auc_score

#### Next, load the final model saved from the previous module.

In [29]:
# Load the pickled pipeline from final_model.pkl and bind it to `clf`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load model files that come from a trusted source.
with open('final_model.pkl', 'rb') as f:
    clf = pickle.load(f)

## <span style="color:RoyalBlue"> 5.1 - Confirming My Model</span>

#### A.) Displaying the model object loaded earlier. I'm confirming a few key details.
* It should be a model <code style="color:steelblue">Pipeline</code>.
* The first step should be a <code style="color:steelblue">StandardScaler</code> preprocessing step.
* The second step should be a <code style="color:steelblue">RandomForestClassifier</code> model.

In [30]:
# Show the loaded object's steps and hyperparameters to confirm it is the expected pipeline
print(clf)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features=0.33,
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=123,
                                        verbose=0, warm_start=False))],
         verbose=False)

#### B.) First, loading the same analytical base table that was imported at the beginning of Workbook 4.

In [31]:
# Load the analytical base table (the same CSV used in Workbook 4) into `abt`
abt = pd.read_csv('analytical_base_table.csv')

#### C.) Next, splitting it into the exact same training and test sets used in Workbook 4.

In [32]:
# Target vector: the 'status' column of the ABT
y = abt['status']

# Feature matrix: every ABT column except the target
X = abt.drop(columns='status')

# 80/20 split, stratified on the target and seeded with 1234 so the
# partition is reproducible from one run to the next
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234, stratify=y
)

#### D.) Finally, using the model to predict <code style="color:steelblue">X_test</code> again.
* Then, printing the <code style="color:steelblue">roc_auc_score</code>.

In [33]:
# Predict class probabilities for X_test (one row per observation, one column per class)
pred = clf.predict_proba(X_test)

# Keep only the positive-class (1) column; a NumPy column slice replaces the
# per-row Python loop and keeps `pred` an array
pred = pred[:, 1]

# Print AUROC on the held-out test set
print( 'AUROC:', roc_auc_score(y_test, pred) )

AUROC: 0.9915194952019338


#### E.) Reading in <code style="color:crimson">'unseen_employee_data.csv'</code> and saving it to a new object called <code>raw_data</code>. Displaying the first 5 rows.

In [34]:
# Load the unseen, raw employee records into `raw_data`
raw_data = pd.read_csv('unseen_employee_data.csv')

# Preview the first 5 rows
raw_data.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure
0,228,management,,0.735618,2,,high,0.805661,3.0
1,229,product,,1.0,4,,low,0.719961,4.0
2,196,sales,1.0,0.557426,4,,low,0.749835,2.0
3,207,IT,,0.715171,3,,high,0.987447,3.0
4,129,management,,0.484818,2,,low,0.441219,3.0


#### F.) Trying to apply the model to this raw dataset. Trying to use model's <code style="color:steelblue">.predict_proba()</code> on this new, raw dataset.

In [35]:
# The raw frame still holds string columns, so the pipeline is expected to
# reject it. Catch and display the error instead of letting it propagate, so
# that "Restart Kernel -> Run All" can continue past this demonstration cell.
try:
    pred = clf.predict_proba(raw_data)
except ValueError as err:
    print('ValueError:', err)

ValueError: could not convert string to float: 'management'

<strong style="color:RoyalBlue">Expected ERROR:</strong>

<pre style="color:crimson">
\---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)

...

ValueError: could not convert string to float: 'low'
</pre>

Error was expected. I will go about fixing it in the next few steps.

## <span style="color:RoyalBlue"> 5.2 - Writing Pre-Modeling Functions</span>

All I need to do is write a few functions to **convert the raw data to the same format as the analytical base table**.
* That means I need to bundle together our data cleaning steps.
* Then I need to bundle together our feature engineering steps.
* I can skip the exploratory analysis steps because I didn't permanently alter our dataframe then. 
* Conveniently, I already have these steps centralized in <span style="color:royalblue">Workbook 2: ABT Construction</span>!

Let's start with data cleaning.

#### A.) Writing a function called <code style="color:steelblue">clean_data()</code> that bundles together all of the data cleaning steps.

In [36]:
def clean_data(df):
    """Apply the data cleaning steps to a raw employee DataFrame.

    Steps, in order:
      1. drop duplicate rows;
      2. drop rows whose department is 'temp';
      3. fill missing ``filed_complaint`` / ``recently_promoted`` with 0;
      4. rename department 'information_technology' to 'IT' and label
         missing departments as 'Missing';
      5. add ``last_evaluation_missing`` (1 where ``last_evaluation`` was
         null, else 0), then fill missing ``last_evaluation`` with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns ``department``,
        ``filed_complaint``, ``recently_promoted`` and ``last_evaluation``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame; the input frame is left unmodified.
    """
    # Drop duplicates
    df = df.drop_duplicates()

    # Drop temporary workers. The explicit copy gives us a frame we own, so the
    # column assignments below never write into a slice of another frame
    # (which is what triggers SettingWithCopyWarning).
    df = df[df['department'] != 'temp'].copy()

    # Missing filed_complaint / recently_promoted values should be 0
    df['filed_complaint'] = df['filed_complaint'].fillna(0)
    df['recently_promoted'] = df['recently_promoted'].fillna(0)

    # 'information_technology' should be 'IT'; missing departments become 'Missing'.
    # Assign the result back rather than calling inplace=True on `df.department`:
    # with pandas Copy-on-Write an in-place call on that temporary Series does
    # not update the DataFrame.
    df['department'] = (
        df['department']
        .replace('information_technology', 'IT')
        .fillna('Missing')
    )

    # Indicator variable for missing last_evaluation (must be computed before the fill)
    df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)

    # Fill missing values in last_evaluation with 0
    df['last_evaluation'] = df['last_evaluation'].fillna(0)

    # Return cleaned dataframe
    return df

Excellent, now I can clean my raw <code style="color:steelblue">raw_data</code> dataframe the same way.

#### B.) Creating a new DataFrame named <code style="color:steelblue">cleaned_data</code> using the function I just wrote.

In [37]:
# Create cleaned_data by running the raw frame through clean_data()
cleaned_data = clean_data(raw_data)

# Display first 5 rows
cleaned_data.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure,last_evaluation_missing
0,228,management,0.0,0.735618,2,0.0,high,0.805661,3.0,0
1,229,product,0.0,1.0,4,0.0,low,0.719961,4.0,0
2,196,sales,1.0,0.557426,4,0.0,low,0.749835,2.0,0
3,207,IT,0.0,0.715171,3,0.0,high,0.987447,3.0,0
4,129,management,0.0,0.484818,2,0.0,low,0.441219,3.0,0


Now, let's do the same for feature engineering.

#### C.) Next, writing a function called <code style="color:steelblue">engineer_features()</code> that compiles all of the feature engineering steps.

In [38]:
def engineer_features(df):
    """Add the engineered indicator and dummy features to a cleaned DataFrame.

    Indicator features (stored as 0/1 integers):
      * ``underperformer`` - last_evaluation < 0.6 and the evaluation was not missing
      * ``unhappy``        - satisfaction < 0.2
      * ``overachiever``   - last_evaluation > 0.8 and satisfaction > 0.7

    ``department`` and ``salary`` are then replaced by dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data containing ``last_evaluation``, ``last_evaluation_missing``,
        ``satisfaction``, ``department`` and ``salary``.

    Returns
    -------
    pandas.DataFrame
        A new, augmented DataFrame; the input frame is left unmodified.
    """
    # Create indicator features
    underperformer = (df['last_evaluation'] < 0.6) & (df['last_evaluation_missing'] == 0)
    unhappy = df['satisfaction'] < 0.2
    overachiever = (df['last_evaluation'] > 0.8) & (df['satisfaction'] > 0.7)

    # .assign() returns a new frame, so the caller's DataFrame does not
    # silently gain these columns as a side effect of calling this function.
    df = df.assign(
        underperformer=underperformer.astype(int),
        unhappy=unhappy.astype(int),
        overachiever=overachiever.astype(int),
    )

    # Create new dataframe with dummy features.
    # NOTE(review): get_dummies only emits columns for categories present in
    # `df`; a batch lacking some department/salary level would yield fewer
    # columns - confirm this is acceptable for the data being scored.
    df = pd.get_dummies(df, columns=['department', 'salary'])

    # Return augmented DataFrame
    return df


#### D.) Creating a new DataFrame named <code style="color:steelblue">augmented_data</code> using the function I just wrote.

In [39]:
# Create augmented_data by running the cleaned frame through engineer_features()
augmented_data = engineer_features(cleaned_data)

# Display first 5 rows
augmented_data.head()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,satisfaction,tenure,last_evaluation_missing,underperformer,unhappy,overachiever,department_IT,department_Missing,department_admin,department_engineering,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support,salary_high,salary_low,salary_medium
0,228,0.0,0.735618,2,0.0,0.805661,3.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,229,0.0,1.0,4,0.0,0.719961,4.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,196,1.0,0.557426,4,0.0,0.749835,2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
3,207,0.0,0.715171,3,0.0,0.987447,3.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,129,0.0,0.484818,2,0.0,0.441219,3.0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0



#### E.) Predicting class probabilities for <code style="color:steelblue">augmented_data</code> using the model.

In [40]:
# Predict class probabilities for the cleaned and feature-engineered unseen data
pred = clf.predict_proba(augmented_data)

# Print first 5 predictions (one row per observation, one column per class)
print( pred[:5] )

[[1.   0.  ]
 [0.98 0.02]
 [1.   0.  ]
 [1.   0.  ]
 [0.   1.  ]]


Error has been fixed! 

## <span style="color:RoyalBlue"> 5.3 - Constructing Custom Model Class</span>

Great, now let's package these functions together into a single **model class**. This is a convenient way to keep all of the logic for a given model in one place. Remember how when we were training our model, we imported <code style="color:steelblue">LogisticRegression</code> and <code style="color:steelblue">RandomForestClassifier</code> and <code style="color:steelblue">GradientBoostingClassifier</code>? We called them "algorithms," but they are technically **Python classes**. 

Python classes are structures that allow us to group related code, logic, and functions in one place. Those who remember our earlier discussion on object-oriented programming will have already recognized this concept. In this program, we only need to write some bare bones, very basic classes.

For example, each of those algorithms has the <code style="color:steelblue">fit()</code> and <code style="color:steelblue">predict_proba()</code> functions that allow you to train and apply models, respectively.

#### Custom Class: EmployeeRetentionModel

We're going to construct our own **custom Python class** for our employee retention model. 
* Thankfully, it doesn't need to be nearly as complex as those other algorithm classes because we're not actually using this to train the model. 
* Instead, we already have the model saved in a <code style="color:steelblue">final_model.pkl</code> file.
* We only need to include logic for cleaning data, feature engineering, and predicting new observations.

We'll show you a few pieces of code needed to declare the class, and then we'll have you fill in the rest.

To begin, we **name** our class, like so:

<pre style="color:steelblue">
class EmployeeRetentionModel:
</pre>

We use the <code style="color:steelblue">class</code> keyword to indicate that it's a class.

#### The <code>self.__init__()</code> Function

Then, we define a <code style="color:steelblue">self.\_\_init\_\_()</code> function. 

This is a special function that is automatically run whenever an **instance** of the class is first initialized (you'll see this in action in a few moments). We'll take this chance to load our model file. Python will then store the loaded model in the class instance, and you can use it in your other functions.


<pre style="color:steelblue">
    def __init__(self, model_location):
        with open(model_location, 'rb') as f:
            self.model = pickle.load(f)
</pre>

A few things to point out:
* Functions in classes must have <code style="color:steelblue">self</code> as the first argument.
* Likewise, when you call them from within the class, you need to prepend <code style="color:steelblue">self.</code> before the function name.
* The second argument, <code style="color:steelblue">model_location</code>, will be the file location of the saved final model.
* <code style="color:steelblue">self.model</code> is called an **instance attribute**. It's a variable that you set and can access elsewhere in the instance.

#### The <code>self.predict_proba()</code>  Function
Next, we'll define a <code style="color:steelblue">predict_proba()</code> function to apply our model to new data.

<pre style="color:steelblue">
    def predict_proba(self, X_new, clean=True, augment=True):
        if clean:
            X_new = self.clean_data(X_new)
        
        if augment:
            X_new = self.engineer_features(X_new)
            
        return X_new, self.model.predict_proba(X_new)
</pre>

A few things to point out:
* We give the option to clean / engineer features or not. This allows us to handle data that has already been cleaned.
* Notice how we are calling <code style="color:steelblue">**self**.clean_data()</code>... When you call class functions, you need to prepend <code style="color:steelblue">self</code>.
* After cleaning and engineering features, we use the <code style="color:steelblue">self.model</code> instance attribute for prediction.
* We also return <code style="color:steelblue">X_new</code> so we have the cleaned and augmented version.

#### The <code>self.clean_data()</code> and <code>self.engineer_features()</code> Functions

Finally, let's add the <code style="color:steelblue">self.clean_data()</code> and <code style="color:steelblue">self.engineer_features()</code> functions. 

We've started you off with the code below.

#### A.) Adding the <code style="color:steelblue">self.clean_data()</code> and <code style="color:steelblue">self.engineer_features()</code> to the code below.

In [41]:
class EmployeeRetentionModel:
    """Bundle a saved model with the cleaning and feature-engineering steps it needs.

    The class does no training: it unpickles an already-fitted model and
    exposes ``predict_proba`` for raw, cleaned, or fully augmented data.

    Attributes
    ----------
    model
        The object unpickled from ``model_location``; it must provide a
        ``predict_proba`` method.
    """

    def __init__(self, model_location):
        """Load the pickled model stored at ``model_location`` into ``self.model``."""
        # NOTE(review): pickle.load can execute arbitrary code while
        # deserializing; only pass model files from a trusted source.
        with open(model_location, 'rb') as f:
            self.model = pickle.load(f)

    def predict_proba(self, X_new, clean=True, augment=True):
        """Optionally clean and augment ``X_new``, then predict class probabilities.

        Parameters
        ----------
        X_new : pandas.DataFrame
            Observations to score.
        clean : bool, default True
            Run ``clean_data`` first; pass False for already-cleaned data.
        augment : bool, default True
            Run ``engineer_features`` next; pass False for data that already
            has the engineered features.

        Returns
        -------
        tuple
            ``(X_new, probabilities)``: the frame actually fed to the model and
            the result of ``self.model.predict_proba`` on it.
        """
        if clean:
            X_new = self.clean_data(X_new)

        if augment:
            X_new = self.engineer_features(X_new)

        return X_new, self.model.predict_proba(X_new)

    def clean_data(self, df):
        """Return a cleaned copy of ``df``; the input frame is left unmodified.

        Drops duplicate rows and 'temp' department rows, fills missing
        ``filed_complaint`` / ``recently_promoted`` with 0, renames
        'information_technology' to 'IT', labels missing departments 'Missing',
        adds ``last_evaluation_missing`` and fills missing ``last_evaluation``
        with 0.
        """
        # Drop duplicates
        df = df.drop_duplicates()

        # Drop temporary workers. The explicit copy gives us a frame we own, so
        # the assignments below never write into a slice of another frame.
        df = df[df['department'] != 'temp'].copy()

        # Missing filed_complaint / recently_promoted values should be 0
        df['filed_complaint'] = df['filed_complaint'].fillna(0)
        df['recently_promoted'] = df['recently_promoted'].fillna(0)

        # 'information_technology' should be 'IT'; missing departments become
        # 'Missing'. Assign back instead of using inplace=True on
        # `df.department`, which does not update the DataFrame under pandas
        # Copy-on-Write.
        df['department'] = (
            df['department']
            .replace('information_technology', 'IT')
            .fillna('Missing')
        )

        # Indicator variable for missing last_evaluation (computed before the fill)
        df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)

        # Fill missing values in last_evaluation with 0
        df['last_evaluation'] = df['last_evaluation'].fillna(0)

        # Return cleaned dataframe
        return df

    def engineer_features(self, df):
        """Return a copy of ``df`` with indicator and dummy features added.

        Adds 0/1 columns ``underperformer``, ``unhappy`` and ``overachiever``
        and replaces ``department`` / ``salary`` with dummy columns. The input
        frame is left unmodified.
        """
        # Create indicator features
        underperformer = (df['last_evaluation'] < 0.6) & (df['last_evaluation_missing'] == 0)
        unhappy = df['satisfaction'] < 0.2
        overachiever = (df['last_evaluation'] > 0.8) & (df['satisfaction'] > 0.7)

        # .assign() builds a new frame so the caller's DataFrame is not mutated
        df = df.assign(
            underperformer=underperformer.astype(int),
            unhappy=unhappy.astype(int),
            overachiever=overachiever.astype(int),
        )

        # Create new dataframe with dummy features.
        # NOTE(review): get_dummies only emits columns for categories present
        # in `df`; confirm every batch contains all department/salary levels.
        df = pd.get_dummies(df, columns=['department', 'salary'])

        # Return augmented DataFrame
        return df

## <span style="color:RoyalBlue"> 5.4 - Jupyter Notebook</span>


#### Applying Your Model Class

If you keep your model in Jupyter Notebook, you can directly use the model class you defined earlier.

First, simply initialize an instance of it:

In [42]:
# Initialize an instance; __init__ unpickles the model stored at the given path
retention_model = EmployeeRetentionModel('final_model.pkl')

In [43]:
# predict_proba returns (frame_fed_to_model, probabilities); the frame is discarded via `_`.

# Predict raw data: the class cleans and engineers features itself
_, pred1 = retention_model.predict_proba(raw_data, clean=True, augment=True)

# Predict cleaned data: skip cleaning, still engineer features
_, pred2 = retention_model.predict_proba(cleaned_data, clean=False, augment=True)

# Predict cleaned and augmented data: skip both preprocessing steps
_, pred3 = retention_model.predict_proba(augmented_data, clean=False, augment=False)

#### Their predictions should all be equivalent.

In [44]:
# Should be True: all three entry points must yield identical probability arrays
np.array_equal(pred1, pred2) and np.array_equal(pred2, pred3)

True