In [None]:
import os
import glob
import pandas as pd
import numpy as np
import filenames
import csv
import json
import pickle
import joblib
from datetime import datetime, timedelta
from imblearn.over_sampling import SMOTE

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn import svm
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV


import plotly.express as px
import plotly.graph_objects as go

# Raise pandas' display limits so long and wide frames render in full in cell outputs.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Data

Use glob to collect all the CSV files in the raw data folder.

In [None]:
# Collect every profile CSV in the raw-data folder. The list is sorted because glob
# order is filesystem-dependent, and the later drop_duplicates(keep='last') on
# userid keeps whichever duplicate was concatenated last.
# NOTE(review): assumes profile_folder_path is a pathlib.Path-like object - confirm.
profile_files = sorted(filenames.profile_folder_path.glob("*.csv"))
if not profile_files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        f"No profile CSV files found in {filenames.profile_folder_path}"
    )

# Read each file into its own frame, then stack them under a fresh 0..n-1 index.
profile_appended_data = [pd.read_csv(csv_path) for csv_path in profile_files]
df = pd.concat(profile_appended_data, ignore_index=True)

#### Drop duplicate userid

In [None]:
# One row per userid: keep the last occurrence and renumber the index from 0.
df = df.drop_duplicates(subset='userid', keep='last', ignore_index=True)

#### Create Label for Followers

In [None]:
# Build the target label: 1 when the profile's username appears in the followers
# export, 0 otherwise.
fpath = filenames.followers_path
with open(fpath, newline='') as f:
    # Take the first column of every row (presumably the username - confirm against
    # the export format). Blank lines parse to [] and are skipped; indexing them
    # with [0] would raise IndexError.
    follower = [row[0] for row in csv.reader(f) if row]
df['is_follower'] = df['username'].isin(follower).astype(int)

##### Drop Variables 

In [None]:
# Columns removed from the profile frame before exploration and modelling.
columns_to_drop = [
    'userid', 'followed_by_viewer', 'igtvcount', 'blocked_by_viewer',
    'follows_viewer', 'has_blocked_viewer', 'has_requested_viewer', 'external_url',
    'is_verified', 'requested_by_viewer', 'profile_pic_url', 'similar_accounts',
    'business_category_name', 'biography', 'full_name',
]
df = df.drop(columns=columns_to_drop)


#### To share with others to try the code in this notebook

In [None]:
#df.to_csv("../../data/deidentified_profile_data.csv", index = False)

In [None]:
# Work on a copy so the exploration and modelling steps below do not modify `df`.
df1 = df.copy()

In [None]:
# Preview the first five rows of the working frame.
df1.head()

In [None]:
# (rows, columns) of the working frame.
df1.shape

#### Boxplot using plotly
* To do - add to the dash application

In [None]:
# Box plot of media count, drawn with every individual point beside the box.
media_box = go.Box(
    y=df1['mediacount'],
    name="Media Count",
    jitter=0.3,
    pointpos=-1.8,
    boxpoints='all',  # represent all points
    marker_color='rgb(7,40,89)',
    line_color='rgb(7,40,89)',
)

# Box plot of followees, drawing only the suspected outliers as points.
followees_marker = dict(
    color='rgb(8,81,156)',
    outliercolor='rgba(219, 64, 82, 0.6)',
    line=dict(
        outliercolor='rgba(219, 64, 82, 0.6)',
        outlierwidth=2,
    ),
)
followees_box = go.Box(
    y=df1['followees'],
    name="Followees",
    boxpoints='suspectedoutliers',  # only suspected outliers
    marker=followees_marker,
    line_color='rgb(8,81,156)',
)

fig = go.Figure(data=[media_box, followees_box])
fig.update_layout(title_text="Box Plot Media Count & Followees")
fig.show()


In [None]:
# Box plot of follower counts, showing the box only (no individual points).
followers_box = go.Box(
    y=df1['followers'],
    name="Followers",
    boxpoints=False,  # no data points
    marker_color='rgb(9,56,125)',
    line_color='rgb(9,56,125)',
)

fig = go.Figure(data=[followers_box])
fig.update_layout(title_text="Box Plot Followers")
fig.show()


In [None]:
# One box trace per count column, split on the x-axis by the is_follower label.
grouped_boxes = [
    ('mediacount', 'Media Count', '#3D9970'),
    ('followees', 'Followees', '#FF4136'),
    ('followers', 'Followers', '#FF851B'),
]

fig = go.Figure(data=[
    go.Box(
        y=df1[count_column],
        x=df1['is_follower'],
        name=trace_name,
        marker_color=trace_colour,
    )
    for count_column, trace_name, trace_colour in grouped_boxes
])

# boxmode='group' places the boxes of the different traces side by side for each x value.
fig.update_layout(yaxis_title='Count', boxmode='group')
fig.show()

In [None]:
# Class balance: number of rows for each is_follower value (1 = username found in
# the followers list, 0 = not found).
df1['is_follower'].value_counts()

#### Skewed and Imbalanced Data

### Shuffle the dataframe

In [None]:
# Shuffle the rows with a fixed seed so that Restart & Run All reproduces the same
# row order (an unseeded shuffle changes every downstream result between runs),
# then renumber the index from 0.
df1 = shuffle(df1, random_state=42).reset_index(drop=True)

In [None]:
# Recode boolean values as integers (False -> 0, True -> 1) across the whole frame.
df1 = df1.replace({False: 0, True: 1})

### Train Test Split

In [None]:
# Set Split Params
test_size = 0.33
random_state = 42

# Features still include `username` here; a later cell drops it from the model
# inputs while X_test keeps it so predictions can be joined back to accounts.
X = df1.drop(['is_follower'], axis = 1)
y = df1['is_follower'].values

# Stratify on the label so train and test keep the same follower / non-follower
# ratio. With an imbalanced target, an unstratified split can leave the test set
# with too few positives for F1 / recall / AUC to be meaningful.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, stratify=y
)


print(f'Train dataset size: {X_train.shape[0]} \n')
print(f'Test dataset size: {X_test.shape[0]}')
 

#### Drop username
Username is joined to predictions for batch inference

In [None]:
# `username` is an identifier, not a model input. X_test itself is left untouched
# so the usernames stay available; the model sees X_test_new instead.
X_train = X_train.drop(columns=['username'])
X_test_new = X_test.drop(columns=['username'])

In [None]:
# Feature columns before preprocessing (this frame still contains `username`).
X.columns

### Preprocessing

The best practice is to imagine you have deployed your model, and it is being used to predict things. Imagine a single test case is provided to your model for testing, or your model tends to predict an input after deployment. 

In this scenario, you only have a single input, and therefore it doesn't make sense to use it as fitting data for a Standard Scaler which gets this single instance as its training (fitting) data, because in this case, the output of the scaler would be different in terms of scale with every other input. 

Thus, you'd better train (fit) the Standard Scaler using the training set (i.e. after splitting) and transform the rest of data (including the validation set, your test set, and whatever data comes into your model after deployment) using the fitted Standard Scaler.

Moreover, in every stage of a machine learning project, you'd better use only the training data for fitting and training whatever you need (e.g. scalers, predictors, regressors, etc.) and leave the validation and test data only for validation and testing.

In the cross-validation case, it is better to fit the scaler inside each fold (on that fold's training portion only) and use it to transform the held-out portion. In practice it often doesn't make much difference, but you can test it.

#### Why Pipelines?
The machine learning workflow consists of many steps from data preparation (e.g., dealing with missing values, scaling/encoding, feature extraction). 

When first learning this workflow, we perform the data preparation one step at a time. This can become time consuming since we need to perform the preparation steps to both the training and testing data. 

Pipelines allow us to streamline this process by compiling the preparation steps while easing the task of model tuning and monitoring. Scikit-Learn’s Pipeline class provides a structure for applying a series of data transformations followed by an estimator 

[link](https://scikit-learn.org/stable/common_pitfalls.html)
[link](https://towardsdatascience.com/machine-learning-pipelines-with-scikit-learn-d43c32a6aa52)

In [None]:
# Column groups, listed in the order the transformer emits them:
# the encoded flag columns first, then the standardised count columns.
ordinal_selector = ['is_private', 'is_business_account', 'has_public_story']
num_selector = ['mediacount','followers','followees']

ordinal_processor = OrdinalEncoder()
num_processor = StandardScaler()

preprocess = make_column_transformer(
    (ordinal_processor, ordinal_selector),
    (num_processor, num_selector),
)

# Learn the encodings and scaling statistics from the training split only,
# then apply them to that same split.
X_train = preprocess.fit(X_train).transform(X_train)


In [None]:
# Transform the test set with the preprocessor already fitted on the training split
# (transform only - nothing is re-fitted on test data).
X_test_new = preprocess.transform(X_test_new)

In [None]:
# Number of rows in the transformed test set.
len(X_test_new)

### Remove outliers 

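Not all data is normal or normal enough to treat it as being drawn from a Gaussian distribution. A good statistic for summarizing a non-Gaussian distribution sample of data is the Interquartile Range, or IQR for short. The code below, however, does not use an IQR rule; it removes outliers with the Local Outlier Factor method described next.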

#### Unsupervised Outlier Detection using the Local Outlier Factor (LOF).

The anomaly score of each sample is called the Local Outlier Factor. It measures the local deviation of the density of a given sample with respect to its neighbors. 

It is local in that the anomaly score depends on how isolated the object is with respect to the surrounding neighborhood. 

More precisely, locality is given by k-nearest neighbors, whose distance is used to estimate the local density. 

By comparing the local density of a sample to the local densities of its neighbors, one can identify samples that have a substantially lower density than their neighbors. 

These are considered outliers.

In [None]:
# Score the (already preprocessed) training rows with Local Outlier Factor.
lof = LocalOutlierFactor()
inlier_rows = lof.fit_predict(X_train) != -1  # fit_predict labels outliers as -1
# Keep only the inlier rows, in the features and the labels alike.
X_train = X_train[inlier_rows, :]
y_train = y_train[inlier_rows]
# Report the shape of the reduced training set.
print(X_train.shape, y_train.shape)

### Imbalanced Dataset

Some common over-sampling and under-sampling techniques in imbalanced-learn are imblearn.over_sampling.RandomOverSampler, imblearn.under_sampling.RandomUnderSampler, and imblearn.over_sampling.SMOTE. 

For these libraries there is a nice parameter that allows the user to change the sampling ratio.

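For example, in SMOTE, to change the ratio you would input a dictionary that maps each class to the number of samples you want for it; each value must be greater than or equal to that class's current number of samples (since SMOTE is an over-sampling technique and can only add samples). 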

The reason I have found SMOTE to be a better fit for model performance is probably because with RandomOverSampler you are duplicating rows, which means the model can start to memorize the data rather than generalize to new data. 

SMOTE uses the K-Nearest-Neighbors algorithm to make "similar" data points to those under sampled ones.

It is not good practice to blindly use SMOTE, setting the ratio to its default (even class balance), because the model may overfit one or more of the minority classes (even though SMOTE is using nearest neighbors to make "similar" observations). 

In a similar way that you tune hyperparameters of a ML model you will tune the hyperparameters of the SMOTE algorithm, such as the ratio and/or knn. 

NOTE: It is vital that you do not use SMOTE on the full data set. You MUST use SMOTE on the training set only (after you split). Then validate on your val/test sets and see if your SMOTE model out performed your other model(s). 

If you do not do this there will be data leakage and your model is essentially cheating.

[link](https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/)


In [None]:
# Number of positive (is_follower == 1) training labels before SMOTE.
y_train.sum()

In [None]:
# Oversample the minority class of the training set only (the test set is never
# resampled). Each synthetic point is interpolated between a minority sample and
# one of its 7 nearest minority neighbours.
# The original n_jobs=8 argument is dropped: imbalanced-learn deprecated it and
# later releases no longer accept it. It only controlled parallelism, so the
# resampled data is unchanged for the fixed random_state.
sm = SMOTE(random_state=0, sampling_strategy='minority', k_neighbors=7)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
# Number of positive (is_follower == 1) training labels after SMOTE.
y_train.sum()

### Modelling

Training a model simply means learning (determining) good values for all the weights and the bias from labeled examples. 

In supervised learning, a machine learning algorithm builds a model by examining many examples and attempting to find a model that minimizes loss; this process is called empirical risk minimization.

Loss is the penalty for a bad prediction. That is, loss is a number indicating how bad the model's prediction was on a single example. If the model's prediction is perfect, the loss is zero; otherwise, the loss is greater. 

The goal of training a model is to find a set of weights and biases that have low loss, on average, across all examples.

##### Gaussian NB

In [None]:
# Gaussian Naive Bayes baseline.
gnb = GaussianNB()
# Fit on the training data
gnb_model = gnb.fit(X_train, y_train)
# Predict hard labels and positive-class probabilities on the testing data
predictions = gnb_model.predict(X_test_new)
probabilities = gnb_model.predict_proba(X_test_new)[:, 1]
# ROC-AUC measures ranking quality, so it is scored on the probabilities.
# Passing the thresholded 0/1 predictions reduces the curve to a single
# operating point and understates the model's AUC.
auc_nb = metrics.roc_auc_score(y_test, probabilities)
acc_nb = metrics.accuracy_score(y_test, predictions)
f1_nb = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_nb,4))
print('Accuracy', "%.4f" % round(acc_nb,4))
print('AUC Score', "%.4f" % round(auc_nb,4))

##### Logistic Regression

In [None]:
# Logistic regression baseline with default settings.
logreg = LogisticRegression()
# Fit on the training data
log_model = logreg.fit(X_train, y_train)
# Predict hard labels and positive-class probabilities on the testing data
predictions = log_model.predict(X_test_new)
probabilities = log_model.predict_proba(X_test_new)[:, 1]
# ROC-AUC measures ranking quality, so it is scored on the probabilities.
# Passing the thresholded 0/1 predictions reduces the curve to a single
# operating point and understates the model's AUC.
auc_log = metrics.roc_auc_score(y_test, probabilities)
acc_log = metrics.accuracy_score(y_test, predictions)
f1_log = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_log,4))
print('Accuracy', "%.4f" % round(acc_log,4))
print('AUC Score', "%.4f" % round(auc_log,4))

##### KNeighborsClassifier

In [None]:
# k-nearest-neighbours baseline (k = 4).
knn = KNeighborsClassifier(n_neighbors=4)
# Fit on the training data
knn_model = knn.fit(X_train, y_train)
# Predict hard labels and positive-class probabilities on the testing data
predictions = knn_model.predict(X_test_new)
probabilities = knn_model.predict_proba(X_test_new)[:, 1]
# ROC-AUC measures ranking quality, so it is scored on the probabilities.
# Passing the thresholded 0/1 predictions reduces the curve to a single
# operating point and understates the model's AUC.
auc_knn = metrics.roc_auc_score(y_test, probabilities)
acc_knn = metrics.accuracy_score(y_test, predictions)
f1_knn = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_knn,4))
print('Accuracy', "%.4f" % round(acc_knn,4))
print('AUC Score', "%.4f" % round(auc_knn,4))

##### RandomForestClassifier

In [None]:
# Random forest baseline. The seed is fixed so the model comparison is reproducible
# (an unseeded forest gives different scores on every run).
rf = RandomForestClassifier(random_state=42)
# Fit on the training data
rf_model = rf.fit(X_train, y_train)
# Predict hard labels and positive-class probabilities on the testing data
predictions = rf_model.predict(X_test_new)
probabilities = rf_model.predict_proba(X_test_new)[:, 1]
# ROC-AUC measures ranking quality, so it is scored on the probabilities.
# Passing the thresholded 0/1 predictions reduces the curve to a single
# operating point and understates the model's AUC.
auc_rf = metrics.roc_auc_score(y_test, probabilities)
acc_rf = metrics.accuracy_score(y_test, predictions)
f1_rf = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_rf,4))
print('Accuracy', "%.4f" % round(acc_rf,4))
print('AUC Score', "%.4f" % round(auc_rf,4))

#### HistGradientBoostingClassifier

[link](https://machinelearningmastery.com/histogram-based-gradient-boosting-ensembles/)

In [None]:
# Histogram-based gradient boosting baseline with default settings.
hist = HistGradientBoostingClassifier()
# Fit on the training data
hist_model = hist.fit(X_train, y_train)
# Predict hard labels and positive-class probabilities on the testing data
predictions = hist_model.predict(X_test_new)
probabilities = hist_model.predict_proba(X_test_new)[:, 1]
# ROC-AUC measures ranking quality, so it is scored on the probabilities.
# Passing the thresholded 0/1 predictions reduces the curve to a single
# operating point and understates the model's AUC.
auc_hist = metrics.roc_auc_score(y_test, probabilities)
acc_hist = metrics.accuracy_score(y_test, predictions)
f1_hist = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_hist,4))
print('Accuracy', "%.4f" % round(acc_hist,4))
print('AUC Score', "%.4f" % round(auc_hist,4))

### Comparison of 5 different Models

In [None]:
# Gather each metric across the five models. The order is the same in all three
# lists: naive Bayes, logistic regression, k-NN, random forest, hist gradient boosting.
f1=[f1_nb, f1_log, f1_knn, f1_rf, f1_hist]
acc=[acc_nb, acc_log, acc_knn, acc_rf, acc_hist]
auc=[auc_nb, auc_log, auc_knn, auc_rf, auc_hist]

# Helper that turns fractional scores into rounded percentages.
def rounder(metric):
    """Return each score in `metric` as a percentage rounded to one decimal place.

    `metric` is an iterable of numeric scores (e.g. 0.1234 -> 12.3); the result is
    a new list of floats in the same order.
    """
    return [round(float(score*100),1) for score in metric]

# Convert each of the three metric lists to rounded percentages.
f1_scores, acc_scores, auc_scores = (rounder(metric_list) for metric_list in (f1, acc, auc))
score_types=['F1 score', 'Accuracy', 'AUC score']

In [None]:
# Comparison of model metrics: one row per metric, one column per model.
index=['F1 score', 'Accuracy', 'AUC score']
models=['naive bayes', 'logistic regression', 'k-nearest neighbors', 'random forest', 'hist gradient boosting']
compare_models=pd.DataFrame(
    data=[f1_scores, acc_scores, auc_scores],
    index=index,
    columns=models,
)
compare_models

In [None]:
# save to csv, for later use by plotly dash app.
compare_models_path = '../../resources/compare_models.csv'
compare_models.to_csv(compare_models_path, index=True)
# Read the file back so the cell output shows what was stored.
pd.read_csv(compare_models_path, index_col=0)

In [None]:
# Let's display that with plotly: one bar trace per metric row, one bar per model.
metric_rows = ['F1 score', 'Accuracy', 'AUC score']
bar_colours = ['rgb(107,174,214)', 'rgba(219, 64, 82, 0.6)', 'rgb(7,40,89)']

fig = go.Figure(data=[
    go.Bar(
        x=compare_models.loc[metric_row].index,
        y=compare_models.loc[metric_row],
        name=compare_models.index[row_position],
        marker_color=bar_colour,
    )
    for row_position, (metric_row, bar_colour) in enumerate(zip(metric_rows, bar_colours))
])

fig.update_layout(
    title='Comparison of Possible Models',
    xaxis=dict(title='Predictive models'),  # x-axis label
    yaxis=dict(title='Score'),  # y-axis label
)

fig

##### Random Forest has the best performance

### Tuning Random Forest Using Grid Search

When creating a machine learning model, you'll be presented with design choices as to how to define your model architecture. Often times, we don't immediately know what the optimal model architecture should be for a given model, and thus we'd like to be able to explore a range of possibilities. 

In true machine learning fashion, we'll ideally ask the machine to perform this exploration and select the optimal model architecture automatically. 

Parameters which define the model architecture are referred to as hyperparameters and thus this process of searching for the ideal model architecture is referred to as hyperparameter tuning.

These hyperparameters might address model design questions such as:

* What degree of polynomial features should I use for my linear model?
* What should be the maximum depth allowed for my decision tree?
* What should be the minimum number of samples required at a leaf node in my decision tree?
* How many trees should I include in my random forest?
* How many neurons should I have in my neural network layer?
* How many layers should I have in my neural network?
* What should I set my learning rate to for gradient descent?

Hyperparameters are not model parameters and they cannot be directly trained from the data. Model parameters are learned during training when we optimize a loss function using something like gradient descent.

Whereas the model parameters specify how to transform the input data into the desired output, the hyperparameters define how our model is actually structured.

In general, this process includes:
1. Define a model
2. Define the range of possible values for all hyperparameters
3. Define a method for sampling hyperparameter values
4. Define an evaluative criteria to judge the model
5. Define a cross-validation method

#### Grid search
Grid search is arguably the most basic hyperparameter tuning method. With this technique, we simply build a model for each possible combination of all of the hyperparameter values provided, evaluating each model, and selecting the architecture which produces the best results.

* https://www.jeremyjordan.me/hyperparameter-tuning/


In [None]:
# Base estimator for the search; the seed keeps every candidate forest reproducible.
rfc = RandomForestClassifier(random_state=42)

# Hyperparameter grid. 'auto' has been removed from max_features: for classifiers
# it was only an alias of 'sqrt' (so it duplicated fits), and current scikit-learn
# releases reject it with an invalid-parameter error.
param_grid = {
    'n_estimators': [10, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}

# Create grid search using 5-fold cross validation.
# NOTE(review): X_train is already SMOTE-resampled here, so validation folds contain
# synthetic points interpolated from rows in other folds; CV scores may be optimistic.
grid_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=1, verbose=0)
grid_rfc.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination that scored best in cross-validation.
print(grid_rfc.best_params_)
# `model` is the fitted GridSearchCV object itself, not just its best_estimator_.
model = grid_rfc

### Predict on the testing data

After the model is built, testing data once again validates that it can make accurate predictions. If training and validation data include labels to monitor performance metrics of the model, the testing data should be unlabeled. Test data provides a final, real-world check of an unseen dataset to confirm that the ML algorithm was trained effectively.

* https://www.applause.com/blog/training-data-validation-data-vs-test-data

In [None]:
# Hard 0/1 labels for the held-out test set.
predictions=model.predict(X_test_new)
# Predicted probability of the positive class (column 1 of predict_proba).
probabilities = model.predict_proba(X_test_new)[:,1]

In [None]:
# Pickle the final model for use in the plotly dash app.
# The context manager closes the file even if pickling raises.
with open('../../resources/final_model.pkl', 'wb') as file:
    pickle.dump(model, file)

### Final Model Metrics

In [None]:
# Full list of metrics
def model_metrics(y_test, predictions, probabilities=None):
    '''
    Calculate six standard binary-classification metrics.

    Parameters
    ----------
    y_test : array-like of true 0/1 labels.
    predictions : array-like of predicted 0/1 labels.
    probabilities : array-like of positive-class scores, optional.
        When given, ROC-AUC is computed from these scores. When omitted, ROC-AUC
        falls back to the hard predictions (the previous behaviour), which only
        evaluates a single operating point.

    Returns
    -------
    dict with the keys 'precision', 'recall', 'f1 score', 'accuracy',
    'error rate' and 'ROC-AUC', each a fraction between 0 and 1.
    '''
    accuracy = metrics.accuracy_score(y_test, predictions)
    # ROC-AUC is a ranking metric, so prefer continuous scores when available.
    ranking_scores = predictions if probabilities is None else probabilities
    return {
        'precision': metrics.precision_score(y_test, predictions),
        'recall': metrics.recall_score(y_test, predictions),
        'f1 score': metrics.f1_score(y_test, predictions),
        'accuracy': accuracy,
        'error rate': 1 - accuracy,
        'ROC-AUC': metrics.roc_auc_score(y_test, ranking_scores),
    }

# Score the tuned model on the test set, using its probabilities for ROC-AUC.
eval_scores=model_metrics(y_test, predictions, probabilities)
eval_scores

In [None]:
# Express every metric as a percentage rounded to one decimal place.
y_vals = [round(float(score*100),1) for score in eval_scores.values()]
# Write over the previous dictionary with the rounded values.
eval_scores = dict(zip(eval_scores.keys(), y_vals))
print(eval_scores)

In [None]:
# Now save that dictionary to a pickle file, for later use in plotly dash app.
# The context manager closes the file even if pickling raises.
with open('../../resources/eval_scores.pkl', 'wb') as file:
    pickle.dump(eval_scores, file)

In [None]:
# And here's a reminder of how to read that back in again, just in case this is unfamiliar.
# The context manager closes the file even if unpickling raises. Only unpickle
# files you created yourself: pickle.load can execute arbitrary code.
with open('../../resources/eval_scores.pkl', 'rb') as file:
    evals = pickle.load(file)
evals

In [None]:
# (rows, columns) of the raw test frame; the row count is the test-set size.
X_test.shape

In [None]:
# Convert that into a visualization: one bar per evaluation metric.
fig = go.Figure()

fig.add_trace(go.Bar(
    x=list(evals.keys()),
    y=list(evals.values())
))

fig.update_traces(marker_color='rgb(107,174,214)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)

fig.update_layout(
    # The test-set size is taken from y_test rather than hard-coded, so the title
    # stays correct when the data or the split changes.
    title=f'Evaluation Metrics for Random Forest Model (Testing Dataset = {len(y_test)} profiles)',
    xaxis = {'title': 'Metrics'},
    yaxis = {'title': 'Percent'},

)

fig

#### Precision and Recall

Precision is the fraction of the returned hits that were true positives, i.e. how many of the items found were correct hits.
Recall is, literally, the fraction of the actual positives that were recalled (found), i.e. how many of the correct hits were found.


In [None]:
# ROC curve points for the tuned model, computed from the positive-class
# probabilities. The third return value (the thresholds) is discarded.
FPR, TPR, _ = roc_curve(y_test, probabilities)
#FPR

#### ROC curve
An ROC curve (receiver operating characteristic curve) is a graph showing the performance of a classification model at all classification thresholds. This curve plots two parameters:
* True Positive Rate
* False Positive Rate

True Positive Rate (TPR) is a synonym for recall.

An ROC curve plots TPR vs. FPR at different classification thresholds. Lowering the classification threshold classifies more items as positive, thus increasing both False Positives and True Positives. 

[link](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc#:~:text=An%20ROC%20curve%20(receiver%20operating,False%20Positive%20Rate)

In [None]:
# Area under the ROC curve as a percentage with one decimal place. It is scored on
# the probabilities: hard 0/1 predictions reduce the curve to one operating point
# and would not match the FPR/TPR curve computed above.
roc_score=round(100*roc_auc_score(y_test, probabilities),1)
roc_score

In [None]:
# Bundle what the dashboard needs for the ROC figure. Labels and predictions are
# cast to plain Python ints so the dictionary can be serialised as JSON.
roc_dict = dict(
    FPR=list(FPR),
    TPR=list(TPR),
    y_test=list(map(int, y_test)),
    predictions=list(map(int, predictions)),
)

In [None]:
# Save everything we need to reproduce the ROC-AUC figure in plotly dash.
with open('../../resources/roc_dict.json', 'w') as roc_out:
    json.dump(roc_dict, roc_out)

In [None]:
# Read the saved ROC inputs back in, the same way the dashboard would.
with open('../../resources/roc_dict.json') as roc_in:
    roc_dict = json.load(roc_in)

# From here on FPR, TPR and predictions are plain lists and y_test is a pandas
# Series (they were NumPy arrays before the JSON round trip).
FPR, TPR = roc_dict['FPR'], roc_dict['TPR']
predictions = roc_dict['predictions']
y_test = pd.Series(roc_dict['y_test'])

In [None]:
# ROC-AUC figure
# The legend value is the trapezoidal area under the plotted FPR/TPR points, so it
# always matches the curve. Scoring roc_auc_score on the hard 0/1 predictions
# instead describes a different, single-threshold curve.
roc_score=round(100*metrics.auc(FPR, TPR),1)
fig = go.Figure()

# Model curve.
fig.add_trace(go.Scatter(
        x=FPR,
        y=TPR,
        mode='lines',
        name=f'AUC: {roc_score}',
        marker_color='rgb(150,150,150)'
        ))
# Diagonal reference line (area 50).
fig.add_trace(go.Scatter(
        x=[0,1],
        y=[0,1],
        mode='lines',
        name='Baseline Area: 50.0',
        marker_color='rgb(37,37,37)'
        ))
fig.update_layout(
    title='Receiver Operating Characteristic (ROC): Area Under Curve',
    xaxis={'title': 'False Positive Rate (100-Specificity)','scaleratio': 1,'scaleanchor': 'y'},
    yaxis={'title': 'True Positive Rate (Sensitivity)'}
    )

fig.show()

#### Confusion Matrix

A confusion matrix is a summary of prediction results on a classification problem. The number of correct and incorrect predictions are summarized with count values and broken down by each class

In [None]:
# A confusion matrix tells us our false positives and false negatives.
# Rows are the actual classes and columns are the predicted classes.
# labels=[1, 0] puts the follower class (1) first so the counts line up with the
# captions below. With the default sorted order (0, 1), the "follower" captions
# would be attached to the non-follower counts.
matrix=confusion_matrix(y_test, predictions, labels=[1, 0])
print(matrix)
cm=pd.DataFrame(matrix, columns=['pred: follower', 'pred: non-follower'])
# The caption column is named after the test-set size.
cm[f'n={len(y_test)}']=['actual: follower', 'actual: non-follower']
# Move the caption column to the front.
cm=cm[[f'n={len(y_test)}', 'pred: follower', 'pred: non-follower']]
cm

In [None]:
# Save the cm dataframe to a CSV file, for later use in plotly dash app,
# then reload it so the displayed frame is exactly what was stored.
confusion_matrix_path = '../../resources/confusion_matrix.csv'
cm.to_csv(confusion_matrix_path, index=False)
cm = pd.read_csv(confusion_matrix_path)
cm

In [None]:
# Display the confusion matrix as a formatted table with Plotly
caption_column = f'n={len(y_test)}'
table_columns = [caption_column, 'pred: follower', 'pred: non-follower']

confusion_table = go.Table(
    header=dict(
        values=cm.columns,
        line=dict(color='rgb(150,150,150)'),
        fill=dict(color='rgb(150,150,150)'),
        align=['left'] * 5,
    ),
    cells=dict(
        values=[cm[column_name] for column_name in table_columns],
        line=dict(color='#7D7F80'),
        fill=dict(color='white'),
        align=['left'] * 5,
    ),
)

fig = go.Figure(data=[confusion_table])
fig.update_layout(title='Confusion Matrix: Random Forest Model (Testing Dataset)')

fig

### Random Forest Feature Importance

The feature importance (variable importance) describes which features are relevant. It can help with better understanding of the solved problem and sometimes lead to model improvements by employing the feature selection.

The Random Forest algorithm has built-in feature importance which can be computed in two ways:

##### Gini importance (or mean decrease impurity), which is computed from the Random Forest structure. 

Let’s look how the Random Forest is constructed. It is a set of Decision Trees. 

Each Decision Tree is a set of internal nodes and leaves. In each internal node, the selected feature is used to decide how to divide the data set into two separate sets with similar responses within each. 

The features for internal nodes are selected with some criterion, which for classification tasks can be Gini impurity or information gain, and for regression is variance reduction. We can measure how much each feature decreases the impurity of the split (the feature with the highest decrease is selected for the internal node). 

For each feature we can collect how on average it decreases the impurity. The average over all trees in the forest is the measure of the feature importance.

The biggest advantage of this method is its speed of computation - all needed values are computed during the Random Forest training. The drawback of the method is its tendency to prefer (select as important) numerical features and categorical features with high cardinality. What is more, in the case of correlated features it can select one of the features and neglect the importance of the second one. 

##### Mean Decrease Accuracy - is a method of computing the feature importance on permuted out-of-bag (OOB) samples based on mean decrease in the accuracy. 

This method is not implemented in the scikit-learn package. The very similar to this method is [permutation based importance](https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html#sklearn.inspection.permutation_importance).

The permutation based importance is computationally expensive. The permutation based method can have problem with highly-correlated features, it can report them as unimportant.

Feature Importance Computed with SHAP Values
This approach is model-agnostic and works well with algorithms not from scikit-learn: XGBoost, neural networks (Keras + TensorFlow), LightGBM, CatBoost. It can provide more information, such as decision plots or dependence plots.

[link](https://mljar.com/blog/feature-importance-in-random-forest/#:~:text=Random%20Forest%20Built%2Din%20Feature%20Importance&text=It%20is%20a%20set%20of,sets%20with%20similars%20responses%20within.)

In [None]:
# Impurity-based importances of the best forest found by the grid search, one value
# per column of the preprocessed feature matrix.
model.best_estimator_.feature_importances_

In [None]:
# Columns of the raw (un-transformed) test frame.
# NOTE(review): this frame still contains `username`, and its column order is not
# necessarily the order the preprocessor emits - confirm before pairing these names
# with the feature importances.
X_test.columns

In [None]:
# Feature importance (Random Forest)
# The forest was trained on the column transformer's output, whose columns are the
# encoded flag columns followed by the scaled count columns. The names must
# therefore come from the selectors in that order. X_test.columns still contains
# `username` and is in source-file order, so zipping it with the importances
# attaches them to the wrong features.
feature_names = list(ordinal_selector) + list(num_selector)
importances = model.best_estimator_.feature_importances_
if len(feature_names) != len(importances):
    raise ValueError(
        f'{len(feature_names)} feature names for {len(importances)} importances'
    )

coeffs1=pd.DataFrame({'feature': feature_names, 'coefficient': importances})
coeffs=coeffs1.sort_values(by='coefficient', ascending=False)

# Format the coefficients (two decimal places).
y_vals=[round(float(val),2) for val in coeffs['coefficient']]

coeffs['coefficient']=y_vals
coeffs

In [None]:
# Save the sorted feature-importance table to a csv file, for later use by plotly dash app.
coeffs.to_csv('../../resources/coefficients.csv', index=False)

In [None]:
# Let's display that with Plotly: one bar per feature, highest importance first.
fig = go.Figure()

fig.add_trace(go.Bar(
    x=coeffs['feature'],
    y=coeffs['coefficient']
))

fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)

fig.update_layout(
    # NOTE(review): the title states a fixed conclusion; re-check it against the
    # plotted values whenever the data or the model changes.
    title='Number of Followers is a good indication of becoming a follower than a business account ',
    xaxis = {'title': 'Instagram Features'},
    # The bars are impurity-based feature importances (they sum to 1), not odds.
    yaxis = {'title': 'Feature Importance'},

)

fig

In [None]:
# Print the length of each test-aligned object, one per line, so they can be compared.
for test_aligned in (probabilities, predictions, y_test, X_test):
    print(len(test_aligned))

### Merge usernames to probabilities

One of the most important things you can do before deploying a model is try to understand model drift in an offline environment. 

Data scientists should seek to answer the question "If I train a model using this set of features on data from six months ago, and I apply it to data that I generated today, how much worse is that model than one trained on data from a month ago and applied to today's data?". 

Performing this analysis offline allows you to estimate the rate at which a model’s performance falls off and how often you’ll need to retrain.


In [None]:
# Wrap the positive-class probabilities in a single-column frame.
probs = pd.DataFrame({'follower_probability': probabilities})
probs.shape

In [None]:
# True test labels as a single-column frame named 'actual'.
# NOTE(review): after the JSON reload cell, y_test is an unnamed pandas Series rather
# than a NumPy array; a *named* Series passed with columns=['actual'] would give an
# all-NaN column - confirm y_test stays unnamed.
actual_df = pd.DataFrame(y_test, columns=['actual'])

In [None]:
# Renumber the test frame from 0 so that it lines up row by row with the
# probability and label frames, which use a default 0..n-1 index.
X_test = X_test.reset_index(drop = True)
X_test.head(1)

In [None]:
# Check the dtype of the is_private column in the raw test frame.
X_test['is_private'].dtype

In [None]:
# Column-wise join of the raw test features, predicted probability and true label.
# pd.concat aligns on the index, so all three frames must share the same 0..n-1 index.
final = pd.concat([X_test, probs, actual_df], axis = 1)

In [None]:
# save to display in plotly dash app
# NOTE(review): `final` still carries the username column, so this file holds
# account identifiers - confirm that is acceptable before sharing it.
final.to_csv('../../resources/final_probs.csv', index=False)

In [None]:
# Show a single example record (index label 5) without its predicted probability.
mydata = final.drop(columns=['follower_probability'])
example_row = list(mydata.loc[5])

example_table = go.Table(
    header=dict(values=list(mydata.columns)),
    cells=dict(values=example_row),
)
fig = go.Figure(data=[example_table])
fig

### Train the selected model on whole dataset before putting into production

Theoretically, the more data your deployed model has seen, the better it should generalise. So if you trained the model on the full set of data you have available, it should generalise better than a model which only saw for example train/val sets (e.g. ~ 90%) from the full data set.

For a given model (a functional form), changing the sample size will only affect $\operatorname{Var}(\hat{f}(x_0))$; namely, increasing the sample will diminish it. Meanwhile, $\operatorname{Bias}^2(\hat{f}(x_0))$ will stay the same as the functional form $\hat{f}(\cdot)$ is fixed. (Clearly, the irreducible error also stays the same.)

You reduce the expected squared error by re-estimating the chosen model on the full sample, as compared to having estimated it on just the training sample.

Unless you're limiting yourself to a simple class of convex models/loss functions, you're considerably better off keeping a final test split.

[link](https://stats.stackexchange.com/questions/225820/is-it-needed-to-train-the-selected-model-again-on-entire-data-before-putting-in)

[link](https://datascience.stackexchange.com/questions/33008/is-it-always-better-to-use-the-whole-dataset-to-train-the-final-model)

In [None]:
# All labelled rows, without the identifier column, for the final refit.
X_all = X.drop(columns='username')

#### Preprocess on all the data

In [None]:
# Re-fit the preprocessor on every labelled row and transform them. From this cell
# on, `preprocess` holds statistics learned from the full data set, not just the
# training split.
X_full = preprocess.fit_transform(X_all)

In [None]:
# Save the fitted preprocessor so that single data points or batches can be
# transformed the same way later on.
joblib.dump(preprocess, "../../resources/preprocess.joblib")

#### Fit model on all the data

In [None]:
# Fit `model` on the preprocessed full dataset with labels `y`.
# NOTE(review): no save of this refit model is visible in this part of the
# notebook, while a later cell loads final_model.pkl from disk — confirm that
# file reflects this fit.
model.fit(X_full, y)

### Predictions

Machine learning prediction and inference are two different aspects of machine learning. 

Prediction is the ability to accurately predict a response variable while inference deals with understanding the relationship between predictor variables and response variables. The difference in prediction vs inference models can be seen in examples such as predicting marketing campaign success or understanding how media influences sales on promotions, etc. 

Inference: You want to find out what effect Age, Passenger Class, and Gender have on surviving the Titanic disaster. You can fit a logistic regression and infer the effect each passenger characteristic has on survival rates.

Prediction: Given some information on a Titanic passenger, you want to choose from the set {lives,dies} and be correct as often as possible. 

You may use linear regression for an inference model while non-linear methods work best when prediction is your objective.

[link](https://stats.stackexchange.com/questions/244017/what-is-the-difference-between-prediction-and-inference)
[link](https://cloud.google.com/ai-platform/prediction/docs/overview)

In [None]:
# Collect every CSV in the "others" profile folder. The folder object's glob
# (presumably pathlib.Path.glob — confirm) yields matches in arbitrary,
# filesystem-dependent order, so sort them to keep the row order of
# df_prediction reproducible between runs. Sorting also turns the one-shot
# generator into a list that can be inspected or reused.
prediction_files = sorted(filenames.others_profile_folder_path.glob("*.csv"))

# Read each file into its own frame.
prediction_appended_data = [pd.read_csv(csv_path) for csv_path in prediction_files]

# Stack all frames row-wise with a fresh 0..n-1 index (ignore_index=True), which
# later column-wise concats rely on. pd.concat raises ValueError if the folder
# contained no CSV files.
df_prediction = pd.concat(prediction_appended_data, ignore_index=True)

In [None]:
# (rows, columns) of the combined prediction frame.
df_prediction.shape

In [None]:
#### Drop variables

In [None]:
# Columns removed before the batch frame is passed to the preprocessing step.
unused_columns = [
    'userid', 'username', 'followed_by_viewer', 'igtvcount', 'blocked_by_viewer',
    'follows_viewer', 'has_blocked_viewer', 'has_requested_viewer', 'external_url',
    'is_verified', 'requested_by_viewer', 'profile_pic_url', 'similar_accounts',
    'business_category_name', 'biography', 'full_name',
]
dfi2 = df_prediction.drop(columns=unused_columns)

In [None]:
# Preview the first rows of the reduced frame to check which columns remain.
dfi2.head()

#### Load preprocess

In [None]:
# Reload the preprocessing object from the same path it was dumped to earlier
# in this notebook; this rebinds the `preprocess` name.
preprocess = joblib.load('../../resources/preprocess.joblib')

### Online versus batch prediction

The needs of your application dictate the type of prediction you should use.

You should generally use online prediction (sometimes called HTTP prediction) when you are making requests in response to application input or in other situations where timely inference is needed.

Batch prediction is ideal for processing accumulated data when you don't need immediate results. For example a periodic job that gets predictions for all data collected since the last job.

In both cases, you pass input data to a cloud-hosted machine-learning model and get inferences for each data instance. The differences are shown as follows:

#### Online prediction	
* Optimized to minimize the latency of serving predictions.	
* Can process one or more instances per request.	
* Predictions returned in the response message.	
* Input data passed directly as a JSON string.	
* Returns as soon as possible.	


#### Batch prediction

* Optimized to handle a high volume of instances in a job and to run more complex models.
* Can process one or more instances per request.
* Predictions written to output files in a Cloud Storage location that you specify.
* Input data passed indirectly as one or more URIs of files in Cloud Storage locations.
* Asynchronous request.

#### Online Prediction

In [None]:
# Feature names and the matching values for one hand-written example profile.
keys = ['mediacount', 'followers', 'followees', 'is_private', 'is_business_account', 'has_public_story']
inputs = [300, 1200, 500, 0, 0, 1]

# Pair each feature name with its value, then wrap the mapping in a one-row frame.
dict6 = {name: value for name, value in zip(keys, inputs)}
test = pd.DataFrame([dict6])

In [None]:
# Display the one-row example frame.
test

In [None]:
# Column order handed to the loaded preprocessor
# (presumably the order it was fitted with — TODO confirm).
feature_order = ['is_private', 'mediacount', 'followers', 'followees', 'is_business_account', 'has_public_story']
test = test.loc[:, feature_order]

# Apply the already-fitted transformation; nothing is refit here.
test_array = preprocess.transform(test)

In [None]:
# Load the persisted final model from disk.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load artifacts that this project wrote itself.
# NOTE(review): this rebinds `model`, replacing the in-memory estimator that was
# refit on the full dataset above — confirm final_model.pkl was written after
# that refit.
# The context manager closes the file handle even if unpickling raises.
with open('../../resources/final_model.pkl', 'rb') as file:
    model = pickle.load(file)

In [None]:
# Class probabilities for the single example profile.
model.predict_proba(test_array)

In [None]:
# Predicted class label for the single example profile.
model.predict(test_array)

#### Prediction for 1 profile to become a follower

### Batch Predictions

**Batch prediction latency**

If you use a simple model and a small set of input instances, you'll find that there is a considerable difference between how long it takes to finish identical prediction requests using online versus batch prediction.

It might take a batch job several minutes to complete predictions that are returned almost instantly by an online request. This is a side-effect of the different infrastructure used by the two methods of prediction. 

Sagemaker allocates and initializes resources for a batch prediction job when you send the request. Online prediction is typically ready to process at the time of request.

In [None]:
# Transform the whole batch with the loaded preprocessor (transform only, no refit).
Xi = preprocess.transform(dfi2)

In [None]:
# Predicted class label for every row of the batch.
predictions_batch = model.predict(Xi)

# Keep column index 1 of the predict_proba output (presumably the positive /
# "follower" class — confirm against model.classes_).
class_probabilities = model.predict_proba(Xi)
probabilities_batch = class_probabilities[:, 1]

In [None]:
#probabilities_batch

### List of profiles to follow

In [None]:
# Run the classifier on the batch and wrap the labels in a frame; no column
# names are given, so the label column is addressed as 0 further down.
batch_labels = model.predict(Xi)
contact = pd.DataFrame(batch_labels)

In [None]:
# Put each username next to its predicted label; the column-wise concat aligns
# the two inputs on their row index.
predicted_profiles = pd.concat(
    [df_prediction['username'], contact],
    axis='columns',
)

In [None]:
# Keep only the rows whose predicted label (column 0) equals 1.
label_is_one = predicted_profiles[0].eq(1)
follow_profiles = predicted_profiles.loc[label_is_one]

In [None]:
# Display the profiles whose predicted label is 1.
follow_profiles

### Data for Model Retraining

Model retraining should not result in a different model generating process. Rather retraining simply refers to re-running the process that generated the previously selected model on a new training set of data. The features, model algorithm, and hyperparameter search space should all remain the same. One way to think about this is that retraining doesn’t involve any code changes. It only involves changing the training data set.

Rather than deploying a model once and moving on to another project, machine learning practitioners need to retrain their models if they find that the data distributions have deviated significantly from those of the original training set. 

While the frequency of retraining will vary from problem-to-problem, ML engineers can start with a simple strategy that retrains models on a periodic basis as new data arrives and evolve to more complex processes that quantify and react to model drift.

This concept, known as model drift, can be mitigated but involves additional overhead in the form of monitoring infrastructure, oversight, and process.

Monitor model — Test your deployment to ensure that your model is still performing as expected on test data with respect to your evaluation metrics and things like inference speed.

Evaluate new data — Using a model in production means you will frequently pass brand new data through the model that it has never been tested on. It’s important to perform evaluation and dig into specific samples to see how your model performs on any new data it encounters.

Continue understanding model — Some errors and biases in your model can be deep-seated and take a long time to uncover. You need to continuously test and probe your model for various edge cases and trends that could cause problems if they were to be discovered by clients instead.

Expand capabilities — Even if everything is working perfectly, it’s possible that the model isn’t increasing profits as much as you hoped. From adding new classes and developing new data streams to making the model more efficient, there are countless ways to expand the capabilities of your current model to make it even better. Any time you want to improve your system, you will need to restart the ML lifecycle to update your data and model, and evaluate it all to make sure your new features work as expected.

[link](https://mlinproduction.com/model-retraining/#:~:text=Rather%20than%20deploying%20a%20model,of%20the%20original%20training%20set.)

In [None]:
# Select the raw prediction rows whose username appears in follow_profiles.
selected_usernames = follow_profiles['username']
retrain_mask = df_prediction['username'].isin(selected_usernames)
retrain_data = df_prediction.loc[retrain_mask]

In [None]:
# Output name = profile path prefix + timestamp + extension.
ext = ".csv"
# The timestamp uses a 12-hour clock (%I) followed by an AM/PM marker (%p).
file_time = datetime.now().strftime("%Y-%m-%d_%I-%M-%S_%p")
profile_name_path = "".join([str(filenames.profile_path), file_time, ext])

In [None]:
# Write the selected rows to the timestamped CSV path, omitting the index column.
retrain_data.to_csv(profile_name_path, index = False)