### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import numpy as np
import pandas as pd
import json
import codecs
import datetime
import os
import sys
sys.path.append('../')

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (24, 16)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [27]:
from sklearn.model_selection import train_test_split

In [39]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [18]:
def csv_to_df(path):
    x = pd.read_csv(path)
    x.rename(columns={'Unnamed: 0': 'description'}, inplace=True)
    x = x.set_index('description')
    return x
X = csv_to_df('csv_files/ModelFiles/photogallery_with_tag_for_xgboost.csv')

In [8]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [9]:
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [20]:
feature_lst = list(X.columns)
lbls = np.array(X['User_Register'])

In [21]:
data = X.drop('User_Register', axis = 1)

In [23]:
dat = np.array(data)

In [28]:
# Split the data into training and testing sets
train_data, test_data, train_lbls, test_lbls = train_test_split(dat, lbls, test_size = 0.25, random_state = 42)

In [43]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    #errors = abs(predictions - y_test)
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    class_report = classification_report(y_test, predictions)
    #print('Model Performance')
    #print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    #print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy, conf_matrix, class_report

In [36]:
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_data, train_lbls)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 24.4min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [37]:
best_random = rf_random.best_estimator_
best_random

RandomForestClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                       n_estimators=200)

In [44]:
best_random = rf_random.best_estimator_
random_accuracy, confusion_mat, classification_report = evaluate(best_random, test_data, test_lbls)
print('Accuracy = {:0.2f}%.'.format(random_accuracy))

Accuracy = 0.63%.


In [47]:
'''
C[0,0] - true negative (169)
C[1,0] - false negative (90)
C[0,1] - false positive (12)
C[1,1] - true positive (8)
'''
print(confusion_mat)

[[169  12]
 [ 90   8]]


# prcision = correct_identify / total_identify    
      example for positive is :   true_positive / (true_positive + true_negative)
# recall = correct_identify / total_class_member
      example for positive is :   true_positive / (true_positive + false_negative)
# f1-score = good classifier estimation

In [48]:
print(classification_report)

              precision    recall  f1-score   support

         0.0       0.65      0.93      0.77       181
         1.0       0.40      0.08      0.14        98

    accuracy                           0.63       279
   macro avg       0.53      0.51      0.45       279
weighted avg       0.56      0.63      0.55       279



In [50]:
def feature_importence(rf, feature_list):
    # Get numerical feature importances
    importances = list(rf.feature_importances_)
    # List of tuples with variable and importance
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
    # Sort the feature importances by most important first
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
    # Print out the feature and importances 
    [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

In [None]:
rf_random.

In [54]:
feature_importence(best_random, feature_lst)

Variable: mean number of photos in burst at night Importance: 0.05
Variable: ar of num of photos in weekdays lag-1 Importance: 0.05
Variable: mean number of photos in burst Importance: 0.04
Variable: mean number of bursts a week Importance: 0.04
Variable: entropy on number of photos taken in night time Importance: 0.04
Variable: ar of num of photos in night time lag-1 Importance: 0.04
Variable: mean number of bursts a week nights Importance: 0.04
Variable: entropy on number of photos taken in day time Importance: 0.04
Variable: ar of num of photos in day time lag-1 Importance: 0.04
Variable: mean number of photos in burst at daytime Importance: 0.04
Variable: mean number of bursts a week daytime Importance: 0.04
Variable: sampled weekends ratio Importance: 0.04
Variable: entropy on number of photos taken in weekend Importance: 0.04
Variable: mean number of photos in burst at weekend Importance: 0.04
Variable: mean number of bursts in weekend Importance: 0.04
Variable: entropy on number