### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import numpy as np
import pandas as pd
import json
import codecs
import datetime
import os
import sys
sys.path.append('../')

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (24, 16)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [32]:
def csv_to_df(path):
    x = pd.read_csv(path)
    x.rename(columns={'Unnamed: 0': 'description'}, inplace=True)
    x = x.set_index('description')
    return x
X = csv_to_df('../csv_files/ModelFiles/photogallery_installedapss_with_cluster_labels_for_ml_model.csv')

In [33]:
X.head(5)

Unnamed: 0_level_0,median number photos taken in day - location,mad number photos taken in day - scale,mean number photos taken in day,sample non zero days ratio,mean number of photos in burst,mean number of bursts a week,median night time number photos - location,mad night time number photos - scale,mean night time number photos,sampled non zero nights ratio,...,ratio of weekend and full week mean photos in burst,ratio of weekend and full week bursts a week,ar of num of photos in day time lag-1,entropy on number of photos taken in day,mean max ratio of category of installed apps,median max ratio of category of installed apps,app category coverage ratio,entropy of app categories,entropy of install app time of day,labels
description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
005b6c49-c4ed-438b-9092-ad02ed4d672e,2.0,1.482602,5.371968,0.087831,1.300914,0.845481,2.0,1.482602,5.2,0.068655,...,1.001396,0.427252,17.050272,7.228774,0.416667,0.333333,0.214286,3.180833,2.689246,-1
005d6e1d-7a19-422f-a71a-1555b6ca5724,1.0,0.0,1.5625,0.101106,1.111111,0.25,1.0,0.0,1.390244,0.065916,...,0.99,8.0,0.503928,5.536507,0.466667,0.333333,0.267857,3.419382,1.976648,-1
007a0785-1321-4112-be89-e6549a4725a2,3.0,2.965204,26.128205,0.040902,2.791781,1.641026,3.0,2.965204,9.958333,0.026287,...,0.958653,0.377574,34.620526,4.114481,0.666667,0.5,0.214286,3.235926,2.358459,-1
008e4a51-2ded-4a7f-a822-e1a4756d7ebd,2.0,1.482602,2.25,0.153191,1.285714,0.485714,2.0,1.482602,2.157895,0.080851,...,0.777778,-2.058824,2.886932,4.880111,0.261905,0.125,0.375,3.875725,2.623889,0
00c86b27-f076-4560-bb3d-2fb0364b23a1,2.0,1.482602,2.625,0.096386,1.363636,0.283333,2.0,1.482602,2.461538,0.066667,...,0.782222,3.529412,5.491979,4.94017,0.336842,0.2,0.339286,3.783465,2.475188,-1


In [34]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [35]:
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [36]:
feature_lst = list(X.columns)
lbls = np.array(X['labels'])

In [37]:
data = X.drop('labels', axis = 1)

In [38]:
dat = np.array(data)

In [39]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dat, lbls, test_size = 0.25, random_state = 42)

In [43]:
X_train.shape

(1499, 47)

In [52]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    #errors = abs(predictions - y_test)
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    #class_report = classification_report(y_test, predictions)
    #print('Model Performance')
    #print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    #print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy, conf_matrix #, class_report

In [45]:
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 21.5min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [50]:
best_random = rf_random.best_estimator_
best_random

RandomForestClassifier(bootstrap=False, max_features='sqrt', n_estimators=400)

In [54]:
random_accuracy, confusion_mat = evaluate(best_random, X_test, y_test)
print('Accuracy = {:0.2f}%.'.format(random_accuracy))

Accuracy = 0.91%.


In [55]:
'''
confusion matrix :
c[0,0] c[0,1]
c[1,0] c[1,1]

C[0,0] - true negative 
C[1,0] - false negative
C[0,1] - false positive
C[1,1] - true positive 
'''
print(confusion_mat)

[[144   6   4   0   0]
 [  2 123   5   0   0]
 [ 16   4 108   0   0]
 [  0   0   2  56   0]
 [  0   2   2   0  26]]


# prcision = correct_identify / total_identify    
      example for positive is :   true_positive / (true_positive + true_negative)
# recall = correct_identify / total_class_member
      example for positive is :   true_positive / (true_positive + false_negative)
# f1-score = good classifier estimation

In [29]:
print(classification_report)

              precision    recall  f1-score   support

         0.0       0.66      0.97      0.78       314
         1.0       0.47      0.05      0.09       167

    accuracy                           0.65       481
   macro avg       0.56      0.51      0.44       481
weighted avg       0.59      0.65      0.54       481



In [56]:
def feature_importence(rf, feature_list):
    # Get numerical feature importances
    importances = list(rf.feature_importances_)
    # List of tuples with variable and importance
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
    # Sort the feature importances by most important first
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
    # Print out the feature and importances 
    [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

In [57]:
feature_importence(best_random, feature_lst)

Variable: ar of num of photos in day time lag-1 Importance: 0.15
Variable: mean number photos taken in day Importance: 0.11
Variable: mean work days number photos Importance: 0.06
Variable: median number photos taken in day - location Importance: 0.03
Variable: sample non zero days ratio Importance: 0.03
Variable: mean number of bursts a week Importance: 0.03
Variable: mean day time number photos Importance: 0.03
Variable: mean weekend number photos Importance: 0.03
Variable: mad number photos taken in day - scale Importance: 0.02
Variable: mean number of photos in burst Importance: 0.02
Variable: mean night time number photos Importance: 0.02
Variable: median work days number photos - location Importance: 0.02
Variable: sampled weekdays ratio Importance: 0.02
Variable: mean number of bursts in weekdays Importance: 0.02
Variable: ratio of week work days and week daily mean number of photos Importance: 0.02
Variable: ratio of weekend and full week bursts a week Importance: 0.02
Variable

In [58]:
Y = csv_to_df('../csv_files/ModelFiles/photogallery_installedapss_with_target_for_ml_model.csv')

In [63]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [60]:
Y.columns

Index(['median number photos taken in day - location',
       'mad number photos taken in day - scale',
       'mean number photos taken in day', 'sample non zero days ratio',
       'mean number of photos in burst', 'mean number of bursts a week',
       'median night time number photos - location',
       'mad night time number photos - scale', 'mean night time number photos',
       'sampled non zero nights ratio',
       'mean number of photos in burst at night',
       'mean number of bursts a week nights',
       'median day time number photos - location',
       'mad day time number photos - scale', 'mean day time number photos',
       'sampled non zero days ratio',
       'mean number of photos in burst at daytime',
       'mean number of bursts a week daytime',
       'median weekend number photos - location',
       'mad weekend number photos - scale', 'mean weekend number photos',
       'sampled non zero weekends ratio',
       'mean number of photos in burst at weekend',


In [62]:
target = np.array(Y['User_Register'])
data = Y.drop('User_Register', axis = 1)
dat = np.array(data)

In [64]:
scaler = MinMaxScaler()
S = scaler.fit_transform(dat)
X_tr, X_te, y_tr, y_te = train_test_split(S, target, test_size = 0.2, random_state = 42)

In [66]:
X_te.shape

(385, 47)

In [67]:
logmodel = LogisticRegression(max_iter=10000)
logmodel.fit(X_tr, y_tr)
pred = logmodel.predict(X_te)

In [70]:
pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.