<div style="font-variant: small-caps; 
  font-weight: normal; 
  font-size: 37px; 
  text-align: center; 
  padding: 15px; 
  margin: 10px;
  color: #ff69b4;">
  <span style="color: #ff69b4;">Machine Learning Project</span>
  </div> 

<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
  -<br>
  </div> 
  
<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px;
      color: #ff69b4;
      margin: 10px;">
      Selection of the best model <br><br>
  </div> 


<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 20px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      MC 
  </div>

## Table of Contents
1. [Final format of df](#final-format-of-df)
2. [Settings](#settings)
3. [RandomForestClassifier](#randomforestclassifier)
4. [CalibratedClassifierCV](#calibratedclassifiercv)
5. [DecisionTreeClassifier](#decisiontreeclassifier)
6. [Calibrated_tree_classifier](#calibrated_tree_classifier)
7. [Logistic Regression](#logistic-regression)
8. [Calibred Logistic Regression](#calibred-logistic-regression)



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV


from maincode.custom_functions import *
from eclyon.transforms import process_df, split_vals

import math


In [2]:
def metrics_result(y_true, y_pred):
    acc = np.mean(y_true == y_pred)
    rec = np.sum((y_true == 1) & (y_pred == 1)) / np.sum(y_true == 1)
    prc = np.sum((y_true == 1) & (y_pred == 1)) / np.sum(y_pred == 1)
    spe = np.sum((y_true == 0) & (y_pred == 0)) / np.sum(y_true == 0)
    f1 = 2 * (prc * rec) / (prc + rec)

    print('Accuracy : {:.2f}%'.format(acc * 100))
    print('Recall : {:.2f}%'.format(rec * 100))
    print('Precision : {:.2f}%'.format(prc * 100))
    print('Specificity : {:.2f}%'.format(spe * 100))
    print('F1-score : {:.2f}%'.format(f1 * 100))

def precision_recall_curve_with_threshold(targets, probas, threshold = 0.5):
    
    def sort_pair(X, Y):
        rangee = [(x, y) for x, y in sorted(zip(X, Y), key = lambda pair: pair[0])]
        return [pair[0] for pair in rangee], [pair[1] for pair in rangee]
    
    def precision_recall_curve(targets, probas):
        thresholds = sorted(set(probas), reverse=True)
        precision = []
        recall = []
        for t in thresholds:
            tp = sum((p >= t) and (tgt == 1) for p, tgt in zip(probas, targets))
            fp = sum((p >= t) and (tgt == 0) for p, tgt in zip(probas, targets))
            fn = sum((p < t) and (tgt == 1) for p, tgt in zip(probas, targets))
            precision.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
            recall.append(tp / (tp + fn) if (tp + fn) > 0 else 0)
        return precision, recall, thresholds
    
    def auc(x, y):
        return sum((x[i] - x[i-1]) * (y[i] + y[i-1]) / 2 for i in range(1, len(x)))
    
    # --- main
    targets, probas = sort_pair(targets, probas)
    precision, recall, thresholds = precision_recall_curve(targets, probas)
    
    # calculate precision-recall AUC
    auc_val = auc(recall, precision)
    print('Area Under Curve (AUC) : {:.3f}'.format(auc_val))
    # plot no skill
    if threshold is not None : 
        index = len([y for y in thresholds if y <= threshold]) # thresholds are sorted

        print('Recall with threshold = {}: {:.2f}%'.format(threshold, recall[index]*100))
        print('Precision with threshold = {} : {:.2f}%'.format(threshold, precision[index]*100))

        plt.plot([0, 1], [precision[index], precision[index]], linestyle='--', color = 'grey')
        plt.plot([recall[index], recall[index]], [0, 1], linestyle='--', color = 'grey')
        
        # plot the precision-recall curve for the model
        plt.plot(recall[:index], precision[:index], color = 'navy', marker='.')
        plt.plot(recall[index:], precision[index:], color = 'orange', marker='.')
        
        # show the plot
        plt.title('Precision-Recall curve with threshold = {}'.format(threshold))
        plt.xlabel('Recall $P(\, \widehat{1}\, | 1)$')
        plt.ylabel('Precision $P(\, 1\, | \widehat{1})$                                ', rotation = 'horizontal')
    else :
        # plot the precision-recall curve for the model
        plt.plot(recall, precision, color = 'orange', marker='.')
        
        # show the plot
        plt.title('Precision-Recall curve')
        plt.xlabel('Recall $P(\, \widehat{1}\, | 1)$')
        plt.ylabel('Precision $P(\, 1\, | \widehat{1})$                                ', rotation = 'horizontal')

    plt.show()
    return

def rmse(y_gold, y_train): 
    return math.sqrt(((y_gold - y_train)**2).mean())


def print_score(m, X_train, y_train, X_valid, y_valid):
    print('RMSE on train set: {:.4f}'.format(rmse(m.predict(X_train), y_train)))
    print('RMSE on valid set: {:.4f}'.format(rmse(m.predict(X_valid), y_valid)))
    print('R^2 on train set: {:.4f}'.format(m.score(X_train, y_train)))
    print('R^2 on valid set: {:.4f}'.format(m.score(X_valid, y_valid)))
    if hasattr(m, 'oob_score_'): print('R^2 on oob set: {:.4f}'.format(m.oob_score_))
    return


def specificity_score(y_valid, y_train):
    y_predict_0  = [(pred, true) for pred, true in zip(y_train, y_valid) if true == 0]
    y_predict_00 = [(pred, true) for pred, true in y_predict_0 if pred == 0]
    specificity  = (0 if len(y_predict_0) == 0 else len(y_predict_00)/len(y_predict_0))
    return specificity

def metrics_results(y_true, y_predict_valid):
    acc = metrics.accuracy_score(y_true, y_predict_valid)
    rec = metrics.recall_score(y_true, y_predict_valid)
    prc = metrics.precision_score(y_true, y_predict_valid)
    spe = specificity_score(y_true, y_predict_valid)
    f1  = metrics.f1_score(y_true, y_predict_valid)

    print('Accuracy : {:.2f}%'.format(acc*100))
    print('Recall : {:.2f}%'.format(rec*100))
    print('Precision : {:.2f}%'.format(prc*100))
    print('Specificity : {:.2f}%'.format(spe*100))
    print('F1-score : {:.2f}%'.format(f1*100))

    


# Final format of df 

In [3]:
df_clean = pd.read_csv('c:/Users/Melissa/Bureau/Bsc 2/ML/BITCOIN/ML-project/data/processed/clean_df.csv')
df_selected_17 = pd.read_csv('c:/Users/Melissa/Bureau/Bsc 2/ML/BITCOIN/ML-project/data/processed/selected_columns_17.csv')


In [4]:
df = pd.merge(df_selected_17, df_clean[['close']], left_index=True, right_index=True)



In [5]:
# Calculate relative evolution
df["Relative_Evolution"] = df["Evolution"] / df["close"]

# Calculate the moving average of the relative evolution
df['R_E_5'] = df['Relative_Evolution'].rolling(window=5, min_periods=1).mean()

# Transform R_E_5 in binary 
q25 = df["R_E_5"].quantile(0.25)
q75 = df["R_E_5"].quantile(0.75)

df["B-Evolution"] = np.where(df["R_E_5"] < q25, 0, np.where(df["R_E_5"] > q75, 2, 1))


df = df.drop(columns=['close'])

df["B-Evolution"]


0         0
1         0
2         1
3         0
4         1
         ..
416620    1
416621    1
416622    1
416623    0
416624    0
Name: B-Evolution, Length: 416625, dtype: int64

# Settings

In [6]:
df, y, nas = process_df(df, 'B-Evolution')

In [7]:
n_total = len(df)
# Calculate the number of validation samples
n_valid = int(n_total * 0.1)

# Calculate the number of training samples
n_train = n_total - n_valid

# Print the total number of samples, training samples, and validation samples
print(n_total, n_train, n_valid)

# Split the DataFrame into training and validation sets
X_train, X_valid = split_vals(df, n_train)
y_train, y_valid = split_vals(y, n_train)

# Select the first 10,000 samples from the training set for X and y
_, X_small = split_vals(X_train, n_train-10000)
_, y_small = split_vals(y_train, n_train-10000)


416625 374963 41662


# RandomForestClassifier

In [26]:
classifier = RandomForestClassifier(
    n_estimators = 20, 
    class_weight = None, # classifier specific
    criterion = 'gini',  # classifier specific
    max_depth = 5, 
    min_samples_split = 2, 
    min_samples_leaf = 1, 
    min_weight_fraction_leaf = 0.0, 
    max_features = None, 
    max_leaf_nodes = None, 
    min_impurity_decrease = 0.0, 
    ccp_alpha = 0.0, 
    random_state = 42, 
    bootstrap = True, 
    oob_score = True, 
    max_samples = None,
    warm_start = False, 
    n_jobs = -1, 
    verbose = 0, 
)

In [28]:
classifier.fit(X_train, y_train)
print(f"the precision on the traning set is {classifier.score(X_train, y_train)}")
print(f"the precision on the valid set is {classifier.score(X_valid, y_valid)}")

  warn(


the precision on the traning set is 0.779738267509061
the precision on the valid set is 0.834933512553406


In [29]:
y_predict_valid = classifier.predict(X_valid)


RFC = metrics_result(y_valid,y_predict_valid) 

Accuracy : 83.49%
Recall : 100.00%
Precision : 80.00%
Specificity : 51.98%
F1-score : 88.89%


# Calibrated classifier

In [30]:
calibrated_classifier = CalibratedClassifierCV(
    estimator = classifier,
    method = 'sigmoid', # 'sigmoid' or 'isotonic'
    ensemble = True,
    cv = 5,
    n_jobs = -1,
)

In [31]:
calibrated_classifier.fit(X_train, y_train)

y_predict_valid = calibrated_classifier.predict(X_valid)
metrics_result(y_valid, y_predict_valid)

Accuracy : 94.18%
Recall : 100.00%
Precision : 91.90%
Specificity : 83.60%
F1-score : 95.78%


In [32]:
# Use cross-validation to check if calibrated_classifier is overfitting
cv_scores = cross_val_score(calibrated_classifier, X_train, y_train, cv=5, scoring='accuracy')

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

Cross-validation scores: [0.9932527  0.97557105 0.9610897  0.99203915 0.99199915]
Mean cross-validation score: 0.982790351201572


# DecisionTreeClassifier

In [13]:
tree_classifier = DecisionTreeClassifier(
    class_weight = None, # specific to classifiers
    criterion = 'gini',  # specific to classifiers
    splitter = 'best', 
    max_depth = 5, 
    min_samples_split = 2, 
    min_samples_leaf = 1, 
    min_weight_fraction_leaf = 0.0, 
    max_features = None, 
    max_leaf_nodes = None, 
    min_impurity_decrease = 0.0, 
    ccp_alpha = 0.0,
    random_state = None, 
)



In [14]:
tree_classifier.fit(X_train, y_train)

y_predict_valid = tree_classifier.predict(X_valid)
metrics_result(y_valid, y_predict_valid)

Accuracy : 75.86%
Recall : 99.42%
Precision : 73.42%
Specificity : 32.53%
F1-score : 84.47%


## Calibrated_tree_classifier

In [15]:
calibrated_tree_classifier = CalibratedClassifierCV(
    estimator = tree_classifier,
    method = 'sigmoid', # 'sigmoid' or 'isotonic'
    ensemble = True,
    cv = 5,
    n_jobs = -1,
)

In [16]:
calibrated_tree_classifier.fit(X_train, y_train)

y_predict_valid = calibrated_classifier.predict(X_valid)
metrics_result(y_valid, y_predict_valid)

Accuracy : 74.39%
Recall : 100.00%
Precision : 72.05%
Specificity : 23.31%
F1-score : 83.76%


# Logistic Regression

In [22]:

LR_model  = LogisticRegression(
    random_state=20,
    max_iter=100
)

LR_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
y_predict_valid = LR_model.predict(X_valid)
LR = metrics_result(y_valid, y_predict_valid)

Accuracy : 66.88%
Recall : 99.87%
Precision : 66.75%
Specificity : 3.25%
F1-score : 80.02%


# Calibred Logistic Regression


In [24]:
calibrated_LR = CalibratedClassifierCV(
    estimator = LR_model,
    method = 'sigmoid', # 'sigmoid' or 'isotonic'
    ensemble = True,
    cv = 5,
    n_jobs = -1,
)

In [25]:
calibrated_tree_classifier.fit(X_train, y_train)

y_predict_valid = calibrated_classifier.predict(X_valid)
metrics_result(y_valid, y_predict_valid)

Accuracy : 74.39%
Recall : 100.00%
Precision : 72.05%
Specificity : 23.31%
F1-score : 83.76%


In [21]:
import os

# Create the directory if it does not exist
#os.makedirs('c:/Users/Melissa/Bureau/Bsc 2/ML/BITCOIN/ML-project/models/', exist_ok=True)

#joblib.dump(RF_Model, 'c:/Users/Melissa/Bureau/Bsc 2/ML/BITCOIN/ML-project/models/RF_Model.pkl')