In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report, matthews_corrcoef)
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd

In [17]:
# Train and test the model
def train_model(X_train, y_train, X_test, y_test, feature_names, grid_search=False):
    """Fit a random forest classifier, evaluate it on the test split, return it.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``fit``.
    X_test, y_test
        Held-out features and labels used for the printed evaluation.
    feature_names
        Names passed through to ``feature_importance`` to label the
        importances of the fitted model.
    grid_search : bool, default False
        When truthy, the fixed starting configuration is only used as the
        base estimator of a ``GridSearchCV`` over ``param_grid`` and the
        best estimator found is evaluated and returned.  When falsy, the
        starting configuration is fitted directly.

    Returns
    -------
    The fitted estimator (either the directly fitted forest or the grid
    search's ``best_estimator_``).

    Side effects: prints the performance metrics and the feature importances
    (and, with ``grid_search``, the best parameters and best model).
    """
    # Fixed starting configuration; also the base estimator for the search.
    start_model = RandomForestClassifier(
        verbose=1,
        n_jobs=-1,
        n_estimators=200,
        min_samples_leaf=2,
        min_samples_split=10,
        max_depth=10,
        max_features=None,
        random_state=42,
        #class_weight='balanced'
    )

    if grid_search:
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None],
        }

        # Searching for the best tree parameters.  The search object gets its
        # own name so the boolean ``grid_search`` flag is not overwritten.
        # NOTE(review): both the forest and the search request n_jobs=-1 --
        # confirm this nesting does not oversubscribe the machine.
        search = GridSearchCV(
            start_model,
            param_grid,
            n_jobs=-1
        )

        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_params = search.best_params_

        print("best_params_ ----> Random Forest:", best_params)
        print("best_rf_model: ", best_model)
    else:
        best_model = start_model
        best_model.fit(X_train, y_train)

    # Test predictions
    y_pred = best_model.predict(X_test)

    # Evaluate model
    metrics_rf = calculate_performance_metrics(y_test, y_pred)
    print_performance_metrics(metrics_rf)
    feature_importance(best_model, X_train, feature_names)

    return best_model

# Useful values for classification
def calculate_performance_metrics(y_test, y_pred):
    """Compute the classification metrics for a set of predictions.

    Parameters
    ----------
    y_test
        True labels.
    y_pred
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1_score'``,
        ``'confusion_matrix'``, ``'mcc'`` and ``'classification_report'``.
        Precision, recall and F1 use ``average='weighted'``.

    ``zero_division=0`` is passed explicitly to the precision/recall/F1 based
    metrics: when a score is undefined for some label, scikit-learn's default
    (``"warn"``) also substitutes 0 but emits a warning on every call, which
    floods the notebook output.  The computed values are the same.
    """
    weighted = {'average': 'weighted', 'zero_division': 0}

    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, **weighted),
        'recall': recall_score(y_test, y_pred, **weighted),
        'f1_score': f1_score(y_test, y_pred, **weighted),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'mcc': matthews_corrcoef(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred, zero_division=0),
    }

# Prints all performance metrics
def print_performance_metrics(metrics):
    """Print every known performance metric, one labelled entry at a time.

    Parameters
    ----------
    metrics : dict
        Mapping of metric key to value.  Any of the expected keys that is
        absent is reported as ``"Not computed"`` instead of raising.
    """
    # (label printed, key looked up) in output order.  Labels ending in a
    # newline put the value on the following line.
    layout = (
        ("Accuracy:", 'accuracy'),
        ("Precision:", 'precision'),
        ("Recall:", 'recall'),
        ("F1 Score:", 'f1_score'),
        ("Confusion Matrix:\n", 'confusion_matrix'),
        ("Matthews Correlation Coefficient (MCC):", 'mcc'),
        ("Classification Report:\n", 'classification_report'),
    )

    for label, key in layout:
        print(label, metrics.get(key, "Not computed"))

# Determine the feature importance in the model
def feature_importance(model, X, feature_names):
    """Print the model's feature importances, largest first.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_``.
    X
        Not used by this function; it is part of the existing signature.
    feature_names
        Names looked up by position, one per entry of
        ``model.feature_importances_``.
    """
    # Pair each importance with its name by position, then rank descending.
    # The sort is stable, so equal importances keep their original order.
    ranked = sorted(
        (
            (feature_names[position], weight)
            for position, weight in enumerate(model.feature_importances_)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print("Feature Importances:")
    for name, weight in ranked:
        print(f"{name}: {weight}")

In [3]:
# Trains the model
def create_model():
    """Load the processed dataset, split it, and train the classifier.

    Reads ``./data/final.csv``, drops the ``date`` column, uses
    ``weather_code`` as the target and every remaining column as a feature,
    holds out 20% of the rows as the test split (``random_state=42``) and
    hands both splits to ``train_model``.

    Returns
    -------
    The fitted model returned by ``train_model``.
    """
    target_name = 'weather_code'

    # Load the data
    processed_data = pd.read_csv("./data/final.csv").drop(columns=["date"])

    # Separate data: drop the target once and derive both the feature matrix
    # and the feature names from that same frame so they cannot drift apart.
    features = processed_data.drop(columns=[target_name])
    feature_names = features.columns.tolist()
    X = features.values
    y = processed_data[target_name].values

    # Get test and train
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print('Read the data')
    # Run the model
    model = train_model(X_train, y_train, X_test, y_test, feature_names)

    return model

In [18]:
# Runs the model
def run_model_training():
    """Run the training pipeline end to end.

    Calls ``create_model`` and discards the model it returns, so this
    function itself returns ``None``.
    """
    create_model()
    return None


run_model_training()

Read the data


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.0s finished
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Accuracy: 0.9148936170212766
Precision: 0.9118436213832322
Recall: 0.9148936170212766
F1 Score: 0.9097684208195331
Confusion Matrix:
 [[ 34   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  38   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0  34   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 240   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0  44   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0  26   5   0   0   1   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   3   9   1   0   1   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   1   1   0   5   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  14   2   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   7  14   1   0   0   0   0   1   0]
 [  0   0   0   0   0   0   0   0   0   0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
