In [32]:
import numpy as np
import pandas as pd

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [36]:

import os

import mlflow

# Point MLflow at the remote tracking server.
# The server is addressed by an EC2 public DNS name, which is not stable (the
# run links saved in this notebook's output already reference a different
# host), so prefer the MLFLOW_TRACKING_URI environment variable and fall back
# to the last known address only when it is not set.
mlflow.set_tracking_uri(
    os.environ.get(
        "MLFLOW_TRACKING_URI",
        'http://ec2-51-20-64-139.eu-north-1.compute.amazonaws.com:5000',
    )
)

In [None]:

# Group every run started below under a single MLflow experiment.
mlflow.set_experiment(experiment_name='exp 2 - BoW vs Tfidf')

In [27]:
def run_experiment(X, y, vectorizer_name, ngram_range):
    """Train a RandomForest on vectorized text and log the run to MLflow.

    The data is split 80/20 (``random_state=42``), the vectorizer is fitted on
    the training split only, and a ``RandomForestClassifier`` is trained on the
    resulting features. Tags, hyper-parameters, accuracy, the per-label
    classification-report metrics, a confusion-matrix figure and the fitted
    model are logged to a new MLflow run.

    Parameters
    ----------
    X : sequence of str
        Raw text documents (anything ``train_test_split`` and the sklearn
        text vectorizers accept).
    y : sequence
        Target labels aligned with ``X``.
    vectorizer_name : str
        ``'BoW'`` selects ``CountVectorizer``; any other value selects
        ``TfidfVectorizer``. The value is also used in the run name, tags and
        the logged model's artifact name.
    ngram_range : tuple of (int, int)
        Passed through to the vectorizer's ``ngram_range``.

    Returns
    -------
    None
        All results are recorded in MLflow.
    """
    max_features = 5000
    n_estimators = 200
    max_depth = 15

    # Anything other than 'BoW' falls through to TF-IDF (callers pass "Tf-idf").
    if vectorizer_name == 'BoW':
        vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=max_features)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the vocabulary on the training split only so nothing from the test
    # split leaks into the features.
    X_train_trf = vectorizer.fit_transform(X_train)
    X_test_trf = vectorizer.transform(X_test)

    with mlflow.start_run():
        # Batch tags/params/metrics: each individual log call is a separate
        # request to the remote tracking server.
        mlflow.set_tags({
            "mlflow.runName": f"{vectorizer_name}_{ngram_range}_RandomForest",
            "experiment_type": "feature engineering",
            "model_type": "RandomForestClassifier",
            # Key was previously misspelled as "discription".
            "description": f"RandomForest with {vectorizer_name}, ngram_range={ngram_range} and max_features={max_features}",
        })
        mlflow.log_params({
            "vectorizer_type": vectorizer_name,
            "ngram_range": ngram_range,
            "vectorizer_max_features": max_features,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
        })

        # Fit the model
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train_trf, y_train)

        y_pred = model.predict(X_test_trf)
        accuracy = accuracy_score(y_test, y_pred)
        class_report = classification_report(y_test, y_pred, output_dict=True)

        # Use one explicit, sorted label list for both the matrix and the axis
        # ticks so the plot shows real class labels instead of 0..n-1 indices.
        class_labels = np.unique(np.concatenate((np.asarray(y_test), np.asarray(y_pred))))
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

        metrics = {"accuracy": accuracy}
        for label, scores in class_report.items():
            # The report's top-level "accuracy" entry is a bare float, not a
            # dict of per-label scores; it is already logged above.
            if isinstance(scores, dict):
                for metric, value in scores.items():
                    metrics[f"{label}_{metric}"] = value
        mlflow.log_metrics(metrics)

        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(
                conf_matrix,
                annot=True,
                fmt="d",  # integer counts; the default format uses scientific notation
                cmap="Blues",
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax,
            )
            ax.set_xlabel("Predicted")
            ax.set_ylabel('Actual')
            ax.set_title(f"Confusion Matrix {vectorizer_name}, {ngram_range}")
            # log_figure uploads the figure directly, so no shared
            # "confusion_matrix.png" is left behind in the working directory.
            mlflow.log_figure(fig, "confusion_matrix.png")
        finally:
            # Always release the figure, even if logging fails.
            plt.close(fig)

        # NOTE: only the classifier is logged; the fitted vectorizer is not
        # part of the artifact, so the logged model expects vectorized input.
        mlflow.sklearn.log_model(model, f"random_forest_{vectorizer_name}_{ngram_range}")

In [28]:
# Load the dataset and drop rows with missing values. The raw frame is kept
# under its own name (no in-place mutation) so this cell can be re-run safely.
raw_df = pd.read_csv('dataset.csv')
print(raw_df.shape)
df = raw_df.dropna()
print(df.shape)

X = df['clean_comment']
y = df['category']

# One MLflow run per (n-gram range, vectorizer) combination; for each range
# the BoW run is logged before the Tf-idf run.
ngram_ranges = [(1,1), (1,2), (1,3)]
for ngram_range in ngram_ranges:
    for vectorizer_name in ("BoW", "Tf-idf"):
        run_experiment(X, y, vectorizer_name, ngram_range)

(36793, 2)
(36662, 2)




🏃 View run BoW_(1, 1)_RandomForest at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812/runs/f6eb7fdc9b7c41a78810786ebb17feae
🧪 View experiment at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812




🏃 View run Tf-idf_(1, 1)_RandomForest at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812/runs/b16a8db14bac430a91d12934d6dc5935
🧪 View experiment at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812




🏃 View run BoW_(1, 2)_RandomForest at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812/runs/9269c1014cc74b28ab87582bf88cbe00
🧪 View experiment at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812




🏃 View run Tf-idf_(1, 2)_RandomForest at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812/runs/172596a3c0d84af6998f68e5ac89b40c
🧪 View experiment at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812




🏃 View run BoW_(1, 3)_RandomForest at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812/runs/6d4a3eda9c18422985346891e9e6777f
🧪 View experiment at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812




🏃 View run Tf-idf_(1, 3)_RandomForest at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812/runs/e90dad80fcb84e939cd6b528268ebab7
🧪 View experiment at: http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/#/experiments/394738919125695812
