In [17]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
import os

In [18]:
# Source CSV of labelled tweets; the tweet_id column is dropped right after loading.
DATA_URL = 'https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv'

df = pd.read_csv(DATA_URL)
df = df.drop(columns=['tweet_id'])
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [19]:
import re

def lower_case(text):
    """Lower-case every whitespace-separated token of ``text``.

    Tokens are rejoined with single spaces, so runs of whitespace (and any
    leading/trailing whitespace) are collapsed as a side effect.
    """
    lowered_tokens = map(str.lower, text.split())
    return " ".join(lowered_tokens)


def replace_pattern(pattern, text):
    """Return ``text`` with every match of the regex ``pattern`` replaced by one space."""
    matcher = re.compile(pattern)
    return matcher.sub(' ', text)


def replace_pattern_with_pattern(pattern1, pattern2, text):
    """Substitute every match of regex ``pattern1`` in ``text`` with ``pattern2``.

    ``pattern2`` is a regex replacement template, so backreferences such as
    ``\\1`` refer to groups captured by ``pattern1``.
    """
    matcher = re.compile(pattern1)
    return matcher.sub(pattern2, text)

In [20]:
# Regexes for the cleaning passes below.
url_pattern = r'https?://[^\s]*|www\.[^\s]*'
user_pattern = r'@[^\s]*'
non_alphanumeric_pattern = r'[^a-zA-Z0-9]'
search_pattern = r'(.)\1{2,}'   # any character repeated three or more times
replace_with = r'\1\1'          # ...is shortened to exactly two

# Pass 1: lower-case. Passes 2-4: blank out URLs, @mentions, then every remaining
# non-alphanumeric character. URLs and mentions go first because the
# non-alphanumeric pass would strip the ':', '/', '.' and '@' their patterns key on.
content = df['content'].apply(lower_case)
for blank_out in (url_pattern, user_pattern, non_alphanumeric_pattern):
    content = content.apply(lambda x, p=blank_out: replace_pattern(p, x))

# Pass 5: squeeze elongated character runs ("ughhhh" -> "ughh").
content = content.apply(lambda x: replace_pattern_with_pattern(search_pattern, replace_with, x))
df['content'] = content

In [21]:
# Spot-check ten random rows of the cleaned text (no random_state is passed, so the rows differ per run).
df.sample(10)

Unnamed: 0,sentiment,content
13716,enthusiasm,i want one so bad get one for me
4042,love,i hope you come over today its almost 8 ill b...
33957,love,happy birthday justin have a lof of fun go...
9962,love,thats what i love about you you are as sham...
30397,surprise,christa s mothers day card deep in the build ...
16865,happiness,good job i have the track in my head cos sa...
8819,love,i wanna go i wanna go but i can t
27701,worry,yes i know as my school reports would say m...
14791,worry,plus it hurts seeing sum1 you love falling for...
30264,happiness,thank you yes yes hooray


In [22]:

# Keep only the three emotions used for the binary task. The explicit .copy()
# makes `df` an independent frame, so the column assignments in later cells do
# not go through pandas' chained-assignment (SettingWithCopyWarning) path.
df = df[df['sentiment'].isin(['happiness', 'sadness', 'love'])].copy()

In [23]:
# Binary target: happiness/love -> 0, sadness -> 1.
# Series.map turns values missing from the mapping into NaN, so running this
# cell a second time (on already-encoded labels) would wipe the column.
sentiment_to_label = {'happiness':0, 'love':0, 'sadness':1}
df['sentiment'] = df['sentiment'].map(sentiment_to_label)

In [24]:
# remove stopwords
import nltk
from nltk.corpus import stopwords

# The corpus reader raises LookupError when the stopword list has not been
# downloaded yet (e.g. a fresh environment); fetch it once and retry so the
# notebook survives Restart & Run All.
try:
    stopword = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopword = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``stopword``.

    Surviving tokens are rejoined with single spaces.
    """
    return " ".join(word for word in text.split() if word not in stopword)


df['content'] = df['content'].apply(remove_stopwords)

In [25]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# WordNet is only loaded on the first lemmatize() call, so probe once here and
# download the data if it is missing instead of failing inside DataFrame.apply.
# omw-1.4 is fetched too because some NLTK releases need it to load WordNet.
try:
    lemmatizer.lemmatize('probe')
except LookupError:
    for resource in ('wordnet', 'omw-1.4'):
        nltk.download(resource, quiet=True)


# apply lemmatization
def apply_lemmatization(text):
    """Lemmatize each whitespace-separated token of ``text`` and rejoin with single spaces."""
    return " ".join(lemmatizer.lemmatize(word) for word in text.split())


df['content'] = df['content'].apply(apply_lemmatization)

In [26]:
# Preview the first rows after stopword removal and lemmatization.
df.head()

Unnamed: 0,sentiment,content
1,1,layin n bed headache ughh waitin call
2,1,funeral ceremony gloomy friday
6,1,sleep im thinking old friend want married damn...
8,1,charlene love miss
9,1,sorry least friday


In [27]:
import mlflow
import dagshub

# Remote MLflow tracking server and the experiment all runs below are filed under.
TRACKING_URI = 'https://dagshub.com/neerajstd159/mlflow-demo.mlflow'
EXPERIMENT_NAME = "Random forest with bow"

mlflow.set_tracking_uri(TRACKING_URI)
dagshub.init(repo_owner="neerajstd159", repo_name="mlflow-demo", mlflow=True)
# Left as the last expression so the cell displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/b5f3034bb8a64980b4d6fb7f5622f3d3', creation_time=1734962267570, experiment_id='5', last_update_time=1734962267570, lifecycle_stage='active', name='Random forest with bow', tags={}>

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Bag-of-words counts feeding a random forest. The forest is seeded so that
# repeated searches score each candidate identically; the train/test splits
# are already seeded with the same value.
pipeline = Pipeline([
    ("BoW", CountVectorizer()),
    ("rf", RandomForestClassifier(random_state=42))
])

# Keys use the "<step name>__<parameter>" form so the search can route each
# value to the right pipeline step.
# 3 * 2 * 3 * 3 * 3 * 3 = 486 candidate settings in total.
param_grid = {
    "BoW__max_features": [1000, 2000, 5000],
    "BoW__ngram_range": [(1, 1), (1, 2)],
    "rf__n_estimators": [50, 100, 200],
    "rf__max_depth": [None, 10, 20],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4],
}


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# Split data: 80/20 train/test, then a further 80/20 train/validation split.
# Both splits are stratified on the label and seeded.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df['content'], df['sentiment'], test_size=0.2, random_state=42, stratify=df['sentiment'])
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full)

with mlflow.start_run() as parent_run:
    # Exhaustive search over param_grid, scored by 2-fold CV accuracy on X_train.
    grid_search = GridSearchCV(pipeline, param_grid, cv=2, scoring='accuracy', n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Log each configuration and its mean CV accuracy as a nested child run
    cv_results = grid_search.cv_results_
    for i, (params, mean_test_score) in enumerate(zip(cv_results['params'], cv_results['mean_test_score'])):
        with mlflow.start_run(run_name=f"Param_Set_{i+1}", nested=True) as child_run:
            mlflow.log_params(params)
            mlflow.log_metric("mean_cv_accuracy", mean_test_score)

    # Get best model and parameters (logged on the parent run)
    best_estimator = grid_search.best_estimator_
    best_params = grid_search.best_params_
    mlflow.log_params(best_params)

    # Evaluate the refit best pipeline on the held-out test set
    y_pred = best_estimator.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1_scr = f1_score(y_test, y_pred, average='weighted')

    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.log_metric("test_precision", precision)
    mlflow.log_metric("test_recall", recall)
    mlflow.log_metric("test_f1_score", f1_scr)

    # Log confusion matrix as an artifact
    cm_display = ConfusionMatrixDisplay.from_estimator(best_estimator, X_test, y_test, cmap=plt.cm.Blues)
    cm_figure_path = "confusion_matrix.png"
    cm_display.figure_.savefig(cm_figure_path)
    mlflow.log_artifact(cm_figure_path)

    # Log the notebook file as it is stored on disk.
    # Do not shell out to `jupyter nbconvert --execute --inplace` here: that
    # re-runs the entire target notebook in a new kernel from inside this run
    # (recursively, if the path is this notebook), and os.system would ignore
    # a failing exit status anyway.
    notebook_path = "selected_model_hp.ipynb"
    if os.path.exists(notebook_path):
        mlflow.log_artifact(notebook_path)
    else:
        print(f"Notebook {notebook_path} not found; skipping notebook artifact")

    # X_val is used only here, as the example input for the model signature.
    signature = mlflow.models.infer_signature(X_val, best_estimator.predict(X_val))
    mlflow.sklearn.log_model(best_estimator, "best_model", signature=signature)

    # Print the results for verification
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1_scr:.4f}")


Fitting 5 folds for each of 486 candidates, totalling 2430 fits
🏃 View run Param_Set_1 at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/5/runs/d66f2248c75d43c39cf8dc1b7bba0168
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/5
🏃 View run Param_Set_2 at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/5/runs/4e7fdcb90a3e4eefa1bd1a458044f1c2
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/5
🏃 View run Param_Set_3 at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/5/runs/8f55168de29b49aca3336f31252c73ac
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/5
🏃 View run Param_Set_4 at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/5/runs/942db5b3ad4d4ec0ab4497b36f57df80
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/5
🏃 View run Param_Set_5 at: https://dagshub.c

KeyboardInterrupt: 

In [None]:
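# NOTE(review): disabled draft of a tuning cell, kept for reference only. Before re-enabling it,
# note that sklearn metrics take (y_true, y_pred) in that order and that infer_signature
# should receive inputs and predictions from the same split.
# with mlflow.start_run() as parent_run: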
#     grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
#     grid_search.fit(X_train, y_train)

#     for i, params in enumerate(grid_search.cv_results_['params']):
#         with mlflow.start_run(run_name=f"{i+1}", nested=True) as child_run:

#             mlflow.log_params(params)
#             mean_test_score = grid_search.cv_results_['mean_test_score'][i]
#             mlflow.log_metric("mean_accuracy", mean_test_score)

#     best_estimator = grid_search.best_estimator_
#     best_params = grid_search.best_params_
#     y_pred = best_estimator.predict(X_test)

#     accuracy = accuracy_score(y_pred, y_test)
#     precision = precision_score(y_pred, y_test)
#     f1_scr = f1_score(y_pred, y_test)
#     recall = recall_score(y_pred, y_test)

#     mlflow.log_params(best_params)
#     mlflow.log_metric("accuracy", accuracy)
#     mlflow.log_metric("precision", precision)
#     mlflow.log_metric("recall", recall)
#     mlflow.log_metric("f1_score", f1_scr)

#     signature = mlflow.models.infer_signature(X_train, best_estimator.predict(X_test))
#     mlflow.sklearn.log_model(best_estimator, "best_model", signature=signature)
#     # save and log file
#     notebook_path = "lr_tfidf_hp.ipynb"
#     os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
#     mlflow.log_artifact(notebook_path)

#     # Print the results for verification
#     print(f"Accuracy: {accuracy}")
#     print(f"Precision: {precision}")
#     print(f"Recall: {recall}")
#     print(f"F1 Score: {f1_scr}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
🏃 View run 1 at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/3/runs/a99c4e68334f4dc490379a2bb06b9c08
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/3
🏃 View run 2 at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/3/runs/db5400c468a8424aadd4807a072787b9
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/3
🏃 View run 3 at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/3/runs/77616e6d70d1470a98a9c40a50c7060f
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/3
🏃 View run 4 at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/3/runs/40fe12758c9d423295a1a050b3cef7d9
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/3
🏃 View run 5 at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/3/r