In [83]:
import numpy as np
import pandas as pd
import mlflow
import dagshub

In [52]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv').drop(columns=['tweet_id'])
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [53]:
import re

def lower_case(text):
    text = [word.lower() for word in text.split()]
    return " ".join(text)


def replace_pattern(pattern, text):
    text = re.sub(pattern, ' ', text)
    return text


def replace_pattern_with_pattern(pattern1, pattern2, text):
    text = re.sub(pattern1, pattern2, text)
    return text

In [54]:
df['content'] = df['content'].apply(lambda x: lower_case(x))
url_pattern = r'https?://[^\s]*|www\.[^\s]*'
user_pattern = r'@[^\s]*'
df['content'] = df['content'].apply(lambda x: replace_pattern(url_pattern, x))
df['content'] = df['content'].apply(lambda x: replace_pattern(user_pattern, x))
non_alphanumeric_pattern = r'[^a-zA-Z0-9]'
df['content'] = df['content'].apply(lambda x: replace_pattern(non_alphanumeric_pattern, x))
search_pattern = r'(.)\1{2,}'
replace_with = r'\1\1'
df['content'] = df['content'].apply(lambda x: replace_pattern_with_pattern(search_pattern, replace_with, x))

In [61]:
df.sample(10)

Unnamed: 0,sentiment,content
12037,fun,well aren t you just absolutely special hah...
24405,neutral,playing see you again miley cyrus in the shop
9656,happiness,dad s posting got postponed one hour before he...
12977,happiness,take my quot how well do you know me quot ...
29969,worry,singing songs and filming movies what i do best
31399,happiness,butlers watching dr farmer rock out w the ...
2609,neutral,i saw the note in faq about other languages ...
31046,neutral,lt 3c b mothers day tomorrow
10880,neutral,does anyone know how to call an international ...
30639,neutral,has a little buzz from the epoxy paint


In [64]:
df = df[df['sentiment'].isin(['happiness', 'sadness', 'love'])]

In [65]:
df.sample(10)

Unnamed: 0,sentiment,content
35288,happiness,happy mothers day i hope tom got you someth...
18817,sadness,waiting for the denver game to come on but i ...
20998,love,and may the 4th be with you too amused lo...
38241,happiness,hey ive seen the musical live its funny lol
8020,love,eating some breakfast at panera bread boring ...
14394,sadness,i really cant take this
5246,happiness,ooh lovely a ridiculous amount of revision le...
9681,sadness,oh god i just watched it to did claire esca...
38934,love,your the voice i hear inside my head the reas...
22263,happiness,you re welcome and thanks for inviting us to...


In [70]:
df['sentiment'] = df['sentiment'].map({'happiness':0, 'love':0, 'sadness':1})

In [71]:
df.sample(10)

Unnamed: 0,sentiment,content
12144,1,gotta work lonely weekend ahead
32500,0,maybe he could drill a hole in me lol oh snap...
24578,0,i can remember those evidence based manageme...
15624,1,my day has been so crappy that i just want to ...
37643,0,love me love me say that you love me
4905,1,congrats to all graduates such a beautiful th...
21376,0,oh and happy judd day haha judday
39315,0,currently in costa coffee im liking this place...
23661,0,i finished all my work
19485,1,i m sad to take off the suit b c i know i m no...


In [76]:
# remove stopwords
import nltk
from nltk.corpus import stopwords
stopword = set(stopwords.words('english'))

def remove_stopwords(text):
    new_text = [word for word in text.split() if word not in stopword]
    return " ".join(new_text)

df['content'] = df['content'].apply(lambda x: remove_stopwords(x))

In [77]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# apply lemmatization
def apply_lemmatization(text):
    new_text = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(new_text)

In [81]:
df['content'] = df['content'].apply(lambda x: apply_lemmatization(x))

In [82]:
df.head()

Unnamed: 0,sentiment,content
1,1,layin n bed headache ughh waitin call
2,1,funeral ceremony gloomy friday
6,1,sleep im thinking old friend want married damn...
8,1,charlene love miss
9,1,sorry least friday


In [84]:
mlflow.set_experiment("model selection")

2024/12/22 16:16:32 INFO mlflow.tracking.fluent: Experiment with name 'model selection' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/data/projects/mlflow-basic-demo/notebooks/mlruns/475922320043714587', creation_time=1734864392257, experiment_id='475922320043714587', last_update_time=1734864392257, lifecycle_stage='active', name='model selection', tags={}>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from xgboost import XGBClassifier

vectorizer = {
    "BoW": CountVectorizer(),
    "tfidf": TfidfVectorizer()
}

algos = {
    "LogisticRegression" : LogisticRegression(),
    "DecisionTreeClassifier" : DecisionTreeClassifier(max_depth=3),
    "RandomForestClassifier" : RandomForestClassifier(),
    "BernoulliNB" : BernoulliNB(),
    "MultinomialNB" : MultinomialNB(),
    "XGBClassifier" : XGBClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier()
}

In [98]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import os

In [None]:
with mlflow.start_run(run_name="All Experiments") as parent_run:
    for key, algo in algos.items():
        for vkey, vectr in vectorizer.items():
            with mlflow.start_run(run_name=f"{key} with {vkey}", nested=True) as child_run:
                X_train, X_test, y_train, y_test = train_test_split(df['content'], df['sentiment'], test_size=0.2)

                # log params
                mlflow.log_param("vectorizer", vkey)
                mlflow.log_param("algorithm", key)
                mlflow.log_param("test_size", 0.2)
                
                X_train_trf = vectr.fit_transform(X_train)
                X_test_trf = vectr.transform(X_test)

                algo.fit(X_train_trf, y_train)
                y_pred = algo.predict(X_test_trf)

                # log model params
                # Log model parameters based on the classifier type
                if key == 'LogisticRegression':
                    mlflow.log_param("C", algo.C)
                elif key == 'DecisionTreeClassifier':
                    mlflow.log_param("max_depth", algo.max_depth)
                    mlflow.log_param("min_samples_split", algo.min_samples_split)
                elif key == 'RandomForestClassifier':
                    mlflow.log_param("n_estimators", algo.n_estimators)
                    mlflow.log_param("max_depth", algo.max_depth)
                elif key == 'BernoulliNB' or key == 'MultinomialNB':
                    mlflow.log_param("alpha", algo.alpha)
                elif key == 'XGBClassifier':
                    mlflow.log_param("learning_rate", algo.learning_rate)
                    mlflow.log_param("n_estimators", algo.n_estimators)
                elif key == 'GradientBoostingClassifier':
                    mlflow.log_param("learning_rate", algo.learning_rate)
                    mlflow.log_param("n_estimators", algo.n_estimators)


                accuracy = accuracy_score(y_pred, y_test)
                precision = precision_score(y_pred, y_test)
                f1_scr = f1_score(y_pred, y_test)
                recall = recall_score(y_pred, y_test)

                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric("precision", precision)
                mlflow.log_metric("recall", recall)
                mlflow.log_metric("f1_score", f1_scr)

                # log model
                mlflow.sklearn.log_model(algo, f"{key}")

                # save and log file
                notebook_path = "select_model.ipynb"
                os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
                mlflow.log_artifact(notebook_path)

                # Print the results for verification
                print(f"Algorithm: {key}, Feature Engineering: {vkey}")
                print(f"Accuracy: {accuracy}")
                print(f"Precision: {precision}")
                print(f"Recall: {recall}")
                print(f"F1 Score: {f1_scr}")

LogisticRegression
DecisionTreeClassifier
RandomForestClassifier
BernoulliNB
MultinomialNB
XGBClassifier
