In [1]:
import numpy as np
import pandas as pd
import mlflow
import dagshub

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv').drop(columns=['tweet_id'])
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [3]:
import re

def lower_case(text):
    text = [word.lower() for word in text.split()]
    return " ".join(text)


def replace_pattern(pattern, text):
    text = re.sub(pattern, ' ', text)
    return text


def replace_pattern_with_pattern(pattern1, pattern2, text):
    text = re.sub(pattern1, pattern2, text)
    return text

In [4]:
df['content'] = df['content'].apply(lambda x: lower_case(x))
url_pattern = r'https?://[^\s]*|www\.[^\s]*'
user_pattern = r'@[^\s]*'
df['content'] = df['content'].apply(lambda x: replace_pattern(url_pattern, x))
df['content'] = df['content'].apply(lambda x: replace_pattern(user_pattern, x))
non_alphanumeric_pattern = r'[^a-zA-Z0-9]'
df['content'] = df['content'].apply(lambda x: replace_pattern(non_alphanumeric_pattern, x))
search_pattern = r'(.)\1{2,}'
replace_with = r'\1\1'
df['content'] = df['content'].apply(lambda x: replace_pattern_with_pattern(search_pattern, replace_with, x))

In [5]:
df.sample(10)

Unnamed: 0,sentiment,content
13643,love,stressed i want prom to be perfect for me amp...
12048,fun,now it s weekend it s so great i saw the res...
1776,surprise,nightshift i m still working luckily having ...
14266,hate,what really that sucks
4392,sadness,i have to wok a nite shift tonite so will mi...
27341,happiness,time for school i m feelin good 5 am jogs do ...
2426,surprise,grr we are only allowed gas grills where i ...
27715,happiness,were you at the parade yesterday at powderho...
17821,worry,going through security already miss my baby
13729,neutral,the office craving ice cream


In [6]:
df = df[df['sentiment'].isin(['happiness', 'sadness', 'love'])]

In [7]:
df.sample(10)

Unnamed: 0,sentiment,content
15991,sadness,poor greg stupid uk always doing the sympathy...
3139,sadness,english screwed up my overall percentage in ...
25196,happiness,hey everybody hah this day is cool just got ...
29455,sadness,wondering what misery looks like look at the ...
6281,happiness,i don t feel like having to reboot if only o...
33242,happiness,hey chelsee its amiera lt ufc gt
9638,happiness,she is good so gor juz yea i kno i asked he...
36007,happiness,i will go to sleep now might be awakened ea...
28013,sadness,idk why im so hyper im jumping everyhere ugh ...
31962,love,aw honey you just relaxin that s probably ...


In [8]:
df['sentiment'] = df['sentiment'].map({'happiness':0, 'love':0, 'sadness':1})

In [9]:
df.sample(10)

Unnamed: 0,sentiment,content
26489,0,good day all let s see what we can get accomp...
4228,1,well that s no way to start the day off
3724,1,i am afraid you ve had a bit of a fail as t...
2158,1,has had awful hiccups today and now can t fall...
26875,0,man i love your shelves way to go diy diva
12257,0,nice hair and nowhere to go
1851,1,still sick home
23167,0,ideas gallore rock opera ftw yes tweets kayl...
33791,0,good night all 15 miles of trails with 7 peek...
18283,0,maan if i wasnt in pain i d come i wanna co...


In [10]:
# remove stopwords
import nltk
from nltk.corpus import stopwords
stopword = set(stopwords.words('english'))

def remove_stopwords(text):
    new_text = [word for word in text.split() if word not in stopword]
    return " ".join(new_text)

df['content'] = df['content'].apply(lambda x: remove_stopwords(x))

In [11]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# apply lemmatization
def apply_lemmatization(text):
    new_text = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(new_text)

In [12]:
df['content'] = df['content'].apply(lambda x: apply_lemmatization(x))

In [13]:
df.head()

Unnamed: 0,sentiment,content
1,1,layin n bed headache ughh waitin call
2,1,funeral ceremony gloomy friday
6,1,sleep im thinking old friend want married damn...
8,1,charlene love miss
9,1,sorry least friday


In [14]:
mlflow.set_tracking_uri('https://dagshub.com/neerajstd159/mlflow-demo.mlflow')
dagshub.init(repo_owner="neerajstd159", repo_name="mlflow-demo", mlflow=True)
mlflow.set_experiment("model selection")

<Experiment: artifact_location='mlflow-artifacts:/8479dc744197449c931a9e7ccde110f4', creation_time=1734866948303, experiment_id='2', last_update_time=1734866948303, lifecycle_stage='active', name='model selection', tags={}>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from xgboost import XGBClassifier

vectorizer = {
    "BoW": CountVectorizer(),
    "tfidf": TfidfVectorizer()
}

algos = {
    "LogisticRegression" : LogisticRegression(),
    "DecisionTreeClassifier" : DecisionTreeClassifier(),
    "RandomForestClassifier" : RandomForestClassifier(),
    "BernoulliNB" : BernoulliNB(),
    "MultinomialNB" : MultinomialNB(),
    "XGBClassifier" : XGBClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier()
}

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import os

In [17]:
with mlflow.start_run(run_name="All Experiments") as parent_run:
    for key, algo in algos.items():
        for vkey, vectr in vectorizer.items():
            with mlflow.start_run(run_name=f"{key} with {vkey}", nested=True) as child_run:
                X_train, X_test, y_train, y_test = train_test_split(df['content'], df['sentiment'], test_size=0.2)

                # log params
                mlflow.log_param("vectorizer", vkey)
                mlflow.log_param("algorithm", key)
                mlflow.log_param("test_size", 0.2)
                
                X_train_trf = vectr.fit_transform(X_train)
                X_test_trf = vectr.transform(X_test)

                algo.fit(X_train_trf, y_train)
                y_pred = algo.predict(X_test_trf)

                # log model params
                # Log model parameters based on the classifier type
                if key == 'LogisticRegression':
                    mlflow.log_param("C", algo.C)
                elif key == 'DecisionTreeClassifier':
                    mlflow.log_param("max_depth", algo.max_depth)
                    mlflow.log_param("min_samples_split", algo.min_samples_split)
                elif key == 'RandomForestClassifier':
                    mlflow.log_param("n_estimators", algo.n_estimators)
                    mlflow.log_param("max_depth", algo.max_depth)
                elif key == 'BernoulliNB' or key == 'MultinomialNB':
                    mlflow.log_param("alpha", algo.alpha)
                elif key == 'XGBClassifier':
                    mlflow.log_param("learning_rate", algo.learning_rate)
                    mlflow.log_param("n_estimators", algo.n_estimators)
                elif key == 'GradientBoostingClassifier':
                    mlflow.log_param("learning_rate", algo.learning_rate)
                    mlflow.log_param("n_estimators", algo.n_estimators)


                accuracy = accuracy_score(y_pred, y_test)
                precision = precision_score(y_pred, y_test)
                f1_scr = f1_score(y_pred, y_test)
                recall = recall_score(y_pred, y_test)

                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric("precision", precision)
                mlflow.log_metric("recall", recall)
                mlflow.log_metric("f1_score", f1_scr)

                # log model
                mlflow.sklearn.log_model(algo, f"{key}")

                # save and log file
                notebook_path = "select_model.ipynb"
                os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
                mlflow.log_artifact(notebook_path)

                # Print the results for verification
                print(f"Algorithm: {key}, Feature Engineering: {vkey}")
                print(f"Accuracy: {accuracy}")
                print(f"Precision: {precision}")
                print(f"Recall: {recall}")
                print(f"F1 Score: {f1_scr}")



Algorithm: LogisticRegression, Feature Engineering: BoW
Accuracy: 0.79957805907173
Precision: 0.6584440227703985
Recall: 0.7676991150442478
F1 Score: 0.7088866189989785
🏃 View run LogisticRegression with BoW at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/23a07fbb5d7945018b61723d8db90f68
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: LogisticRegression, Feature Engineering: tfidf
Accuracy: 0.7992264416315049
Precision: 0.609360076408787
Recall: 0.7975
F1 Score: 0.6908500270709258
🏃 View run LogisticRegression with tfidf at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/97a0b9a361f542ac9ebf719b8682dfeb
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: DecisionTreeClassifier, Feature Engineering: BoW
Accuracy: 0.7721518987341772
Precision: 0.6751717369970559
Recall: 0.6845771144278607
F1 Score: 0.6798418972332015
🏃 View run DecisionTreeClassifier with BoW at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/bc8624d0be2240ebb87306a0356c0d79
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: DecisionTreeClassifier, Feature Engineering: tfidf
Accuracy: 0.7644163150492265
Precision: 0.6707193515704154
Recall: 0.6573982125124131
F1 Score: 0.6639919759277834
🏃 View run DecisionTreeClassifier with tfidf at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/568f430285ac44f2b75222662b64f4e6
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: RandomForestClassifier, Feature Engineering: BoW
Accuracy: 0.8090717299578059
Precision: 0.6993464052287581
Recall: 0.7721649484536083
F1 Score: 0.7339539441450269
🏃 View run RandomForestClassifier with BoW at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/de8b6cbcbf7d4752b7b2bbedc2ffcc0d
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: RandomForestClassifier, Feature Engineering: tfidf
Accuracy: 0.8034458509142054
Precision: 0.6571709233791748
Recall: 0.7610921501706485
F1 Score: 0.7053241960991038
🏃 View run RandomForestClassifier with tfidf at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/60ed9bf97aec4c5fa33b8a6c16b6b632
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: BernoulliNB, Feature Engineering: BoW
Accuracy: 0.7834036568213784
Precision: 0.562058526740666
Recall: 0.7537212449255751
F1 Score: 0.6439306358381502
🏃 View run BernoulliNB with BoW at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/0b889cbd3e72465aaacecf1b3aa597d4
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: BernoulliNB, Feature Engineering: tfidf
Accuracy: 0.780239099859353
Precision: 0.5537583254043768
Recall: 0.7886178861788617
F1 Score: 0.650642817216322
🏃 View run BernoulliNB with tfidf at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/867187638fd3471c81a9926ec0a6829a
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: MultinomialNB, Feature Engineering: BoW
Accuracy: 0.8055555555555556
Precision: 0.6463878326996197
Recall: 0.7897793263646922
F1 Score: 0.7109252483010977
🏃 View run MultinomialNB with BoW at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/353a0fff71f34d2596fa68bcb16e03fe
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: MultinomialNB, Feature Engineering: tfidf
Accuracy: 0.7725035161744023
Precision: 0.4323529411764706
Recall: 0.8664047151277013
F1 Score: 0.5768476128188358
🏃 View run MultinomialNB with tfidf at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/00abeef009fe4aa8a759764f21468406
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: XGBClassifier, Feature Engineering: BoW
Accuracy: 0.8118846694796061
Precision: 0.608739837398374
Recall: 0.7997329773030708
F1 Score: 0.6912867859203693
🏃 View run XGBClassifier with BoW at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/ecf93ab1257242e092c452c1a8baf976
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: XGBClassifier, Feature Engineering: tfidf
Accuracy: 0.7872714486638537
Precision: 0.5610687022900763
Recall: 0.8021828103683493
F1 Score: 0.6603032004491859
🏃 View run XGBClassifier with tfidf at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/5d0822ce409842568c7c67c9569226ce
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: GradientBoostingClassifier, Feature Engineering: BoW
Accuracy: 0.7482419127988749
Precision: 0.36663336663336665
Recall: 0.8173719376391982
F1 Score: 0.5062068965517241
🏃 View run GradientBoostingClassifier with BoW at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/c6623b19b29e44fab8f0055ade50693b
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2




Algorithm: GradientBoostingClassifier, Feature Engineering: tfidf
Accuracy: 0.7468354430379747
Precision: 0.40754369825206993
Recall: 0.8535645472061657
F1 Score: 0.5516811955168119
🏃 View run GradientBoostingClassifier with tfidf at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/23512c3ecbcc40bcb3af154fd7908ae6
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2
🏃 View run All Experiments at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2/runs/050b5b11b29946c28674082399aa769b
🧪 View experiment at: https://dagshub.com/neerajstd159/mlflow-demo.mlflow/#/experiments/2
