In [14]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [15]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv').drop(columns=['tweet_id'])
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [16]:
# data preprocessing

# Define text preprocessing functions
def lemmatization(text):
    """Lemmatize the text."""
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

def remove_stop_words(text):
    """Remove stop words from the text."""
    stop_words = set(stopwords.words("english"))
    text = [word for word in str(text).split() if word not in stop_words]
    return " ".join(text)

def removing_numbers(text):
    """Remove numbers from the text."""
    text = ''.join([char for char in text if not char.isdigit()])
    return text

def lower_case(text):
    """Convert text to lower case."""
    text = text.split()
    text = [word.lower() for word in text]
    return " ".join(text)

def removing_punctuations(text):
    """Remove punctuations from the text."""
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('؛', "")
    text = re.sub('\s+', ' ', text).strip()
    return text

def removing_urls(text):
    """Remove URLs from the text."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def normalize_text(df):
    """Normalize the text data."""
    try:
        df['content'] = df['content'].apply(lower_case)
        df['content'] = df['content'].apply(remove_stop_words)
        df['content'] = df['content'].apply(removing_numbers)
        df['content'] = df['content'].apply(removing_punctuations)
        df['content'] = df['content'].apply(removing_urls)
        df['content'] = df['content'].apply(lemmatization)
        return df
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise

  text = re.sub('\s+', ' ', text).strip()


In [17]:
# map sentiment to integer
sentiment={'empty':0,'sadness':1,'enthusiasm':2,'neutral':3,'worry':4,'surprise':5,'love':6,'fun':7,'hate':8,'happiness':9,'boredom':10,'relief':11,'anger':12}
df['sentiment']=df['sentiment'].map(sentiment)

In [None]:
# Normalize the text data
df = normalize_text(df)
df=df.sample(round(len(df)*0.5), random_state=42, replace=False).reset_index()

Unnamed: 0,index,sentiment,content
0,15644,1,want hair cut mom wont cut
1,9591,1,long day office tiring week
2,9641,3,lazy day staying foot much possible
3,27054,3,janissharp thanks janis
4,33682,0,got nashville ihop staff hit dougie mayne fuck...
...,...,...,...
9995,23152,9,tomfelton http twitpic com dmtn saw pic remind...
9996,39605,5,finally started serious revision completed eng...
9997,33336,9,watching season episode house bed great bedtim...
9998,4523,1,think computer sick


In [19]:
df

Unnamed: 0,sentiment,content
32823,3,good morning
16298,0,put computer craigslist i ve case monitor spea...
28505,6,ten minute shopping demi lovato back around de...
6689,3,twitterberry moved ubertwitter suffered bb cac...
26893,1,thriftymom tear
...,...,...
14965,4,currently drinking effervescent vitamin c sore...
22133,9,rainstopper lok ef too jummy
5442,1,need get life together aka apartment cleaned b...
21456,3,rosehwang thanks


In [16]:
vectorize=CountVectorizer(max_features=1500)
X=vectorize.fit_transform(df['content']).toarray()
y=df['sentiment']


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# reshape y_train and y_test
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)


# print the shape of the training and testing sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")    
print(f"y_test shape: {y_test.shape}")

X_train shape: (32000, 1500)
X_test shape: (8000, 1500)
y_train shape: (32000, 1)
y_test shape: (8000, 1)


In [17]:
import dagshub

import mlflow
mlflow.set_tracking_uri('https://dagshub.com/manikantmnnit/mini_project_mlops.mlflow') # Set the MLflow tracking URI to the DagsHub project's MLflow server
dagshub.init(repo_owner='manikantmnnit', repo_name='mini_project_mlops', mlflow=True)
mlflow.set_experiment('logistic Regression Baseline')  # set the experiment name to 'logistic Regression Baseline'  

2025/01/21 12:39:04 INFO mlflow.tracking.fluent: Experiment with name 'logistic Regression Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/2ecec70d08aa45519d5890c598f0ac6b', creation_time=1737484743038, experiment_id='0', last_update_time=1737484743038, lifecycle_stage='active', name='logistic Regression Baseline', tags={}>

In [29]:
with mlflow.start_run():
    
    # Log preprocessing parameters
    mlflow.log_param("vectorizer", "Bag of Words")
    mlflow.log_param("num_features", 1500)
    mlflow.log_param("test_size", 0.2)
    
    # Model building and training
    model = LogisticRegression(multi_class='multinomial', solver='lbfgs') # multinomial logistic regression  and lbfgs solver for optimization  
    model.fit(X_train, y_train)

    # Log model parameters
    mlflow.log_param("model", "Logistic Regression")
    
    # Model evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')
   


    # Log model metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1", f1)
    


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


🏃 View run secretive-asp-896 at: https://dagshub.com/manikantmnnit/mini_project_mlops.mlflow/#/experiments/0/runs/76ca08e03a2943dea069807ae18c537a
🧪 View experiment at: https://dagshub.com/manikantmnnit/mini_project_mlops.mlflow/#/experiments/0


In [28]:
# Logistic Regression with One-vs-Rest
model_ovr = LogisticRegression(multi_class='ovr', solver='liblinear')
model_ovr.fit(X_train, y_train)
y_pred_ovr = model_ovr.predict(X_test)


with mlflow.start_run():
   # Log preprocessing parameters
   mlflow.log_param("vectorizer", "Bag of Words")   
   mlflow.log_param("num_features", 1500)
   mlflow.log_param("test_size", 0.2) 

    # Model building and training

   model_ovr = LogisticRegression(multi_class='ovr', solver='liblinear') # one-vs-rest logistic regression  and liblinear solver for optimization
   model_ovr.fit(X_train, y_train)
   
    # Log model parameters
   mlflow.log_param("model", "Logistic Regression One-vs-Rest")
   
    # Model evaluation
   y_pred_ovr = model_ovr.predict(X_test)
   accuracy_ovr = accuracy_score(y_test, y_pred_ovr)

   precision_ovr = precision_score(y_test, y_pred_ovr, average='micro')
   recall_ovr = recall_score(y_test, y_pred_ovr, average='micro')
   f1_ovr = f1_score(y_test, y_pred_ovr, average='micro')
   
      # Log model metrics
   mlflow.log_metric("accuracy", accuracy_ovr)
   mlflow.log_metric("precision", precision_ovr)
   mlflow.log_metric("recall", recall_ovr)
   

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


🏃 View run indecisive-bird-120 at: https://dagshub.com/manikantmnnit/mini_project_mlops.mlflow/#/experiments/0/runs/44c8f5b6693f4713be2569ef57951cf2
🧪 View experiment at: https://dagshub.com/manikantmnnit/mini_project_mlops.mlflow/#/experiments/0
