In [64]:
import mlflow

# Connectivity smoke test: point the client at the remote tracking server and
# record one throwaway run.
tracking_uri = 'http://ec2-18-141-141-49.ap-southeast-1.compute.amazonaws.com:5000'
mlflow.set_tracking_uri(tracking_uri)

with mlflow.start_run():
    # Dummy values; they only show that params and metrics reach the server.
    mlflow.log_param('param1', 15)
    mlflow.log_metric('metric1', 0.89)

🏃 View run hilarious-goose-939 at: http://ec2-18-141-141-49.ap-southeast-1.compute.amazonaws.com:5000/#/experiments/758704658829778702/runs/412a693b54b24407b3b2907a8bd70d8f
🧪 View experiment at: http://ec2-18-141-141-49.ap-southeast-1.compute.amazonaws.com:5000/#/experiments/758704658829778702


In [65]:
# Create the baseline model

import numpy as np
import pandas as pd

In [66]:
# Read the Reddit comment dataset directly from GitHub, then preview it.
data_url = 'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'
df = pd.read_csv(data_url)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [67]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [68]:
# Remove exact duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [69]:
# Drop comments that are empty or whitespace-only. The explicit .copy() makes
# the filtered result an independent frame, so the later column assignment
# (df['clean_comment'] = ...) does not raise SettingWithCopyWarning.
df = df[~(df['clean_comment'].str.strip() == '')].copy()

In [70]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [71]:
# Ensure necessary NLTK data is downloaded: the English stopword list and the
# WordNet corpus, both of which the comment preprocessing function relies on.
# The second call is the cell's final expression, so its return value is what
# the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/viet2005/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/viet2005/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [73]:
# Define the preprocessing function

# Words NLTK may list as stopwords but which carry negation/contrast and are
# therefore kept for sentiment analysis.
_SENTIMENT_WORDS_TO_KEEP = frozenset({'not', 'but', 'however', 'no', 'yet'})

# Filled on first use so the stopword corpus is read, and the lemmatizer built,
# once in total instead of once per comment.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = (
            set(stopwords.words('english')) - _SENTIMENT_WORDS_TO_KEEP
        )
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_comment(comment):
    """Normalise one raw comment for bag-of-words vectorisation.

    Steps, in order: lowercase, strip surrounding whitespace, turn newlines
    into spaces, delete every character that is not a letter, digit,
    whitespace or one of ``! ? . ,``, drop English stopwords (except the
    sentiment-bearing ones listed in ``_SENTIMENT_WORDS_TO_KEEP``), and
    lemmatize each remaining token.

    Parameters
    ----------
    comment : str
        The raw comment text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    stop_words, lemmatizer = _preprocess_resources()

    # Convert to lowercase and remove trailing and leading whitespaces
    comment = comment.lower().strip()

    # Remove newline characters
    comment = re.sub(r'\n', ' ', comment)

    # Remove non-alphanumeric characters, except punctuation
    comment = re.sub(r'[^A-Za-z0-9\s!?.,]', '', comment)

    # Remove stopwords but retain important ones for sentiment analysis.
    # Punctuation stays attached to its token, so e.g. 'not,' is never
    # treated as a stopword.
    tokens = [word for word in comment.split() if word not in stop_words]

    # Lemmatize the words
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

In [74]:
# Run every comment through the preprocessing function and overwrite the
# column with the cleaned text.
df['clean_comment'] = [preprocess_comment(text) for text in df['clean_comment']]

In [55]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [56]:
# Bag-of-words vectorizer with a capped vocabulary size.
max_vocabulary_size = 10000
vectorizer = CountVectorizer(max_features=max_vocabulary_size)

In [57]:
# Learn the vocabulary and count term occurrences per comment, then expand the
# result into a dense array for the model.
# NOTE(review): the dense array can be large (rows x max_features); confirm it
# fits in memory on the target machine.
bow_matrix = vectorizer.fit_transform(df['clean_comment'])
X = bow_matrix.toarray()

# Target labels.
y = df['category']

In [58]:
# Sanity check: (number of comments, vocabulary size) of the feature matrix.
X.shape

(36793, 10000)

In [59]:
# Send the runs logged below to the remote MLflow tracking server.
mlflow.set_tracking_uri('http://ec2-18-141-141-49.ap-southeast-1.compute.amazonaws.com:5000')

In [60]:
# Make "RF Baseline" the active experiment so the runs below are grouped under
# it. The returned Experiment object is the cell's displayed output.
mlflow.set_experiment("RF Baseline")

<Experiment: artifact_location='s3://mlflow-bucket-vietnq/758704658829778702', creation_time=1754150121726, experiment_id='758704658829778702', last_update_time=1754150121726, lifecycle_stage='active', name='RF Baseline', tags={}>

In [61]:
from mlflow.models import infer_signature

# One seed for both the split and the forest, so the metrics logged to MLflow
# can be reproduced by re-running the cell.
RANDOM_STATE = 42

# Stratified 80/20 split keeps the class proportions equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

with mlflow.start_run() as run:
    # Run metadata
    mlflow.set_tag('mlflow.runName', 'RandomForestBaselineTrainTestSplit')
    mlflow.set_tag('experiment_type', 'baseline')
    mlflow.set_tag('model_type', 'RandomForestClassifier')
    mlflow.set_tag('description', 'Baseline RandomForest model for sentiment analysis using Bag of Words (BoW) with a simple train-test split')

    # Vectorizer settings
    mlflow.log_param('vectorizer_type', 'CountVectorizer')
    mlflow.log_param('vectorizer_max_features', vectorizer.max_features)

    # Model hyperparameters
    n_estimators = 200
    max_depth = 15

    mlflow.log_param('n_estimators', n_estimators)
    mlflow.log_param('max_depth', max_depth)
    mlflow.log_param('random_state', RANDOM_STATE)

    # Seeding the forest makes bootstrap sampling and feature selection
    # deterministic; without it every run logs different metrics.
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=RANDOM_STATE,
    )
    model.fit(X_train, y_train)

    # Held-out evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric('accuracy', accuracy)

    # Log the per-class and averaged precision/recall/f1/support values.
    # The top-level 'accuracy' entry is a plain float, not a dict, so it is
    # skipped here (it was already logged above).
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    for label, metrics in classification_rep.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f'{label}_{metric}', value)

    # Confusion matrix heatmap, saved locally and attached to the run
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')

    plt.savefig('confusion_matrix.png')
    mlflow.log_artifact('./confusion_matrix.png')

    # Log the fitted model with an input example and signature. y_pred already
    # holds model.predict(X_test), so it is reused instead of predicting again.
    input_example = X_test[:1]
    signature = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(model, artifact_path='random_forest_model', input_example=input_example, signature=signature)

    # Attach the preprocessed dataset used for this run
    df.to_csv('dataset.csv', index=False)
    mlflow.log_artifact('./dataset.csv')

print(f'Accuracy: {accuracy}')

🏃 View run RandomForestBaselineTrainTestSplit at: http://ec2-18-141-141-49.ap-southeast-1.compute.amazonaws.com:5000/#/experiments/758704658829778702/runs/e588a639794c48b894e54aab37d66821
🧪 View experiment at: http://ec2-18-141-141-49.ap-southeast-1.compute.amazonaws.com:5000/#/experiments/758704658829778702


KeyboardInterrupt: 

In [44]:
# Plain-text precision/recall/f1 summary for the held-out test split.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

          -1       1.00      0.01      0.01      1650
           0       0.68      0.80      0.73      2555
           1       0.62      0.85      0.72      3154

    accuracy                           0.64      7359
   macro avg       0.76      0.55      0.49      7359
weighted avg       0.72      0.64      0.56      7359

