In [176]:
import os

import mlflow
import numpy as numpy
import pandas as pd

In [177]:
# The tracking server address is an EC2 public DNS name. The saved traceback
# further down this notebook shows a different host than the one below, so the
# address evidently changes between sessions. Let MLFLOW_TRACKING_URI override
# the fallback so the notebook does not need editing each time.
DEFAULT_TRACKING_URI = 'http://ec2-51-20-64-139.eu-north-1.compute.amazonaws.com:5000'
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI', DEFAULT_TRACKING_URI))

In [178]:
# Source of the raw Reddit comments with their sentiment category.
DATA_URL = 'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [179]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [180]:
# Drop rows containing any missing value, modifying `df` in place.
# (The info() output above shows 37149 non-null comments out of 37249 rows.)
df.dropna(inplace=True)

In [181]:
# Remove fully duplicated rows in place; with the default arguments every
# column is compared and the first occurrence is kept.
df.drop_duplicates(inplace=True)

In [182]:
# Keep only rows whose comment is non-empty after trimming whitespace.
# The explicit .copy() makes the result an independent frame, so assigning to
# its columns later does not raise pandas' SettingWithCopyWarning.
df = df[df['clean_comment'].str.strip() != ''].copy()

In [183]:
# Row/column count remaining after the cleaning cells above.
df.shape

(36793, 2)

In [184]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the corpora used by the preprocessing step (stopword list and the
# WordNet data behind the lemmatizer).
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Appsquadz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Appsquadz\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [185]:
def preprocess_df(df):
    """Clean the ``clean_comment`` text column and return a new DataFrame.

    Each comment goes through these steps, in order:

    1. lower-case and strip surrounding whitespace;
    2. replace newline characters with a space;
    3. delete every character that is not a letter, digit, whitespace or one
       of ``? . , !``;
    4. drop NLTK English stopwords, except a few negation/contrast words
       that are deliberately kept;
    5. lemmatize each remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``clean_comment`` column of strings. Non-string values
        (e.g. NaN) are not handled and will raise ``AttributeError``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``clean_comment`` replaced by the cleaned text.
        The input frame is left unmodified.
    """
    # Work on a copy: assigning into the caller's frame mutates it behind the
    # caller's back and raises SettingWithCopyWarning when it is a slice.
    df = df.copy()

    disallowed_chars = re.compile(r'[^a-zA-Z0-9\s?.,!]')

    # Words excluded from stopword removal so they stay in the text.
    stopword = set(stopwords.words('english')) - {'not', 'but', 'however', 'no', 'yet'}
    lemmatizer = WordNetLemmatizer()

    def _clean(text):
        """Apply all cleaning steps to a single comment string."""
        text = text.lower().strip()
        text = re.sub(r'\n', ' ', text)
        text = disallowed_chars.sub('', text)
        kept_words = (word for word in text.split() if word not in stopword)
        return " ".join(lemmatizer.lemmatize(word) for word in kept_words)

    # One pass over the column instead of one pass per cleaning step.
    df['clean_comment'] = df['clean_comment'].apply(_clean)

    return df

In [186]:
# Apply the text-cleaning steps from preprocess_df to the filtered comments.
processed_df = preprocess_df(df)

In [187]:
# Spot-check five random rows; no random_state is passed, so the rows shown
# differ on every run.
processed_df.sample(5)

Unnamed: 0,clean_comment,category
26223,believe done,0
13289,not matter ruling political party minister alw...,0
13798,someone preparing competitive exam creating lo...,1
22509,also think interesting even know faye male female,1
29876,streanh everyone,0


In [188]:
from sklearn.feature_extraction.text import CountVectorizer

In [189]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [190]:
# Stratified 80/20 split so each sentiment class keeps its share in both sets.
comments = processed_df['clean_comment']
sentiment = processed_df['category']
X_train, X_test, y_train, y_test = train_test_split(
    comments,
    sentiment,
    test_size=0.2,
    random_state=42,
    stratify=sentiment,
)

# Bag-of-words features: the vocabulary (capped at 5000 terms) is learned from
# the training comments only and then reused for the test comments.
vectorizer = CountVectorizer(max_features=5000)
train_counts = vectorizer.fit_transform(X_train)
test_counts = vectorizer.transform(X_test)

# Convert the sparse count matrices to dense arrays.
X_train_trf = train_counts.toarray()
X_test_trf = test_counts.toarray()

In [191]:
mlflow.set_experiment("Exp 1 - RF baseline")
with mlflow.start_run() as run:
    # Tags describing the run.
    mlflow.set_tag("mlflow.runName", "rf_baseline")
    mlflow.set_tag("experiment_type", "Baseline")
    mlflow.set_tag("model_type", "Random Forest Classifier")
    mlflow.set_tag("description", "Baseline model with BOW and simple train test split")

    # Vectorizer settings.
    mlflow.log_param("vectorizer type", "CountVectorizer")
    mlflow.log_param("vectorizer_max_features", vectorizer.max_features)

    # Model hyper-parameters. The seed is fixed and logged so that the run
    # can be reproduced exactly.
    n_estimators = 200
    max_depth = 15
    random_state = 42
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("random_state", random_state)

    # Initialize and train the model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X_train_trf, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_trf)

    # Overall accuracy
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # Per-class and averaged metrics from the classification report. The
    # top-level "accuracy" entry is a plain float, not a dict, so it is
    # skipped here (it is already logged above).
    class_report = classification_report(y_test, y_pred, output_dict=True)
    for label, metrics in class_report.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)

    # Confusion matrix. scikit-learn expects (y_true, y_pred): rows are the
    # actual classes and columns the predicted ones, matching the axis labels.
    class_labels = list(model.classes_)
    confusion_met = confusion_matrix(y_test, y_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        confusion_met,
        annot=True,
        fmt="d",  # cells are integer counts
        cmap="viridis",
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_xlabel("predicted")
    ax.set_ylabel("actual")
    ax.set_title("confusion matrix")

    fig.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")

    # Log the Random Forest model
    mlflow.sklearn.log_model(model, "random_forest_model")

    # Optionally log the dataset itself (if it's small enough)
    processed_df.to_csv("dataset.csv", index=False)
    mlflow.log_artifact("dataset.csv")

    # Display the full classification report
    print(f"classification report: {class_report}")

MlflowException: API request to http://ec2-13-61-2-37.eu-north-1.compute.amazonaws.com:5000/api/2.0/mlflow/experiments/get-by-name failed with timeout exception HTTPConnectionPool(host='ec2-13-61-2-37.eu-north-1.compute.amazonaws.com', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=Exp+1+-+RF+baseline (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000013F24A846B0>, 'Connection to ec2-13-61-2-37.eu-north-1.compute.amazonaws.com timed out. (connect timeout=120)')). To increase the timeout, set the environment variable MLFLOW_HTTP_REQUEST_TIMEOUT (default: 120, type: int) to a larger value.

In [158]:
# NOTE(review): this cell's execution count (158) is lower than the training
# cell's (191), and that training cell failed before computing a report. The
# output below therefore comes from an earlier kernel state; re-run this cell
# after a successful training run to refresh it.
print(class_report)

{'-1': {'precision': 0.9743589743589743, 'recall': 0.02303030303030303, 'f1-score': 0.044997039668442866, 'support': 1650.0}, '0': {'precision': 0.6504796163069544, 'recall': 0.8493150684931506, 'f1-score': 0.7367170259718214, 'support': 2555.0}, '1': {'precision': 0.6440763052208835, 'recall': 0.813570069752695, 'f1-score': 0.7189688988512188, 'support': 3154.0}, 'accuracy': 0.648729446935725, 'macro avg': {'precision': 0.7563049652956041, 'recall': 0.5619718137587162, 'f1-score': 0.5002276548304944, 'support': 7359.0}, 'weighted avg': {'precision': 0.7203539059686428, 'recall': 0.648729446935725, 'f1-score': 0.5740161739078242, 'support': 7359.0}}
