In [2]:
# Step 1: Mount Google Drive and Load Data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

In [4]:
# Load the dataset
train = pd.read_csv("/content/drive/My Drive/Colab Notebooks/train.csv")
test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/test.csv")

In [5]:
# Rename columns
train.columns = ["label", "title", "text"]
test.columns = ["label", "title", "text"]

In [6]:
# Estimate memory usage
print("Train dataset memory usage:", train.memory_usage(deep=True).sum() / (1024**2), "MB")
print("Test dataset memory usage:", test.memory_usage(deep=True).sum() / (1024**2), "MB")

Train dataset memory usage: 1894.5975503921509 MB
Test dataset memory usage: 210.42834758758545 MB


In [7]:
from sklearn.utils import resample

In [8]:
# # Downsample if necessary and Resample to balance the dataset
train_positive = train[train['label'] == 2]
train_negative = train[train['label'] == 1]

train_positive_sampled = resample(train_positive, replace=False, n_samples=5000, random_state=42)
train_negative_sampled = resample(train_negative, replace=False, n_samples=5000, random_state=42)

train_sampled = pd.concat([train_positive_sampled, train_negative_sampled])
train_sampled = train_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
test_sampled = test.sample(n=2000, random_state=42)

In [10]:
# Verify the new memory usage
print("Sampled train dataset memory usage:", train_sampled.memory_usage(deep=True).sum() / (1024**2), "MB")
print("Sampled test dataset memory usage:", test_sampled.memory_usage(deep=True).sum() / (1024**2), "MB")

Sampled train dataset memory usage: 5.259720802307129 MB
Sampled test dataset memory usage: 1.0783166885375977 MB


In [11]:
train_sampled.value_counts('label')

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,5000
2,5000


In [12]:
# Step 2: Data Preprocessing
import re
import string
from sklearn.utils import resample
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.utils import resample

In [13]:
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [15]:
def clean_text(text):
    text = re.sub(r'[^A-Za-zÀ-ú ]+', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'@\w+', '', text)
    return text

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_text(text):
    tokens = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_tokens = []
    for token, tag in tagged_tokens:
        if token.lower() not in stopwords.words('english'):
            wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
            lemmatized_tokens.append(WordNetLemmatizer().lemmatize(token, pos=wordnet_pos))
    return " ".join(lemmatized_tokens)

In [16]:
# Apply text cleaning, stop words removal, and lemmatization
train_sampled['title'] = train_sampled['title'].astype(str).apply(clean_text).apply(lemmatize_text)
test_sampled['title'] = test_sampled['title'].astype(str).apply(clean_text).apply(lemmatize_text)
train_sampled['text'] = train_sampled['text'].astype(str).apply(clean_text).apply(lemmatize_text)
test_sampled['text'] = test_sampled['text'].astype(str).apply(clean_text).apply(lemmatize_text)


In [17]:
# Combine title and text
train_sampled['combined'] = train_sampled['title'] + " " + train_sampled['text']
test_sampled['combined'] = test_sampled['title'] + " " + test_sampled['text']

In [18]:
# Map labels
train_sampled['label'] = train_sampled['label'].map({1: 0, 2: 1})
test_sampled['label'] = test_sampled['label'].map({1: 0, 2: 1})

In [19]:
# Step 3: Feature Engineering with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_sampled['combined'])
X_test_tfidf = tfidf_vectorizer.transform(test_sampled['combined'])

In [20]:
!pip install mlflow
import mlflow

Collecting mlflow
  Downloading mlflow-2.17.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.17.0 (from mlflow)
  Downloading mlflow_skinny-2.17.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.17.0->mlflow)
  Downloading databricks_sdk-0.34.0-py3-none-any.whl.metadata (37 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow-skinny==2.17.0->mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
C

In [21]:
# Set up MLflow experiment
mlflow.set_experiment("sentiment_analysis_experiment")

2024/10/12 15:00:40 INFO mlflow.tracking.fluent: Experiment with name 'sentiment_analysis_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/359423678830423902', creation_time=1728745240642, experiment_id='359423678830423902', last_update_time=1728745240642, lifecycle_stage='active', name='sentiment_analysis_experiment', tags={}>

In [22]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Function to log and save heatmap of confusion matrix as an artifact
def log_confusion_matrix_heatmap(y_true, y_pred, model_name):
    conf_matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix Heatmap for {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(f"conf_matrix_heatmap_{model_name}.png")
    mlflow.log_artifact(f"conf_matrix_heatmap_{model_name}.png")
    plt.close()

In [25]:
# Step 4: Model Training and Evaluation

# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

with mlflow.start_run(run_name="Logistic Regression") as run:
    logreg_model = LogisticRegression(max_iter=200)
    logreg_model.fit(X_train_tfidf, train_sampled['label'])
    y_pred_logreg = logreg_model.predict(X_test_tfidf)

    mlflow.log_param("logreg_max_iter", 200)
    mlflow.sklearn.log_model(logreg_model, "logreg_model")
    logreg_report = classification_report(test_sampled['label'], y_pred_logreg, output_dict=True)
    mlflow.log_metrics({
        "logreg_accuracy": accuracy_score(test_sampled['label'], y_pred_logreg),
        "logreg_precision": logreg_report['weighted avg']['precision'],
        "logreg_recall": logreg_report['weighted avg']['recall'],
        "logreg_f1-score": logreg_report['weighted avg']['f1-score']
    })

    log_confusion_matrix_heatmap(test_sampled['label'], y_pred_logreg, "Logistic Regression")



In [27]:
# Step 5: Hyperparameter Tuning

# SVM
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

with mlflow.start_run(run_name="SVM Hyperparameter Tuning") as run:
    param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf']
    }
    grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
    grid.fit(X_train_tfidf, train_sampled['label'])

    for i, (param, score) in enumerate(zip(grid.cv_results_['params'], grid.cv_results_['mean_test_score'])):
        param_name = f"params_{i}_C_{param['C']}_kernel_{param['kernel']}"
        mlflow.log_metric(param_name, score)

    mlflow.log_param("svm_best_C", grid.best_params_['C'])
    mlflow.log_param("svm_best_kernel", grid.best_params_['kernel'])
    mlflow.log_metric("svm_best_score", grid.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ...............................C=0.1, kernel=linear; total time=  25.2s
[CV] END ...............................C=0.1, kernel=linear; total time=  25.9s
[CV] END ...............................C=0.1, kernel=linear; total time=  26.5s
[CV] END ...............................C=0.1, kernel=linear; total time=  24.9s
[CV] END ...............................C=0.1, kernel=linear; total time=  28.7s
[CV] END ..................................C=0.1, kernel=rbf; total time=  31.8s
[CV] END ..................................C=0.1, kernel=rbf; total time=  33.4s
[CV] END ..................................C=0.1, kernel=rbf; total time=  32.2s
[CV] END ..................................C=0.1, kernel=rbf; total time=  32.5s
[CV] END ..................................C=0.1, kernel=rbf; total time=  31.6s
[CV] END .................................C=1, kernel=linear; total time=  16.9s
[CV] END .................................C=1, ke

In [28]:
# Step 6: Final Model Training and Evaluation

# SVM
with mlflow.start_run(run_name="SVM Model") as run:
    best_svm_model = SVC(C=grid.best_params_['C'], kernel=grid.best_params_['kernel'])
    best_svm_model.fit(X_train_tfidf, train_sampled['label'])
    y_pred_svm = best_svm_model.predict(X_test_tfidf)

    mlflow.sklearn.log_model(best_svm_model, "svm_model")
    svm_report = classification_report(test_sampled['label'], y_pred_svm, output_dict=True)
    mlflow.log_metrics({
        "svm_accuracy": accuracy_score(test_sampled['label'], y_pred_svm),
        "svm_precision": svm_report['weighted avg']['precision'],
        "svm_recall": svm_report['weighted avg']['recall'],
        "svm_f1-score": svm_report['weighted avg']['f1-score']
    })

    log_confusion_matrix_heatmap(test_sampled['label'], y_pred_svm, "SVM")



In [29]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

with mlflow.start_run(run_name="Random Forest Model") as run:
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train_tfidf, train_sampled['label'])
    y_pred_rf = rf_model.predict(X_test_tfidf)

    mlflow.sklearn.log_model(rf_model, "rf_model")
    rf_report = classification_report(test_sampled['label'], y_pred_rf, output_dict=True)
    mlflow.log_metrics({
        "rf_accuracy": accuracy_score(test_sampled['label'], y_pred_rf),
        "rf_precision": rf_report['weighted avg']['precision'],
        "rf_recall": rf_report['weighted avg']['recall'],
        "rf_f1-score": rf_report['weighted avg']['f1-score']
    })

    log_confusion_matrix_heatmap(test_sampled['label'], y_pred_rf, "Random Forest")



In [30]:
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

with mlflow.start_run(run_name="Gradient Boosting Model") as run:
    gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
    gb_model.fit(X_train_tfidf, train_sampled['label'])
    y_pred_gb = gb_model.predict(X_test_tfidf)

    mlflow.sklearn.log_model(gb_model, "gb_model")
    gb_report = classification_report(test_sampled['label'], y_pred_gb, output_dict=True)
    mlflow.log_metrics({
        "gb_accuracy": accuracy_score(test_sampled['label'], y_pred_gb),
        "gb_precision": gb_report['weighted avg']['precision'],
        "gb_recall": gb_report['weighted avg']['recall'],
        "gb_f1-score": gb_report['weighted avg']['f1-score']
    })

    log_confusion_matrix_heatmap(test_sampled['label'], y_pred_gb, "Gradient Boosting")



In [31]:
# Voting Classifier (Ensemble)
from sklearn.ensemble import VotingClassifier

with mlflow.start_run(run_name="Voting Classifier Model") as run:
    voting_clf = VotingClassifier(estimators=[
        ('lr', logreg_model),
        ('svm', best_svm_model),
        ('rf', rf_model),
        ('gb', gb_model)
    ], voting='hard')
    voting_clf.fit(X_train_tfidf, train_sampled['label'])
    y_pred_voting = voting_clf.predict(X_test_tfidf)

    mlflow.sklearn.log_model(voting_clf, "voting_clf_model")
    voting_report = classification_report(test_sampled['label'], y_pred_voting, output_dict=True)
    mlflow.log_metrics({
        "voting_accuracy": accuracy_score(test_sampled['label'], y_pred_voting),
        "voting_precision": voting_report['weighted avg']['precision'],
        "voting_recall": voting_report['weighted avg']['recall'],
        "voting_f1-score": voting_report['weighted avg']['f1-score']
    })

    log_confusion_matrix_heatmap(test_sampled['label'], y_pred_voting, "Voting Classifier")



In [32]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

with mlflow.start_run(run_name="AdaBoost Model") as run:
    ada_model = AdaBoostClassifier(n_estimators=100, random_state=42)
    ada_model.fit(X_train_tfidf, train_sampled['label'])
    y_pred_ada = ada_model.predict(X_test_tfidf)

    mlflow.sklearn.log_model(ada_model, "ada_model")
    ada_report = classification_report(test_sampled['label'], y_pred_ada, output_dict=True)
    mlflow.log_metrics({
        "ada_accuracy": accuracy_score(test_sampled['label'], y_pred_ada),
        "ada_precision": ada_report['weighted avg']['precision'],
        "ada_recall": ada_report['weighted avg']['recall'],
        "ada_f1-score": ada_report['weighted avg']['f1-score']
    })

    log_confusion_matrix_heatmap(test_sampled['label'], y_pred_ada, "AdaBoost")



In [33]:
# Extra Trees
from sklearn.ensemble import ExtraTreesClassifier

with mlflow.start_run(run_name="Extra Trees Model") as run:
    et_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
    et_model.fit(X_train_tfidf, train_sampled['label'])
    y_pred_et = et_model.predict(X_test_tfidf)

    mlflow.sklearn.log_model(et_model, "et_model")
    et_report = classification_report(test_sampled['label'], y_pred_et, output_dict=True)
    mlflow.log_metrics({
        "et_accuracy": accuracy_score(test_sampled['label'], y_pred_et),
        "et_precision": et_report['weighted avg']['precision'],
        "et_recall": et_report['weighted avg']['recall'],
        "et_f1-score": et_report['weighted avg']['f1-score']
    })

    log_confusion_matrix_heatmap(test_sampled['label'], y_pred_et, "Extra Trees")



In [34]:
# XGBoost
import xgboost as xgb

with mlflow.start_run(run_name="XGBoost Model") as run:
    xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
    xgb_model.fit(X_train_tfidf, train_sampled['label'])
    y_pred_xgb = xgb_model.predict(X_test_tfidf)

    mlflow.sklearn.log_model(xgb_model, "xgb_model")
    xgb_report = classification_report(test_sampled['label'], y_pred_xgb, output_dict=True)
    mlflow.log_metrics({
        "xgb_accuracy": accuracy_score(test_sampled['label'], y_pred_xgb),
        "xgb_precision": xgb_report['weighted avg']['precision'],
        "xgb_recall": xgb_report['weighted avg']['recall'],
        "xgb_f1-score": xgb_report['weighted avg']['f1-score']
    })

    log_confusion_matrix_heatmap(test_sampled['label'], y_pred_xgb, "XGBoost")



In [35]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [36]:
# CatBoost
from catboost import CatBoostClassifier

with mlflow.start_run(run_name="CatBoost Model") as run:
    cat_model = CatBoostClassifier(n_estimators=100, random_state=42, verbose=0)
    cat_model.fit(X_train_tfidf, train_sampled['label'])
    y_pred_cat = cat_model.predict(X_test_tfidf)

    mlflow.sklearn.log_model(cat_model, "cat_model")
    cat_report = classification_report(test_sampled['label'], y_pred_cat, output_dict=True)
    mlflow.log_metrics({
        "cat_accuracy": accuracy_score(test_sampled['label'], y_pred_cat),
        "cat_precision": cat_report['weighted avg']['precision'],
        "cat_recall": cat_report['weighted avg']['recall'],
        "cat_f1-score": cat_report['weighted avg']['f1-score']
    })

    log_confusion_matrix_heatmap(test_sampled['label'], y_pred_cat, "CatBoost")



In [37]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.9.0-py3-none-any.whl.metadata (11 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-24.9.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.9.0 scikit-optimize-0.10.2


In [38]:
# Multinomial Naive Bayes Model Run
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

with mlflow.start_run(run_name="Multinomial Naive Bayes Model") as run:
    nb_model = MultinomialNB()
    nb_model.fit(X_train_tfidf, train_sampled['label'])
    y_pred_nb = nb_model.predict(X_test_tfidf)

    mlflow.sklearn.log_model(nb_model, "nb_model")
    nb_report = classification_report(test_sampled['label'], y_pred_nb, output_dict=True)
    mlflow.log_metrics({
        "nb_accuracy": accuracy_score(test_sampled['label'], y_pred_nb),
        "nb_precision": nb_report['weighted avg']['precision'],
        "nb_recall": nb_report['weighted avg']['recall'],
        "nb_f1-score": nb_report['weighted avg']['f1-score']
    })

    log_confusion_matrix_heatmap(test_sampled['label'], y_pred_nb, "Multinomial Naive Bayes")



In [39]:
# Register the models
mlflow.register_model(f"runs:/{run.info.run_id}/logreg_model", "LogisticRegressionModel")
mlflow.register_model(f"runs:/{run.info.run_id}/svm_model", "SVMModel")
mlflow.register_model(f"runs:/{run.info.run_id}/rf_model", "RandomForestModel")
mlflow.register_model(f"runs:/{run.info.run_id}/gb_model", "GradientBoostingModel")
mlflow.register_model(f"runs:/{run.info.run_id}/voting_clf_model", "VotingClassifierModel")

Successfully registered model 'LogisticRegressionModel'.
Created version '1' of model 'LogisticRegressionModel'.
Successfully registered model 'SVMModel'.
Created version '1' of model 'SVMModel'.
Successfully registered model 'RandomForestModel'.
Created version '1' of model 'RandomForestModel'.
Successfully registered model 'GradientBoostingModel'.
Created version '1' of model 'GradientBoostingModel'.
Successfully registered model 'VotingClassifierModel'.
Created version '1' of model 'VotingClassifierModel'.


<ModelVersion: aliases=[], creation_timestamp=1728747991049, current_stage='None', description=None, last_updated_timestamp=1728747991049, name='VotingClassifierModel', run_id='456130c81342455096ef0f8cfe24042a', run_link=None, source='file:///content/mlruns/359423678830423902/456130c81342455096ef0f8cfe24042a/artifacts/voting_clf_model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [43]:
!pip install pyngrok
from pyngrok import ngrok



In [41]:
# Start MLflow UI
get_ipython().system_raw("mlflow ui --port 5000 &")

In [48]:
# Create a tunnel to the local environment
from pyngrok import ngrok

# Replace 'YOUR_NGROK_AUTH_TOKEN' with your actual ngrok authtoken
ngrok.set_auth_token("")

# Create a tunnel to the local environment
public_url = ngrok.connect(5000)
print(f"MLflow Tracking UI available at: {public_url}")


MLflow Tracking UI available at: NgrokTunnel: "https://68a0-34-73-83-226.ngrok-free.app" -> "http://localhost:5000"
