In [25]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import os
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
import warnings
warnings.filterwarnings('ignore')

In [35]:
df = pd.read_csv('cleaned_amazon_reviews.csv')
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['Cleaned_Review'])
y = df['Sentiment']

In [36]:
X

<20000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 550555 stored elements in Compressed Sparse Row format>

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 , shuffle=True)


LogisticRegression

In [13]:
learning_rate = 0.002
epochs = 3

In [14]:
mlflow.set_experiment("amazon_reviews")

# Start an MLflow run
with mlflow.start_run():

    mlflow.set_tag("model_name", "LogisticRegression")

    # Log hyperparameters
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("epochs", epochs)
    

    # Initialize and train the model
    model = LogisticRegression( solver='liblinear', max_iter=100, warm_start=True)

    for epoch in range(epochs):
        model.fit(X_train, y_train)

        # Predict and calculate accuracy
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Log the accuracy as a metric
        mlflow.log_metric("accuracy", accuracy, step=epoch)
        print(f"Epoch {epoch+1}/{epochs} - Accuracy: {accuracy:.4f}")

    mlflow.sklearn.log_model(model, "logistic_regression_model")

    # Save the model as an artifact
    model_filename = "logistic_regression_model.pkl"
    mlflow.sklearn.save_model(model, model_filename)

    # Log the saved model as an artifact
    mlflow.log_artifact(model_filename)


2024/10/20 23:30:15 INFO mlflow.tracking.fluent: Experiment with name 'amazon_reviews' does not exist. Creating a new experiment.


Epoch 1/3 - Accuracy: 0.8423
Epoch 2/3 - Accuracy: 0.8423
Epoch 3/3 - Accuracy: 0.8423




In [15]:
learning_rate = 0.001
mlflow.set_experiment("amazon_reviews")

# Start an MLflow run
with mlflow.start_run():

    mlflow.set_tag("model_name", "LogisticRegression")

    # Log hyperparameters
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("epochs", epochs)
    

    # Initialize and train the model
    model = LogisticRegression( solver='liblinear', max_iter=100, warm_start=True)

    for epoch in range(epochs):
        model.fit(X_train, y_train)

        # Predict and calculate accuracy
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Log the accuracy as a metric
        mlflow.log_metric("accuracy", accuracy, step=epoch)
        print(f"Epoch {epoch+1}/{epochs} - Accuracy: {accuracy:.4f}")

    mlflow.sklearn.log_model(model, "logistic_regression_model")

    # Save the model as an artifact
    model_filename = "logistic_regression_model0.1.pkl"
    mlflow.sklearn.save_model(model, model_filename)

    # Log the saved model as an artifact
    mlflow.log_artifact(model_filename)


Epoch 1/3 - Accuracy: 0.8423
Epoch 2/3 - Accuracy: 0.8423
Epoch 3/3 - Accuracy: 0.8423




SVC

In [7]:
#SVC
C = 1.0
kernel = 'linear'

In [16]:
mlflow.set_experiment("amazon_reviews")

with mlflow.start_run():

    mlflow.set_tag("model_name", "SVC")
    # Log hyperparameters
    mlflow.log_param("C", C)
    mlflow.log_param("kernel", kernel)
    mlflow.log_param("epochs", epochs)

    # Initialize and train the SVM model
    model = SVC(C=C, kernel=kernel, max_iter=100, probability=True)

    for epoch in range(epochs):
        model.fit(X_train, y_train)

        # Predict and calculate accuracy
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Log the accuracy as a metric
        mlflow.log_metric("accuracy", accuracy, step=epoch)
        print(f"Epoch {epoch+1}/{epochs} - Accuracy: {accuracy:.4f}")
    
    
    mlflow.sklearn.log_model(model, "SVC")
    # Save the model as an artifact
    model_filename = "svm_model.pkl"
    mlflow.sklearn.save_model(model, model_filename)

    # Log the saved model as an artifact
    mlflow.log_artifact(model_filename)


Epoch 1/3 - Accuracy: 0.6428
Epoch 2/3 - Accuracy: 0.6428
Epoch 3/3 - Accuracy: 0.6428




RandomForestClassifierSVC


In [18]:
n_estimators = 200  
max_depth = 10 

In [19]:
mlflow.set_experiment("amazon_reviews")

with mlflow.start_run():

    mlflow.set_tag("model_name", "RandomForestClassifier")
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("epochs", epochs)


    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)

    for epoch in range(epochs):
        model.fit(X_train, y_train)

        
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

    
        mlflow.log_metric("accuracy", accuracy, step=epoch)
        print(f"Epoch {epoch+1}/{epochs} - Accuracy: {accuracy:.4f}")

    mlflow.sklearn.log_model(model, "RandomForestClassifier")
    model_filename = "random_forest_modell.pkl"
    mlflow.sklearn.save_model(model, model_filename)


    mlflow.log_artifact(model_filename)


Epoch 1/3 - Accuracy: 0.7957
Epoch 2/3 - Accuracy: 0.8000
Epoch 3/3 - Accuracy: 0.8000




Decision Tree

In [20]:
max_depth = 10  
min_samples_split = 4 

In [21]:
mlflow.set_experiment("amazon_reviews")

with mlflow.start_run():

    mlflow.set_tag("model_name", "DecisionTreeClassifier")
    # Log hyperparameters
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("min_samples_split", min_samples_split)
    mlflow.log_param("epochs", epochs)

    # Initialize and train the DecisionTree model
    model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split)

    for epoch in range(epochs):
        model.fit(X_train, y_train)

        # Predict and calculate accuracy
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Log the accuracy as a metric
        mlflow.log_metric("accuracy", accuracy, step=epoch)
        print(f"Epoch {epoch+1}/{epochs} - Accuracy: {accuracy:.4f}")
    
    mlflow.sklearn.log_model(model, "DecisionTreeClassifier")
    # Save the model as an artifact
    model_filename = "decision_tree_modell.pkl"
    mlflow.sklearn.save_model(model, model_filename)

    # Log the saved model as an artifact
    mlflow.log_artifact(model_filename)


Epoch 1/3 - Accuracy: 0.6777
Epoch 2/3 - Accuracy: 0.6757
Epoch 3/3 - Accuracy: 0.6783




In [22]:
learning_rate = 0.001
n_estimators = 100
epochs = 3

In [26]:
mlflow.set_experiment("amazon_reviews")
with mlflow.start_run(run_name="AdaBoost"):
    mlflow.set_tag("model_name", "AdaBoost")

    # Log hyperparameters
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("epochs", epochs)

    # Initialize and train the AdaBoost model
    model = AdaBoostClassifier(learning_rate=learning_rate, n_estimators=n_estimators)

    for epoch in range(epochs):
        model.fit(X_train, y_train)

        # Predict and calculate accuracy
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Log the accuracy as a metric
        mlflow.log_metric("accuracy", accuracy, step=epoch)
        print(f"Epoch {epoch+1}/{epochs} - AdaBoost Accuracy: {accuracy:.4f}")

    mlflow.sklearn.log_model(model, "ada_boost_model")

    # Save the model as an artifact
    model_filename = "ada_boost_model.pkl"
    mlflow.sklearn.save_model(model, model_filename)

    # Log the saved model as an artifact
    mlflow.log_artifact(model_filename)

Epoch 1/3 - AdaBoost Accuracy: 0.5730
Epoch 2/3 - AdaBoost Accuracy: 0.5730
Epoch 3/3 - AdaBoost Accuracy: 0.5730




In [27]:
mlflow.set_experiment("amazon_reviews")
with mlflow.start_run(run_name="XGBoost"):
    mlflow.set_tag("model_name", "XGBoost")

    # Log hyperparameters
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("epochs", epochs)

    # Initialize and train the XGBoost model
    model = XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators, use_label_encoder=False, eval_metric='logloss')

    for epoch in range(epochs):
        model.fit(X_train, y_train)

        # Predict and calculate accuracy
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Log the accuracy as a metric
        mlflow.log_metric("accuracy", accuracy, step=epoch)
        print(f"Epoch {epoch+1}/{epochs} - XGBoost Accuracy: {accuracy:.4f}")

    mlflow.sklearn.log_model(model, "xg_boost_model")

    # Save the model as an artifact
    model_filename = "xg_boost_model.pkl"
    mlflow.sklearn.save_model(model, model_filename)

    # Log the saved model as an artifact
    mlflow.log_artifact(model_filename)

Epoch 1/3 - XGBoost Accuracy: 0.6552
Epoch 2/3 - XGBoost Accuracy: 0.6552
Epoch 3/3 - XGBoost Accuracy: 0.6552


