In [23]:
import functools
import re

import joblib
import matplotlib.dates as mdates
import mlflow
import numpy as np
import pandas as pd
from mlflow.tracking import MlflowClient
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Two-row sample frame with a single "comments" column
df = pd.DataFrame(
    {
        "comments": [
            "video great",
            "absolutely hate video, not recommending anyone.",
        ]
    }
)
df

Unnamed: 0,comments
0,video great
1,"absolutely hate video, not recommending anyone."


In [13]:
# Show the underlying array of the "comments" column
df["comments"].values

array(['video great', 'absolutely hate video, not recommending anyone.'],
      dtype=object)

In [14]:
# Words removed from the stop word list so they stay in the comment text;
# they are kept because they matter for sentiment analysis.
_SENTIMENT_KEEP_WORDS = frozenset({"not", "but", "however", "no", "yet"})


@functools.lru_cache(maxsize=None)
def _sentiment_stop_words():
    """Return the English stop words minus the sentiment-bearing ones (built once, then cached)."""
    return frozenset(stopwords.words("english")) - _SENTIMENT_KEEP_WORDS


@functools.lru_cache(maxsize=None)
def _shared_lemmatizer():
    """Return one WordNetLemmatizer instance that is reused across calls."""
    return WordNetLemmatizer()


# Define the preprocessing function
def preprocess_comment(comment):
    """Apply preprocessing transformations to a comment.

    Steps, in order: lowercase, strip leading/trailing whitespace, replace
    newlines with spaces, drop every character that is not a letter, digit,
    whitespace or one of ``! ? . ,``, remove English stop words (except
    "not", "but", "however", "no", "yet"), and lemmatize each remaining
    whitespace-separated token.

    The stop word set and the lemmatizer are built on first use and cached,
    so calling this function once per comment does not rebuild them each time.

    Args:
        comment: The raw comment text (expected to be a ``str``).

    Returns:
        The preprocessed comment. If any step raises, the error is printed and
        the comment is returned as it stood after the last successful step
        (for a non-string input that means the input is returned unchanged).
    """
    try:
        # Convert to lowercase
        comment = comment.lower()

        # Remove trailing and leading whitespaces
        comment = comment.strip()

        # Remove newline characters
        comment = re.sub(r"\n", " ", comment)

        # Remove non-alphanumeric characters, except punctuation
        comment = re.sub(r"[^A-Za-z0-9\s!?.,]", "", comment)

        # Remove stopwords but retain important ones for sentiment analysis
        stop_words = _sentiment_stop_words()
        comment = " ".join([word for word in comment.split() if word not in stop_words])

        # Lemmatize the words
        lemmatizer = _shared_lemmatizer()
        comment = " ".join([lemmatizer.lemmatize(word) for word in comment.split()])

        return comment
    except Exception as e:
        # Best-effort: report the problem and hand back whatever was produced so far.
        print(f"Error in preprocessing comment: {e}")
        return comment

In [15]:
def load_model_and_vectorizer(
    model_name, model_version, vectorizer_path, tracking_uri="http://127.0.0.1:5000/"
):
    """Load a model from the MLflow model registry and a vectorizer from disk.

    Args:
        model_name: Name of the registered model in MLflow.
        model_version: Version of the registered model to load.
        vectorizer_path: Path to the joblib-serialized vectorizer file.
        tracking_uri: URI of the MLflow tracking server to use. Defaults to the
            local server at ``http://127.0.0.1:5000/``; pass another URI to
            target a different server.

    Returns:
        A ``(model, vectorizer)`` tuple: the model as returned by
        ``mlflow.pyfunc.load_model`` for ``models:/<model_name>/<model_version>``
        and the object returned by ``joblib.load(vectorizer_path)``.
    """
    # Point MLflow at the requested tracking server before resolving the model URI.
    mlflow.set_tracking_uri(tracking_uri)

    model_uri = f"models:/{model_name}/{model_version}"
    model = mlflow.pyfunc.load_model(model_uri)

    # NOTE: joblib.load unpickles the file, so only use it on trusted artifacts.
    vectorizer = joblib.load(vectorizer_path)  # Load the vectorizer
    return model, vectorizer

In [18]:
# Fetch the registered model (version "3") and the TF-IDF vectorizer file
model, vectorizer = load_model_and_vectorizer(
    model_name="yt_chrome_plugin_model",
    model_version="3",
    vectorizer_path="../tfidf_vectorizer.joblib",
)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [20]:
# Load the model and vectorizer directly from local joblib files.
# NOTE: this rebinds `model` and `vectorizer`, replacing any objects
# already bound to those names in the kernel.
model = joblib.load("../lgbm_model.joblib")
vectorizer = joblib.load("../tfidf_vectorizer.joblib")

In [21]:
# Raw comments to score
comments = [
    "This video is great",
    "I absolutely hate this video, not recommending to anyone.",
    "I really like your teaching style",
]

# Run every comment through the preprocessing function before vectorizing
preprocessed_comments = list(map(preprocess_comment, comments))

print("pre-processed comments")
print(preprocessed_comments)

# Vectorize the cleaned comments
transformed_comments = vectorizer.transform(preprocessed_comments)

print("after transformation...")
print(transformed_comments)

# Predict, convert the result to a plain list, and render each label as a string
predictions = list(map(str, model.predict(transformed_comments).tolist()))

pre-processed comments
['video great', 'absolutely hate video, not recommending anyone.', 'really like teaching style']
after transformation...
  (np.int32(0), np.int32(3700))	0.6708645876252822
  (np.int32(0), np.int32(9514))	0.7415798709986402
  (np.int32(1), np.int32(143))	0.42820272554945277
  (np.int32(1), np.int32(519))	0.3419813766993476
  (np.int32(1), np.int32(3874))	0.3618025797504715
  (np.int32(1), np.int32(6046))	0.20177053784985297
  (np.int32(1), np.int32(9514))	0.3728537569242583
  (np.int32(1), np.int32(9518))	0.6237457680112866
  (np.int32(2), np.int32(4989))	0.22133055676994662
  (np.int32(2), np.int32(7369))	0.29661277205758135
  (np.int32(2), np.int32(7377))	0.5739215495933175
  (np.int32(2), np.int32(8624))	0.4933543454252502
  (np.int32(2), np.int32(8848))	0.5387478007886909


In [22]:
# Display the predicted labels (already converted to strings), one per input comment
predictions

['1', '-1', '1']