## Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import mlflow
import onnx
import pickle
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType
from skl2onnx.common.data_types import FloatTensorType

## Dataset loading

To keep the run time short, we only load the first 2000 rows of the dataset.

In [2]:
# Read only the first 2000 rows of the CSV instead of the whole file.
df = pd.read_csv(
    filepath_or_buffer="data/IMDB Dataset.csv",
    nrows=2000,
)

## Exploratory Data Analysis

In [3]:
# Preview the first rows to check the column layout (review text + sentiment label).
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Print column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     2000 non-null   object
 1   sentiment  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [5]:
# Summary of the object columns: count, number of unique values, most frequent value and its frequency.
df.describe()

Unnamed: 0,review,sentiment
count,2000,2000
unique,2000,2
top,One of the other reviewers has mentioned that ...,positive
freq,1,1005


In [6]:
# Count the rows per sentiment label to check how balanced the classes are.
df["sentiment"].value_counts()

sentiment
positive    1005
negative     995
Name: count, dtype: int64

# Task 1: Preprocess the data

In [7]:
# Built once for the whole cell: none of these depend on the review being
# processed, so rebuilding them on every call only adds per-row overhead.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order: lowercase the text, replace the dataset's ``<br />``
    line-break markers with spaces, delete ASCII punctuation, split on
    whitespace, drop NLTK English stop words, and lemmatize each remaining
    token with WordNet.

    Args:
        text: Raw review text.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    # The reviews contain literal "<br />" markers. Stripping punctuation alone
    # would leave a junk "br" token and glue the neighbouring words together.
    text = text.lower().replace('<br />', ' ')
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = [word for word in text.split() if word not in _STOP_WORDS]
    return ' '.join(_LEMMATIZER.lemmatize(word) for word in tokens)


df['processed_review'] = df['review'].apply(preprocess_text)

# Task 2: Train 5 ML models

In [8]:
# Hold out 20% of the cleaned reviews for evaluation; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Turn the string sentiment labels into integer classes, fitting on the
# training labels and reusing the same mapping for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Token-count features: the vocabulary is learned from the training reviews
# only and then applied unchanged to the test reviews.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Candidate classifiers, keyed by the name under which each one is tracked.
# The estimators with internal randomness get a fixed seed (same value as the
# train/test split) so a fresh Restart & Run All reproduces the same scores.
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Task 3: Track model performance, versions, and parameters using MLflow

In [9]:
# Group every run below under one MLflow experiment.
mlflow.set_experiment("Review polarity dectection")

for model_name, model in models.items():
    # One run per candidate model, named after the model.
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train_vectorized, y_train_encoded)
        y_pred = model.predict(X_test_vectorized)
        accuracy = accuracy_score(y_test_encoded, y_pred)

        # Inputs of the run: which estimator was used and its hyperparameters.
        mlflow.log_param("model_name", model_name)
        mlflow.log_params(model.get_params())

        # Accuracy is a result, not an input: log it as a metric so runs can
        # be compared and sorted numerically.
        mlflow.log_metric("accuracy", accuracy)

        # Log the model
        mlflow.sklearn.log_model(model, model_name)

2023/11/30 21:56:44 INFO mlflow.tracking.fluent: Experiment with name 'Review polarity dectection' does not exist. Creating a new experiment.




# Task 4: Save the best model in ONNX format and its preprocessing transformations in pickle format

In [10]:
# Resolve the run to load from the tracking store instead of pinning a run ID:
# a hard-coded ID only exists in the store where it was created and never
# points at the run produced by re-executing the training cell.
best_model_name = "Logistic Regression"
tracked_runs = mlflow.search_runs(experiment_names=["Review polarity dectection"])
if tracked_runs.empty or "tags.mlflow.runName" not in tracked_runs.columns:
    raise RuntimeError(
        "No MLflow runs found for experiment 'Review polarity dectection'; "
        "run the training cell first."
    )

candidate_runs = tracked_runs[
    (tracked_runs["tags.mlflow.runName"] == best_model_name)
    & (tracked_runs["status"] == "FINISHED")
]
if candidate_runs.empty:
    raise RuntimeError(
        f"No finished MLflow run named {best_model_name!r}; run the training cell first."
    )

# Several runs can share the name after repeated executions: take the newest.
best_run_id = candidate_runs.sort_values("start_time", ascending=False).iloc[0]["run_id"]
# The model was logged under an artifact path equal to its name.
best_model = mlflow.sklearn.load_model(f"runs:/{best_run_id}/{best_model_name}")

# Write the fitted CountVectorizer and LabelEncoder to disk with pickle.
for pickle_path, fitted_transformer in (
    ("outputs/vectorizer.pkl", vectorizer),
    ("outputs/label_encoder.pkl", label_encoder),
):
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(fitted_transformer, pickle_file)

# NOTE(review): the triple-quoted block below is an inert string literal, not
# executed code. It appears to be an earlier ONNX conversion attempt that
# declared the input as FloatTensorType([None]). Because it is the last
# expression of the cell, the string itself is echoed as the cell output.
"""
# Convert the best model to ONNX format
initial_type = [('float_input', FloatTensorType([None]))] 
onx = convert_sklearn(best_model, initial_types=initial_type)
with open("outputs/best_model.onnx", "wb") as f:
    f.write(onx.SerializeToString())
"""

'\n# Convert the best model to ONNX format\ninitial_type = [(\'float_input\', FloatTensorType([None]))] \nonx = convert_sklearn(best_model, initial_types=initial_type)\nwith open("outputs/best_model.onnx", "wb") as f:\n    f.write(onx.SerializeToString())\n'

In [11]:
# Declare the ONNX input as a float tensor with the first dimension left as
# None and the second equal to the column count of the vectorized training matrix.
feature_count = X_train_vectorized.shape[1]
initial_type = [('float_input', FloatTensorType([None, feature_count]))]

# Convert the best model to ONNX and write its serialized bytes to disk.
onx = convert_sklearn(best_model, initial_types=initial_type)
with open("outputs/best_model.onnx", "wb") as onnx_file:
    onnx_file.write(onx.SerializeToString())