# Pipeline for a classification model

- The main goal is to predict the most suitable category for a given handicraft product in the database.

In [None]:
import pandas as pd
import os
import dotenv
import pickle
from scipy import sparse
from datetime import date
from wordcloud import WordCloud
from matplotlib import pyplot as plt

from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [None]:
# load the environment variables
dotenv.load_dotenv()

dataset_path = os.getenv("DATASET_PATH")
metrics_path = os.getenv("METRICS_PATH")
model_path = os.getenv("MODEL_PATH")
test_path = os.getenv("TEST_PATH")

# fail fast with a readable message: an unset path would otherwise only surface
# several cells later as an obscure error from pd.read_csv / open
_missing_vars = [
    name
    for name, value in {
        "DATASET_PATH": dataset_path,
        "METRICS_PATH": metrics_path,
        "MODEL_PATH": model_path,
        "TEST_PATH": test_path,
    }.items()
    if not value
]
if _missing_vars:
    raise EnvironmentError(f"missing environment variable(s): {', '.join(_missing_vars)}")

# Portuguese stop-word list from the NLTK corpus; used by the word clouds
# and by the CountVectorizer further down
stop_words = stopwords.words("portuguese")

In [None]:
# load the data: read the CSV located at dataset_path (DATASET_PATH) into a DataFrame
sample = pd.read_csv(dataset_path)

In [None]:
# quick glimpse at the dataset: display its first rows
sample.head()

In [None]:
# print the dtype and non-null count of each column, plus the overall memory usage
sample.info()

- Looking at the correlation between the numerical variables and each category, to get an insight into which features might be relevant;

In [None]:
# one-hot encode the category so each label becomes its own indicator column
category_dummies = pd.get_dummies(sample.category)

# drop indicator columns left over from a previous run of this cell before
# concatenating, so re-running the cell does not duplicate them
sample = pd.concat(
    [sample.drop(columns = category_dummies.columns, errors = "ignore"), category_dummies],
    axis = 1,
)

In [None]:
# correlation of the numeric (and indicator) columns against each category indicator;
# text columns are excluded explicitly because recent pandas versions raise on
# non-numeric data in corr() instead of silently dropping it
numeric_sample = sample.select_dtypes(include = ["number", "bool"])

# keep only the rows of the correlation matrix that belong to the category indicators
numeric_sample.corr().loc[sample.category.dropna().unique()].style.background_gradient(cmap = "inferno")

- The word clouds below help us understand how the textual features, such as the queries and titles, behave for each category, which makes them interesting features to use in the model.

In [None]:
def cloud_feature(feature: str):
    """Plot one word cloud per category from a text column of `sample`.

    For every distinct non-null value of `sample.category`, the non-null
    entries of `feature` belonging to that category are joined into a single
    string, rendered as a word cloud (with `stop_words` removed) and shown as
    its own figure titled with the category name.

    Parameters
    ----------
    feature : str
        Name of the column of `sample` whose text feeds the word clouds.

    Returns
    -------
    None
        The figures are displayed as a side effect.
    """
    for category in sample.category.dropna().unique():
        # boolean mask instead of a query string, so category names containing
        # quotes or other special characters cannot break the expression
        texts = sample.loc[sample.category == category, feature].dropna()

        # astype(str) keeps the join from failing on non-string entries
        words = " ".join(texts.astype(str))

        # nothing to draw for a category without any text; WordCloud.generate
        # raises on input that contains no words
        if not words.strip():
            continue

        cloud = WordCloud(
            stopwords = stop_words,
            background_color = "white",
            colormap = "inferno",
            width = 800,
            height = 800
        ).generate(words)

        plt.figure(facecolor = 'white')
        plt.title(category, fontweight = "bold")
        plt.axis("off")
        plt.imshow(cloud, interpolation = "bilinear")
        plt.tight_layout()
        plt.show()

In [None]:
# word clouds of the `title` column, one figure per category
cloud_feature("title")

In [None]:
# word clouds of the `query` column, one figure per category
cloud_feature("query")

In [None]:
# word clouds of the `concatenated_tags` column, one figure per category
cloud_feature("concatenated_tags")

# Preprocessing

- using holdout to evaluate the model;
- encode the label to have numerical values for each category;
- bag of words to process the `concatenated_tags` and `title` features;

In [None]:
# define the features and label
# .copy() makes X an independent frame, so cleaning it afterwards does not
# operate on a slice of `sample` (which triggers SettingWithCopyWarning)
X = sample[["concatenated_tags", "title"]].copy()
y = sample["category"]

In [None]:
# replace missing text with empty strings and lower-case both columns;
# fillna returns a new frame here instead of mutating a slice of `sample` in place
X = X.fillna("").apply(lambda column: column.str.lower())

In [None]:
# encode the label (dependent variable)
enc = LabelEncoder()

enc.fit(y)

y_enc = enc.transform(y)

In [None]:
# bag of words with a single vocabulary shared by both text features
count_vectorizer = CountVectorizer(stop_words = stop_words)

# learn the vocabulary from the tags AND the titles: fitting on the tags alone
# silently discards every title word that never appears in a tag
count_vectorizer.fit(pd.concat([X.concatenated_tags, X.title], ignore_index = True))

X_tags = count_vectorizer.transform(X.concatenated_tags)

X_title = count_vectorizer.transform(X.title)

# tag counts and title counts side by side (one block of columns per feature)
X_array = sparse.hstack((X_tags, X_title))

In [None]:
# holdout split: 80% of the rows for training, the rest for validation;
# the fixed seed keeps the split (and therefore the reported metrics) reproducible
X_train, X_val, y_train, y_val = train_test_split(X_array, y_enc, train_size = 0.8, random_state = 42)

# Modelling
- since the dependent variable (`category`) is categorical (stored with dtype `object`), logistic regression is a natural choice of classifier;

In [None]:
# the default max_iter (100) is often too low for the solver to converge on
# high-dimensional bag-of-words counts and ends in a ConvergenceWarning
logistic = LogisticRegression(max_iter = 1000)

# fit returns the estimator itself, so `model` is the trained classifier
model = logistic.fit(X_train, y_train)

In [None]:
# predicted (encoded) category for every row of the validation split
y_pred = model.predict(X_val)

# Metrics

In [None]:
# pass every encoded label explicitly: without `labels`, the report raises when a
# category is missing from the validation split, because the number of classes
# found no longer matches the length of target_names
metrics_report = metrics.classification_report(
    y_val,
    y_pred,
    labels = enc.transform(enc.classes_),
    target_names = enc.classes_,
)
print(metrics_report)

In [None]:
# persist the report together with the model description and the run date;
# explicit UTF-8 so non-ASCII characters in the category names are written the
# same way regardless of the platform's default encoding
with open(metrics_path, "w", encoding = "utf-8") as metrics_file:
    metrics_file.write(f"Model: {model}\ndate: {date.today()}\n\n")
    metrics_file.write(f"Metrics:\n{metrics_report}")

In [None]:
# save the model: serialise the fitted classifier to model_path with pickle
with open(model_path, mode = "wb") as model_file:
    pickle.dump(model, model_file)

# Test

- applying the model to the test dataset;

In [None]:
# read the test dataset from the CSV located at test_path (TEST_PATH)
test_sample = pd.read_csv(test_path)

In [None]:
# display the first rows of the test dataset
test_sample.head()

In [None]:
# same features and label as used for training, taken from the test dataset;
# .copy() makes X_test an independent frame, so cleaning it afterwards does not
# operate on a slice of `test_sample` (which triggers SettingWithCopyWarning)
X_test = test_sample[["concatenated_tags", "title"]].copy()
y_test = test_sample["category"]

In [None]:
# replace missing text with empty strings and lower-case both columns;
# fillna returns a new frame here instead of mutating a slice of `test_sample` in place
X_test = X_test.fillna("").apply(lambda column: column.str.lower())

In [None]:
# encode the test labels with the encoder already fitted on `y`
# (the encoder is not refitted, so the integer codes match those the model was trained on)
y_test_enc = enc.transform(y_test)

In [None]:
# vectorize each text feature of the test set with the already fitted vectorizer
# (transform only, no refit), in the same order as the training features
X_test_tags, X_test_title = (
    count_vectorizer.transform(X_test[column])
    for column in ("concatenated_tags", "title")
)

# stack tag counts and title counts side by side, as done for the training matrix
X_test_array = sparse.hstack((X_test_tags, X_test_title))

In [None]:
# predicted (encoded) category for every row of the test dataset
y_test_pred = model.predict(X_test_array)

In [None]:
# compare encoded labels with encoded predictions: `y_test` holds the raw category
# names while the model predicts integer codes, so the two can never match
metrics.accuracy_score(y_test_enc, y_test_pred)

In [None]:
# pass every encoded label explicitly: without `labels`, the report raises when a
# category is missing from the test data, because the number of classes found no
# longer matches the length of target_names
print(
    metrics.classification_report(
        y_test_enc,
        y_test_pred,
        labels = enc.transform(enc.classes_),
        target_names = enc.classes_,
    )
)