# Features evaluation

In [None]:
# Change directory to projects root
# NOTE(review): presumably this has to run before the project-local imports
# below (scripts, text_normalizer, model) so that they resolve — confirm.
# The relative data path used later ('data/products.json') is resolved
# against this working directory.
import os
os.chdir("/home/app/src")

# Imports
from scripts.build_df import build_df
import text_normalizer
import hiclass.metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from model import evaluation
from scripts.tree_utils import make_tree
from sklearn.metrics import classification_report
from scipy.sparse import hstack
import pandas as pd
import time

In [None]:
# Auxiliary function for text normalization
def normalization(input):
    """Run `text_normalizer.normalize_corpus` on `input` with fixed options.

    The flags for HTML stripping, contraction expansion, accented-character
    removal, lower-casing, stemming, special-character removal and stop-word
    removal are passed as True; lemmatization and digit removal are passed
    as False. The stop-word list is `text_normalizer.stopword_list`.

    Returns whatever `normalize_corpus` returns for the given corpus.
    """
    options = dict(
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=text_normalizer.stopword_list,
    )
    return text_normalizer.normalize_corpus(input, **options)

# Auxiliary function for creating new columns
def assign_precision(x):
    """Score `x` with `hiclass.metrics.precision`.

    The first element of `x` (kept as a length-1 slice) is passed as the
    first argument and the second element (also a length-1 slice) as the
    second argument.
    """
    first_part = x[0:1]
    second_part = x[1:2]
    return hiclass.metrics.precision(first_part, second_part)
def assign_recall(x):
    """Score `x` with `hiclass.metrics.recall`.

    The first element of `x` (kept as a length-1 slice) is passed as the
    first argument and the second element (also a length-1 slice) as the
    second argument.
    """
    first_part = x[0:1]
    second_part = x[1:2]
    return hiclass.metrics.recall(first_part, second_part)
def assign_f1(x):
    """Compute F1 from the "precision" and "recall" entries of `x`.

    Returns 0.0 whenever the product of the two entries is zero, which also
    covers the case where both are zero and the formula would divide by zero.
    Otherwise returns 2 * precision * recall / (precision + recall).
    """
    precision = x["precision"]
    recall = x["recall"]

    # Guard clause: avoids ZeroDivisionError when both scores are zero.
    if precision * recall == 0.0:
        return 0.0

    return 2 * precision * recall / (precision + recall)

In [None]:
# Load the products JSON through the project's `build_df` helper.
# NOTE(review): the variable is called `no_threshold_df` but threshold=100 is
# passed — confirm which of the two is intended.
no_threshold_df = build_df(
    json_path="data/products.json",
    threshold=100,
)

In [None]:
# Normalize the 'name' and 'description' columns; every cell is converted
# with str() before being handed to the normalizer.
normalized_names = normalization(no_threshold_df["name"].apply(str))
normalized_descriptions = normalization(no_threshold_df["description"].apply(str))

# Combined text per row: "<normalized name> <normalized description>".
normalized_names_plus_description = (
    pd.Series(normalized_names) + " " + pd.Series(normalized_descriptions)
)

# Target labels come from the 'leaf' column.
y = no_threshold_df["leaf"].copy()

In [None]:
# Build the category tree object that the evaluation helper receives later
# as `tree=`; tree display is switched off.
tree_dict = make_tree(
    no_threshold_df,
    no_threshold_df["category"],
    "root",
    display_tree=False,
)

## Baseline names

In [None]:
# Hold out 20% of the normalized names (with their labels) for testing.
X_train, X_test, y_train, y_test = train_test_split(
    normalized_names,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Unigram TF-IDF over names, capped at 1000 features; the vocabulary is
# learned on the training split only and then applied to the test split.
tfid_names = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1, 1),
)
X_train = tfid_names.fit_transform(X_train)
X_test = tfid_names.transform(X_test)

In [None]:
# Fit a decision tree on the names-only features and time the fit.
tree_classifier = DecisionTreeClassifier(random_state=42)

initialize_timer = time.time()
tree_classifier.fit(X_train, y_train)
# Elapsed wall-clock time of the fit, formatted as HH:MM:SS.
training_time = time.strftime(
    "%H:%M:%S",
    time.gmtime(time.time() - initialize_timer),
)

# Predictions for the held-out split.
test_predict = tree_classifier.predict(X_test)

In [None]:
# Score the names-only predictions with the project's evaluation helper
# (micro average, using the category tree).
baseline_names = evaluation.store_performance_in_df(
    pred_labels=test_predict, true_labels=y_test,
    average="micro", tree=tree_dict,
    index_name="baseline_names_eval",
)

# Keep the formatted fit duration next to the metrics.
baseline_names["training_time"] = training_time

In [None]:
# Start the running results table from a copy of the names-only baseline.
df = baseline_names.copy(deep=True)

## Baseline descriptions

In [None]:
# Hold out 20% of the normalized descriptions (with their labels) for testing.
X_train, X_test, y_train, y_test = train_test_split(
    normalized_descriptions,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Unigram TF-IDF over descriptions, capped at 1000 features; fitted on the
# training split only and then applied to the test split.
tfid_descriptions = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1, 1),
)
X_train = tfid_descriptions.fit_transform(X_train)
X_test = tfid_descriptions.transform(X_test)

In [None]:
# Fit a decision tree on the descriptions-only features and time the fit.
tree_classifier = DecisionTreeClassifier(random_state=42)

initialize_timer = time.time()
tree_classifier.fit(X_train, y_train)
# Elapsed wall-clock time of the fit, formatted as HH:MM:SS.
training_time = time.strftime(
    "%H:%M:%S",
    time.gmtime(time.time() - initialize_timer),
)

# Predictions for the held-out split.
test_predict = tree_classifier.predict(X_test)

In [None]:
# Score the descriptions-only predictions with the project's evaluation
# helper (micro average, using the category tree).
baseline_description = evaluation.store_performance_in_df(
    pred_labels=test_predict, true_labels=y_test,
    average="micro", tree=tree_dict,
    index_name="baseline_description_eval",
)

# Keep the formatted fit duration next to the metrics.
baseline_description["training_time"] = training_time

In [None]:
# Append the descriptions-only baseline to the running results table.
df = pd.concat(
    [df, baseline_description],
)

## Name plus description

In [None]:
# Hold out 20% of the combined "name + description" text (with labels).
X_train, X_test, y_train, y_test = train_test_split(
    normalized_names_plus_description,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a dedicated unigram TF-IDF vectorizer (max 1000 features) on the
# combined "name + description" training text.
# It is bound to its own name so that `tfid_descriptions` (fitted on
# descriptions only) is not overwritten: later cells use `tfid_descriptions`
# for description text and `tfid_names_plus_descriptions` for combined text.
tfid_names_plus_descriptions = TfidfVectorizer(max_features=1000, ngram_range=(1, 1))
X_train = tfid_names_plus_descriptions.fit_transform(X_train)
X_test = tfid_names_plus_descriptions.transform(X_test)

In [None]:
# Fit a decision tree on the combined-text features and time the fit.
tree_classifier = DecisionTreeClassifier(random_state=42)

initialize_timer = time.time()
tree_classifier.fit(X_train, y_train)
# Elapsed wall-clock time of the fit, formatted as HH:MM:SS.
training_time = time.strftime(
    "%H:%M:%S",
    time.gmtime(time.time() - initialize_timer),
)

# Predictions for the held-out split.
test_predict = tree_classifier.predict(X_test)

In [None]:
# Score the combined-text predictions with the project's evaluation helper
# (micro average, using the category tree).
names_plus_descriptions = evaluation.store_performance_in_df(
    pred_labels=test_predict, true_labels=y_test,
    average="micro", tree=tree_dict,
    index_name="names_plus_descriptions",
)

# Keep the formatted fit duration next to the metrics.
names_plus_descriptions["training_time"] = training_time

In [None]:
# Append the "name plus description" results to the running results table.
# (`names_plus_descriptions` is the frame built in the preceding evaluation
# cell; the previously referenced `baseline_description_eval` is not defined
# anywhere in this notebook and raised NameError.)
df = pd.concat([df, names_plus_descriptions])

## Name and descriptions with unique Tf-idf

In [None]:
# Split names, descriptions and labels in one call: all arrays are cut with
# the same shuffled indices, so the parts stay row-aligned.
(
    name_train, name_test,
    description_train, description_test,
    y_train, y_test,
) = train_test_split(
    normalized_names,
    normalized_descriptions,
    y,
    test_size=0.20,
    random_state=42,
)

# Vectorize each text field with its own, already fitted, vectorizer.
name_train, name_test = (
    tfid_names.transform(part) for part in (name_train, name_test)
)
description_train, description_test = (
    tfid_descriptions.transform(part)
    for part in (description_train, description_test)
)

# Feature matrix = name features followed by description features.
X_train = hstack((name_train, description_train))
X_test = hstack((name_test, description_test))

In [None]:
# Fit a decision tree on the stacked name/description features (one
# vectorizer per field) and time the fit.
tree_classifier = DecisionTreeClassifier(random_state=42)

initialize_timer = time.time()
tree_classifier.fit(X_train, y_train)
# Elapsed wall-clock time of the fit, formatted as HH:MM:SS.
training_time = time.strftime(
    "%H:%M:%S",
    time.gmtime(time.time() - initialize_timer),
)

# Predictions for the held-out split.
test_predict = tree_classifier.predict(X_test)

In [None]:
# Score this configuration with the project's evaluation helper (micro
# average, using the category tree) and attach the fit duration.
names_descriptions_different_tfidf = evaluation.store_performance_in_df(
    pred_labels=test_predict, true_labels=y_test,
    average="micro", tree=tree_dict,
    index_name="names_descriptions_different_tfidf",
)
names_descriptions_different_tfidf["training_time"] = training_time

In [None]:
# Append the "separate vectorizers" results to the running results table.
df = pd.concat(
    [df, names_descriptions_different_tfidf],
)

## Name and descriptions with same Tf-idf

In [None]:
# Split names, descriptions and labels in one call: all arrays are cut with
# the same shuffled indices, so the parts stay row-aligned.
(
    name_train, name_test,
    description_train, description_test,
    y_train, y_test,
) = train_test_split(
    normalized_names,
    normalized_descriptions,
    y,
    test_size=0.20,
    random_state=42,
)

# Vectorize both text fields with the names vectorizer.
name_train, name_test, description_train, description_test = (
    tfid_names.transform(part)
    for part in (name_train, name_test, description_train, description_test)
)

# Feature matrix = name features followed by description features.
X_train = hstack((name_train, description_train))
X_test = hstack((name_test, description_test))

In [None]:
# Fit a decision tree on the stacked name/description features (both built
# with the names vectorizer) and time the fit.
tree_classifier = DecisionTreeClassifier(random_state=42)

initialize_timer = time.time()
tree_classifier.fit(X_train, y_train)
# Elapsed wall-clock time of the fit, formatted as HH:MM:SS.
training_time = time.strftime(
    "%H:%M:%S",
    time.gmtime(time.time() - initialize_timer),
)

# Predictions for the held-out split.
test_predict = tree_classifier.predict(X_test)

In [None]:
# Score this configuration with the project's evaluation helper (micro
# average, using the category tree) and attach the fit duration.
names_descriptions_same_tfidf = evaluation.store_performance_in_df(
    pred_labels=test_predict, true_labels=y_test,
    average="micro", tree=tree_dict,
    index_name="names_descriptions_same_tfidf",
)
names_descriptions_same_tfidf["training_time"] = training_time

In [None]:
# Append the "shared vectorizer" results to the running results table.
df = pd.concat(
    [df, names_descriptions_same_tfidf],
)

## Name with two different tfidf for descriptions

In [None]:
# Split names, descriptions and labels in one call: all arrays are cut with
# the same shuffled indices, so the parts stay row-aligned.
(
    name_train, name_test,
    description_train, description_test,
    y_train, y_test,
) = train_test_split(
    normalized_names,
    normalized_descriptions,
    y,
    test_size=0.20,
    random_state=42,
)

# First two feature groups: names and descriptions, both through the
# names vectorizer.
name_train = tfid_names.transform(name_train)
name_test = tfid_names.transform(name_test)
X_train = hstack((name_train, tfid_names.transform(description_train)))
X_test = hstack((name_test, tfid_names.transform(description_test)))

# Third feature group: descriptions through the descriptions vectorizer.
description_train = tfid_descriptions.transform(description_train)
description_test = tfid_descriptions.transform(description_test)

# Re-set train and test with the extra description features appended.
X_train = hstack((X_train, description_train))
X_test = hstack((X_test, description_test))

In [None]:
# Fit a decision tree on names plus the two description feature groups and
# time the fit.
tree_classifier = DecisionTreeClassifier(random_state=42)

initialize_timer = time.time()
tree_classifier.fit(X_train, y_train)
# Elapsed wall-clock time of the fit, formatted as HH:MM:SS.
training_time = time.strftime(
    "%H:%M:%S",
    time.gmtime(time.time() - initialize_timer),
)

# Predictions for the held-out split.
test_predict = tree_classifier.predict(X_test)

In [None]:
# Score this configuration with the project's evaluation helper (micro
# average, using the category tree) and attach the fit duration.
names_two_descriptions = evaluation.store_performance_in_df(
    pred_labels=test_predict, true_labels=y_test,
    average="micro", tree=tree_dict,
    index_name="names_two_descriptions",
)
names_two_descriptions["training_time"] = training_time

In [None]:
# Append the "names + two description views" results to the results table.
df = pd.concat(
    [df, names_two_descriptions],
)

## Name with two different tfidf for descriptions and names plus descriptions

In [None]:
# Split every text view and the labels in one call: all arrays are cut with
# the same shuffled indices, so the parts stay row-aligned.
(
    name_train, name_test,
    description_train, description_test,
    n_p_d_train, n_p_d_test,
    y_train, y_test,
) = train_test_split(
    normalized_names,
    normalized_descriptions,
    normalized_names_plus_description,
    y,
    test_size=0.20,
    random_state=42,
)

# Feature groups 1 and 2: names and descriptions, both through the names
# vectorizer.
name_train = tfid_names.transform(name_train)
name_test = tfid_names.transform(name_test)
X_train = hstack((name_train, tfid_names.transform(description_train)))
X_test = hstack((name_test, tfid_names.transform(description_test)))

# Feature group 3: descriptions through the descriptions vectorizer.
description_train = tfid_descriptions.transform(description_train)
description_test = tfid_descriptions.transform(description_test)
X_train = hstack((X_train, description_train))
X_test = hstack((X_test, description_test))

# Feature group 4: the combined "name + description" text. The vectorizer is
# fitted in this cell (training split only, same settings as the other
# vectorizers) so the cell does not rely on kernel state from other cells.
tfid_names_plus_descriptions = TfidfVectorizer(max_features=1000, ngram_range=(1, 1))
n_p_d_train = tfid_names_plus_descriptions.fit_transform(n_p_d_train)
n_p_d_test = tfid_names_plus_descriptions.transform(n_p_d_test)

# Re-set train and test with the combined-text features appended.
X_train = hstack((X_train, n_p_d_train))
X_test = hstack((X_test, n_p_d_test))

In [None]:
# Fit a decision tree on all four stacked feature groups and time the fit.
tree_classifier = DecisionTreeClassifier(random_state=42)

initialize_timer = time.time()
tree_classifier.fit(X_train, y_train)
# Elapsed wall-clock time of the fit, formatted as HH:MM:SS.
training_time = time.strftime(
    "%H:%M:%S",
    time.gmtime(time.time() - initialize_timer),
)

# Predictions for the held-out split.
test_predict = tree_classifier.predict(X_test)

In [None]:
# Score this configuration with the project's evaluation helper (micro
# average, using the category tree) and attach the fit duration.
all_together = evaluation.store_performance_in_df(
    pred_labels=test_predict, true_labels=y_test,
    average="micro", tree=tree_dict,
    index_name="all_together",
)
all_together["training_time"] = training_time

In [None]:
# Append the "all feature groups" results to the running results table.
df = pd.concat(
    [df, all_together],
)

## Results

In [None]:
# Display the accumulated evaluation results (bare expression = cell output).
df