In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Read the two existing training CSVs from the working directory
design_within_reach_mapped = pd.read_csv('Design Within Reach.csv')
discount_school_supply_mapped = pd.read_csv('Discount_School_Supply.csv')

# Placeholder for an additional manually mapped retailer file (currently disabled)
# new_retailer_data = pd.read_csv('new_retailer_data_mapped.csv')

# Stack the loaded frames row-wise; ignore_index=True discards the per-file
# row labels and assigns a fresh 0..n-1 index to the result
training_frames = [design_within_reach_mapped, discount_school_supply_mapped]
combined_data = pd.concat(training_frames, ignore_index=True)

# Preprocess the combined data
def preprocess_text(text):
    """Normalize a raw text value.

    Strings are lower-cased, then every character that is neither
    alphanumeric nor whitespace is deleted (removed characters are not
    replaced by a space, so "a-b" becomes "ab").

    Args:
        text: The value to normalize. May be any type.

    Returns:
        The cleaned string, or "" when ``text`` is not a ``str``
        (for example ``None`` or a float NaN).
    """
    # Guard clause: anything that is not a string collapses to empty text.
    if not isinstance(text, str):
        return ""

    # Lower-case first, then keep only letters, digits and whitespace.
    kept_chars = [ch for ch in text.lower() if ch.isalnum() or ch.isspace()]
    return ''.join(kept_chars)

# Clean each source text column in place with preprocess_text
# (non-string cells become empty strings).
for _src_col in ('src_pt', 'src_cat', 'src_sc'):
    combined_data[_src_col] = combined_data[_src_col].apply(preprocess_text)

# One LabelEncoder per target column, kept under separate names so each can
# later map its integer codes back to the original labels.
le_pt, le_cat, le_sc = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Fit every encoder on its target column and store the integer codes in a
# sibling "<target>_encoded" column.
for _encoder, _target_col in (
    (le_pt, 'ent_pt_2'),
    (le_cat, 'ent_cat_2'),
    (le_sc, 'ent_sc_2'),
):
    combined_data[f'{_target_col}_encoded'] = _encoder.fit_transform(combined_data[_target_col])

In [3]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
feature_columns = ['src_pt', 'src_cat', 'src_sc']
X_train, X_test, y_train_pt, y_test_pt = train_test_split(
    combined_data[feature_columns],
    combined_data['ent_pt_2_encoded'],
    test_size=0.3,
    random_state=42,
)

# TF-IDF features over unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Text vectorization followed by a gradient-boosted classifier
pipeline_pt = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

# Search space for the classifier step: 2 x 2 x 2 = 8 candidate settings
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__learning_rate=[0.01, 0.1],
    classifier__max_depth=[3, 5],
)

# 5-fold cross-validated grid search scored on accuracy, using all CPU cores.
# Only the `src_pt` text column is fed to the model; `src_cat` and `src_sc`
# are carried through the split but not used for fitting here.
grid_search = GridSearchCV(
    estimator=pipeline_pt,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train['src_pt'], y_train_pt)



In [4]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# Predict on the held-out split with the best pipeline found by the grid search
best_pipeline_pt = grid_search.best_estimator_
y_pred_pt = best_pipeline_pt.predict(X_test['src_pt'])

# Encoded classes that actually occur in the test split (np.unique returns them sorted)
unique_test_classes = np.unique(y_test_pt)

# Decode every test class back to a display name, exactly one name per class.
# The names must stay index-aligned with `unique_test_classes`, because
# classification_report pairs `labels` and `target_names` positionally.
# Filtering out non-string labels (e.g. NaN) would shorten the list and
# mislabel or drop report rows, so such labels are converted with str() instead.
target_names = [str(label) for label in le_pt.inverse_transform(unique_test_classes)]

print("Best parameters found:", grid_search.best_params_)
print("Product Type Classification Report")

# zero_division=0 reports 0.0 (instead of warning) for classes that were never predicted
print(classification_report(y_test_pt, y_pred_pt, labels=unique_test_classes, target_names=target_names, zero_division=0))
print("Accuracy:", accuracy_score(y_test_pt, y_pred_pt))

# Persist the fitted pipeline together with its label encoder so predictions
# can later be decoded back to product-type names
joblib.dump(best_pipeline_pt, 'ent_pt_2_gradient_boosting_model.pkl')
joblib.dump(le_pt, 'ent_pt_2_label_encoder.pkl')

print("Model retrained with Gradient Boosting and saved.")


Best parameters found: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
Product Type Classification Report
                          precision    recall  f1-score   support

             Accessories       0.00      0.00      0.00         2
               Baby Gear       0.00      0.00      0.00        23
           Bath and Body       0.00      0.00      0.00         6
                   Books       0.00      0.00      0.00        33
Crafts & School Supplies       0.61      0.71      0.66       193
             Electronics       0.36      0.33      0.35        12
                    Home       0.72      0.69      0.70       278
         Office Supplies       0.00      0.00      0.00        40
     Sports & Recreation       0.00      0.00      0.00         4
                    Toys       0.77      0.87      0.82       655

                accuracy                           0.72      1248
               macro avg       0.22      0.24      0

