Steps to Improve the Model:
Data Preparation:

Use your manually mapped datasets to create a labeled dataset for training the model.
Feature Engineering:

Instead of using raw text similarity (TF-IDF and cosine similarity), create features that can better represent the relationship between source and target categories. This could include embedding-based features, custom text similarity features, etc.
Model Training:

Train a supervised learning model using the labeled data. Models such as Random Forests, Gradient Boosting Machines, or even deep learning models like LSTMs or Transformers could be useful, depending on the size and complexity of your data.
Model Evaluation:

Evaluate the trained model's performance using a validation set. Use metrics like accuracy, F1-score, or other relevant metrics to assess how well the model is performing.
Prediction and Mapping:

Use the trained model to predict the mappings for any new data.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Load the manually mapped datasets
design_within_reach_mapped = pd.read_csv('Design Within Reach.csv')
discount_school_supply_mapped = pd.read_csv('Discount_School_Supply.csv')

# Combine the datasets for training
combined_data = pd.concat([design_within_reach_mapped, discount_school_supply_mapped])

# Preprocessing function to clean text data
def preprocess_text(text):
    if isinstance(text, str):
        # Lowercase the text
        text = text.lower()
        # Remove special characters
        text = ''.join(e for e in text if e.isalnum() or e.isspace())
        return text
    else:
        return ''

# Apply preprocessing
combined_data['src_pt'] = combined_data['src_pt'].apply(preprocess_text)
combined_data['src_cat'] = combined_data['src_cat'].apply(preprocess_text)
combined_data['src_sc'] = combined_data['src_sc'].apply(preprocess_text)

# Combine the text features
combined_data['combined'] = combined_data['src_pt'] + ' ' + combined_data['src_cat'] + ' ' + combined_data['src_sc']

# Encode target labels (productType, category, subCategory)
le_pt = LabelEncoder()
le_cat = LabelEncoder()
le_sc = LabelEncoder()

combined_data['ent_pt_2_encoded'] = le_pt.fit_transform(combined_data['ent_pt_2'])
combined_data['ent_cat_2_encoded'] = le_cat.fit_transform(combined_data['ent_cat_2'])
combined_data['ent_sc_2_encoded'] = le_sc.fit_transform(combined_data['ent_sc_2'])


In [3]:
# Split the data into training and testing sets
X_train, X_test, y_train_pt, y_test_pt, y_train_cat, y_test_cat, y_train_sc, y_test_sc = train_test_split(
    combined_data['combined'], 
    combined_data['ent_pt_2_encoded'], 
    combined_data['ent_cat_2_encoded'], 
    combined_data['ent_sc_2_encoded'], 
    test_size=0.2, 
    random_state=42
)

# Create a pipeline for text processing and classification
pipeline_pt = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

pipeline_cat = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

pipeline_sc = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the models
pipeline_pt.fit(X_train, y_train_pt)
pipeline_cat.fit(X_train, y_train_cat)
pipeline_sc.fit(X_train, y_train_sc)


In [4]:
import numpy as np

# Predict and evaluate productType
y_pred_pt = pipeline_pt.predict(X_test)

# Find the unique classes in y_test_pt
unique_classes_pt = np.unique(y_test_pt)

# Ensure that the unique classes are within the range of the label encoder's classes
valid_classes_pt = unique_classes_pt[unique_classes_pt < len(le_pt.classes_)]

# Obtain target names for valid classes only
valid_target_names_pt = le_pt.inverse_transform(valid_classes_pt)

# Filter out any non-string types (e.g., NaN, floats)
valid_target_names_pt = [name for name in valid_target_names_pt if isinstance(name, str)]

print("Product Type Classification Report")
print(classification_report(y_test_pt, y_pred_pt, labels=valid_classes_pt, target_names=valid_target_names_pt))
print("Accuracy:", accuracy_score(y_test_pt, y_pred_pt))

# Predict and evaluate category
y_pred_cat = pipeline_cat.predict(X_test)
unique_classes_cat = np.unique(y_test_cat)
valid_classes_cat = unique_classes_cat[unique_classes_cat < len(le_cat.classes_)]
valid_target_names_cat = le_cat.inverse_transform(valid_classes_cat)
valid_target_names_cat = [name for name in valid_target_names_cat if isinstance(name, str)]

print("\nCategory Classification Report")
print(classification_report(y_test_cat, y_pred_cat, labels=valid_classes_cat, target_names=valid_target_names_cat))
print("Accuracy:", accuracy_score(y_test_cat, y_pred_cat))

# Predict and evaluate subCategory
y_pred_sc = pipeline_sc.predict(X_test)
unique_classes_sc = np.unique(y_test_sc)
valid_classes_sc = unique_classes_sc[unique_classes_sc < len(le_sc.classes_)]
valid_target_names_sc = le_sc.inverse_transform(valid_classes_sc)
valid_target_names_sc = [name for name in valid_target_names_sc if isinstance(name, str)]

print("\nSubCategory Classification Report")
print(classification_report(y_test_sc, y_pred_sc, labels=valid_classes_sc, target_names=valid_target_names_sc))
print("Accuracy:", accuracy_score(y_test_sc, y_pred_sc))


Product Type Classification Report
                          precision    recall  f1-score   support

             Accessories       0.00      0.00      0.00         2
               Baby Gear       0.88      0.50      0.64        14
           Bath and Body       1.00      0.20      0.33         5
                   Books       0.73      0.61      0.67        18
Crafts & School Supplies       0.84      0.79      0.81       131
             Electronics       0.67      0.20      0.31        10
                    Home       0.79      0.82      0.80       196
         Office Supplies       0.73      0.46      0.56        24
     Sports & Recreation       0.00      0.00      0.00         1
                    Toys       0.84      0.92      0.88       429

                accuracy                           0.83       832
               macro avg       0.59      0.41      0.45       832
            weighted avg       0.82      0.83      0.82       832

Accuracy: 0.8269230769230769

Category

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Predict the new categories for new data using the trained models
def predict_new_data(new_data, pipeline_pt, pipeline_cat, pipeline_sc):
    # Preprocess new data similarly
    new_data['src_pt'] = new_data['src_pt'].apply(preprocess_text)
    new_data['src_cat'] = new_data['src_cat'].apply(preprocess_text)
    new_data['src_sc'] = new_data['src_sc'].apply(preprocess_text)
    
    # Combine text columns into a single feature
    new_data['combined'] = new_data['src_pt'] + ' ' + new_data['src_cat'] + ' ' + new_data['src_sc']
    
    # Predict the categories
    new_data['ent_pt_2_pred'] = le_pt.inverse_transform(pipeline_pt.predict(new_data['combined']))
    new_data['ent_cat_2_pred'] = le_cat.inverse_transform(pipeline_cat.predict(new_data['combined']))
    new_data['ent_sc_2_pred'] = le_sc.inverse_transform(pipeline_sc.predict(new_data['combined']))
    
    return new_data

# Example usage with a new dataset
new_data = pd.read_csv('Halloween Costumes.csv')
predicted_data = predict_new_data(new_data, pipeline_pt, pipeline_cat, pipeline_sc)

# Save the predicted results
predicted_data.to_csv('Halloween Costumes_mapped.csv', index=False)
