In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.metrics import classification_report

In [4]:
df = pd.read_csv('train_with_quality_label.csv')

X_text = df["catalog_content"]
X_price = df[["price"]]
y = df["quality_label"]

X_text_train, X_text_val, X_price_train, X_price_val, y_train, y_val = train_test_split(
    X_text,
    X_price,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [6]:
tfidf = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    stop_words="english"
)

X_text_train_vec = tfidf.fit_transform(X_text_train)
X_text_val_vec = tfidf.transform(X_text_val)

In [8]:
scaler = StandardScaler()

X_price_train_scaled = scaler.fit_transform(X_price_train)
X_price_val_scaled = scaler.transform(X_price_val)

X_train = hstack([X_text_train_vec, X_price_train_scaled])
X_val = hstack([X_text_val_vec, X_price_val_scaled])

In [12]:
model = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    class_weight="balanced"
)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.87      0.91      8531
           1       0.64      0.75      0.69      3123
           2       0.84      0.90      0.87      3346

    accuracy                           0.85     15000
   macro avg       0.81      0.84      0.82     15000
weighted avg       0.87      0.85      0.86     15000



In [15]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=-1,
    num_leaves=64,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

lgb_model.fit(X_train, y_train)

y_pred_lgb = model.predict(X_val)
print(classification_report(y_val, y_pred_lgb))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.561114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1201451
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 28346
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
              precision    recall  f1-score   support

           0       0.96      0.87      0.91      8531
           1       0.64      0.75      0.69      3123
           2       0.84      0.90      0.87      3346

    accuracy                           0.85     15000
   macro avg       0.81      0.84      0.82     15000
weighted avg       0.87      0.85      0.86     15000



Image Embeddings.

In [16]:
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
import requests
from io import BytesIO


In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = resnet50(pretrained=True)
model.fc = torch.nn.Identity()   # remove classifier
model = model.to(device)
model.eval()




Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to C:\Users\mayan/.cache\torch\hub\checkpoints\resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:26<00:00, 3.84MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [18]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])


In [19]:
def get_image_embedding(url):
    try:
        response = requests.get(url, timeout=5)
        img = Image.open(BytesIO(response.content)).convert("RGB")
        img = transform(img).unsqueeze(0).to(device)

        with torch.no_grad():
            emb = model(img).cpu().numpy().flatten()
        return emb
    except:
        return None


In [27]:
sample_df = df.sample(10000, random_state=42)

image_embeddings = []
valid_idx = []

for idx, url in zip(sample_df.index, sample_df["image_link"]):
    emb = get_image_embedding(url)
    if emb is not None:
        image_embeddings.append(emb)
        valid_idx.append(idx)


In [28]:
img_df = df.loc[valid_idx].reset_index(drop=True)
img_emb = np.vstack(image_embeddings)

text_vec = tfidf.transform(img_df["catalog_content"])
price_scaled = scaler.transform(img_df[["price"]])

X_final = hstack([
    text_vec,
    price_scaled,
    img_emb
])

y_final = img_df["quality_label"]



In [29]:
X_train, X_val, y_train, y_val = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    stratify=y_final,
    random_state=42
)

model_final = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    n_jobs=-1
)

model_final.fit(X_train, y_train)

y_pred = model_final.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.79      0.82      1138
           1       0.42      0.50      0.45       424
           2       0.63      0.64      0.63       438

    accuracy                           0.69      2000
   macro avg       0.63      0.64      0.64      2000
weighted avg       0.71      0.69      0.70      2000

