In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cleaned-ecommerce-data/2019-Dec-Cleaned.csv
/kaggle/input/cleaned-ecommerce-data/2019-Nov-Cleaned.csv


Combine Dataset and Preview

In [6]:
# Combine all CSV files in the input directory
file_paths = []
for dirname, _, filenames in os.walk('/kaggle/input/'):
    for filename in filenames:
        if filename.endswith('.csv'):
            file_paths.append(os.path.join(dirname, filename))

# Load and combine all CSV files into a single DataFrame
combined_df = pd.concat([pd.read_csv(file) for file in file_paths], ignore_index=True)

# Display the first few rows of the combined data
print("Combined Data Preview:")
print(combined_df.head())


Combined Data Preview:
                event_time event_type  product_id          category_id  \
0  2019-12-01 00:00:00 UTC       view     1005105  2232732093077520756   
1  2019-12-01 00:00:01 UTC       view     2402273  2232732100769874463   
2  2019-12-01 00:00:02 UTC       view    20100164  2232732110089618156   
3  2019-12-01 00:00:02 UTC       view   100008256  2053013561185141473   
4  2019-12-01 00:00:03 UTC       view     1005239  2232732093077520756   

                  category_code   brand    price    user_id  \
0      construction.tools.light   apple  1302.48  556695836   
1  appliances.personal.massager   bosch   313.52  539453785   
2              apparel.trousers    nika   101.68  517987650   
3          accessories.umbrella    ikea   163.56  542860793   
4      construction.tools.light  xiaomi   256.38  525740700   

                           user_session  
0  ca5eefc5-11f9-450c-91ed-380285a0bc80  
1  5ee185a7-0689-4a33-923d-ba0130929a76  
2  906c6ca8-ff5c-419a-bde9-

BRANDS AND COUNT

In [4]:
# List all unique brands
unique_brands = df_combined['brand'].unique()

# Display the unique brands
print("Unique Brands:")
for brand in unique_brands:
    print(brand)

Unique Brands:
xiaomi
janome
lg
hp
apple
samsung
huawei
acer
asus
nika
acme
tefal
oasis
bosch
tissot
goodloot
lenovo
dewalt
denzel
mamadoma
canon
oppo
ams
fisherman
palit
indesit
avatar
laston
magnetta
epson
beko
pioneer
dwt
garmin
arg
brw
pablosky
stanley
casio
jordan
fubag
armani
alphard
element
gorenje
komfort-s
artel
domini
bradex
defender
steelseries
oneplus
fly
stels
disney
matrix
irbis
garvalin
infinity
nexpero
hertz
midea
asrock
llorens
arnica
hotpoint-ariston
biomecanics
barneo
cersanit
masstone
vivo
phantom
nokia
belecoo
smile
meizu
aqua
sv
toshiba
kenwood
gigabyte
phoenix
graphite
gezatone
ariston
sony
rooman
inoi
elenberg
suunto
huter
whirlpool
dion
honor
respect
karya
orient
haier
stendmebel
tp-link
atlant
burgerschuhe
ryobi
dell
etor
cube
nakamichi
amd
gt
jinga
anytek
nike
citizen
zeta
fassen
tcl
vitek
yamaha
elari
changhong
powercolor
redmond
milavitsa
texet
rieker
shivaki
cortland
intel
catunltd
turbo
zte
obsessive
spur
harper
scarlett
asics
neoline
salamander
resanta
r

In [5]:
# Count the number of unique brands
brand_count = len(unique_brands)

# Display the count and the unique brands
print(f"Number of Unique Brands: {brand_count}")

Number of Unique Brands: 3515


Feature Engineering

In [7]:
# Feature 1: Total Spent (assuming 'price' exists)
combined_df['total_spent'] = combined_df['price']

# Feature 2: Extract Date-Time Features
combined_df['event_time'] = pd.to_datetime(combined_df['event_time'])  # Ensure 'event_time' is datetime
combined_df['hour'] = combined_df['event_time'].dt.hour
combined_df['day'] = combined_df['event_time'].dt.day
combined_df['month'] = combined_df['event_time'].dt.month

# Feature 3: Aggregate User Data
user_agg = combined_df.groupby('user_id').agg(
    avg_spent_per_user=('total_spent', 'mean'),
    num_sessions=('user_session', 'nunique')  # Example: count unique sessions per user
).reset_index()

# Save the processed features for later use (optional)
user_agg.to_csv('/kaggle/working/processed_data.csv', index=False)

XG BOOST 

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
combined_df['brand_encoded'] = le.fit_transform(combined_df['brand'])


In [14]:
from sklearn.model_selection import train_test_split

# Features and target
features = ['price', 'hour', 'day', 'month', 'avg_spent_per_user', 'num_sessions']
target = 'brand_encoded'

X = df_merged[features]
y = df_merged[target]

# Train-test split without stratify
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [17]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train XGBoost model
num_classes = combined_df['brand_encoded'].nunique()  # Calculate from the entire dataset

# Identify missing classes in y_train
all_classes = set(range(num_classes))
train_classes = set(y_train.unique())
missing_classes = list(all_classes - train_classes)

# Add dummy rows for missing classes
for cls in missing_classes:
    X_train = pd.concat([X_train, X_train.iloc[:1]], ignore_index=True)  # Duplicate one existing row
    y_train = pd.concat([y_train, pd.Series([cls])], ignore_index=True)

In [1]:
# Initialize and train the model
model = XGBClassifier(objective="multi:softmax", num_class=num_classes)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=LabelEncoder().inverse_transform(range(num_classes))))

NameError: name 'XGBClassifier' is not defined