# Predicting Product Condition: New vs Used

----------------------------------------------------
by Natalia López Gallego

This notebook trains and compares several classification models to predict whether a product is new or used, based on various seller and location features.

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [4]:
# Read the cleaned listings table from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer='df_cleaned.csv')
df.head(n=5)

Unnamed: 0,condition,status,price,buying_mode,available_quantity,sold_quantity,accepts_mercadopago,automatic_relist,shipping_mode,address_state_name,title_length,address_city_name_freq,year_created,month_created,days_since_created,listing_duration,days_since_last_updated
0,1,active,80.0,buy_it_now,1,0,True,False,not_specified,Capital Federal,60,202,2015,9,3534,60,3534
1,0,active,2650.0,buy_it_now,1,0,True,False,me2,Capital Federal,57,3420,2015,9,3513,60,3513
2,0,active,60.0,buy_it_now,1,0,True,False,me2,Capital Federal,43,652,2015,9,3529,60,3529
3,1,active,580.0,buy_it_now,1,0,True,False,me2,Capital Federal,52,824,2015,9,3511,66,3504
4,0,active,30.0,buy_it_now,1,0,True,False,not_specified,Buenos Aires,25,230,2015,8,3546,60,3542


In [5]:
# Column dtypes and non-null counts first, then summary statistics
# (count / unique / top / freq) for the string-typed columns
df.info()
df.describe(include=['object'])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   condition                100000 non-null  int64  
 1   status                   100000 non-null  object 
 2   price                    100000 non-null  float64
 3   buying_mode              100000 non-null  object 
 4   available_quantity       100000 non-null  int64  
 5   sold_quantity            100000 non-null  int64  
 6   accepts_mercadopago      100000 non-null  bool   
 7   automatic_relist         100000 non-null  bool   
 8   shipping_mode            100000 non-null  object 
 9   address_state_name       100000 non-null  object 
 10  title_length             100000 non-null  int64  
 11  address_city_name_freq   100000 non-null  int64  
 12  year_created             100000 non-null  int64  
 13  month_created            100000 non-null  int64  
 14  days_

Unnamed: 0,status,buying_mode,shipping_mode,address_state_name
count,100000,100000,100000,100000
unique,4,3,4,24
top,active,buy_it_now,me2,Capital Federal
freq,95675,97001,51255,57857


#  Data preparation

In [8]:
# 1. Splitting the data into training and testing sets

target_column = 'condition'
# Split features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# Boolean flags (accepts_mercadopago, automatic_relist) match neither the
# numeric nor the categorical dtype filter below, and a ColumnTransformer drops
# every column that is not listed in one of its transformers
# (remainder='drop' is the default). Cast them to 0/1 integers so they are
# kept as numeric features instead of silently disappearing from the models.
bool_columns = X.select_dtypes(include=['bool']).columns.tolist()
X = X.astype({column: 'int64' for column in bool_columns})

# Stratified train-test split to preserve class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Column lists consumed by the preprocessing ColumnTransformer.
# 'number' covers every integer/float width, not only int64/float64.
numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Fail loudly if any feature would not be routed to a transformer
unassigned_features = (
    set(X_train.columns) - set(numeric_features) - set(categorical_features)
)
assert not unassigned_features, (
    f"Columns not assigned to any transformer: {sorted(unassigned_features)}"
)

We use `stratify=y` because our target is imbalanced, as seen previously in our EDA; stratifying keeps the same class proportions in the training and test sets.

 ## Preprocessing Pipelines

In [9]:
# Numeric columns: fill missing values with the column mean, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode; levels unseen during fit are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own sub-pipeline
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

## Define Models

In [10]:
# Seed for the stochastic estimators so the results table is reproducible on
# Restart & Run All (same value as the train/test split's random_state).
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in the results table.
models = {
    # Default lbfgs solver is deterministic; max_iter raised from the default of 100
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Bootstrap sampling and feature sub-sampling are random
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    # probability=True enables predict_proba; its internal cross-validation
    # shuffles the data, so it needs a seed too
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=RANDOM_STATE,
    ),
}


## Model training and evaluation

In [None]:
# Metrics per model, keyed by model name: {'Accuracy', 'F1-Score', 'ROC-AUC'}
results = {}

# The target does not change between models, so decide once whether
# ROC-AUC (computed here for the binary case only) applies
is_binary = len(y.unique()) == 2

for model_name, model in models.items():
    # Preprocessing and estimator are fitted together, on the training split only
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])
    pipeline.fit(X_train, y_train)

    # Hard predictions on the held-out split feed accuracy and the F1 report
    y_pred = pipeline.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)

    if is_binary:
        # Second predict_proba column = probability of the second class label
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_proba)
    else:
        roc_auc = 'N/A'

    results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1-Score': report['weighted avg']['f1-score'],
        'ROC-AUC': roc_auc,
    }


## View results

In [None]:
# One row per model, one column per metric
results_df = pd.DataFrame(data=results).transpose()
results_df