# Predicting Product Condition: New vs Used

----------------------------------------------------
by Natalia López Gallego

This notebook performs trains a model to predict whether a product is new or used based on various seller and location features.

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from xgboost import XGBClassifier


In [1]:
# Load dataset
df = pd.read_csv('df_cleaned.csv')
df.head()

NameError: name 'pd' is not defined

In [5]:
# Overview of the data
df.info()
df.describe(include='object')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   condition                100000 non-null  int64  
 1   status                   100000 non-null  object 
 2   price                    100000 non-null  float64
 3   buying_mode              100000 non-null  object 
 4   available_quantity       100000 non-null  int64  
 5   sold_quantity            100000 non-null  int64  
 6   accepts_mercadopago      100000 non-null  bool   
 7   automatic_relist         100000 non-null  bool   
 8   shipping_mode            100000 non-null  object 
 9   address_state_name       100000 non-null  object 
 10  tags                     75090 non-null   object 
 11  sub_status               986 non-null     object 
 12  title_length             100000 non-null  int64  
 13  address_city_name_freq   100000 non-null  int64  
 14  year_

Unnamed: 0,status,buying_mode,shipping_mode,address_state_name,tags,sub_status
count,100000,100000,100000,100000,75090,986
unique,4,3,4,24,7,3
top,active,buy_it_now,me2,Capital Federal,dragged_bids_and_visits,suspended
freq,95675,97001,51255,57857,72551,966


In [9]:
# 1. Splitting the data into training and testing sets
X = df.drop(columns=['condition']) 
y = df['condition']  # Replace with your target column

X_encoded = pd.get_dummies(X, drop_first=True)  # Convert all string columns

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [None]:

# 2. Train multiple classification models

models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(probability=True), 
    'XGBoost': XGBClassifier()
}

results = {}

for model_name, model in models.items():
  
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1] 
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1_score = classification_report(y_test, y_pred, output_dict=True)['weighted avg']['f1-score']
    roc_auc = roc_auc_score(y_test, y_pred_prob) if len(y_pred_prob.shape) == 1 else 'N/A'  
    # Results
    results[model_name] = {
        'Accuracy': accuracy,
        'F1-Score': f1_score,
        'ROC-AUC': roc_auc
    }

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# 3. Printing the results
results_df = pd.DataFrame(results).T
print(results_df)

In [None]:
# 4. Saving the  DataFrames
# Saving the training and testing data, and predictions
# Crear un DataFrame de los resultados

X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)