# Predicting Product Condition: New vs Used

----------------------------------------------------
by Natalia López Gallego

This notebook trains a model to predict whether a product is new or used, based on various seller and location features.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:

# Read the product listings from the CSV file sitting next to the notebook
csv_path = 'products.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows
df.head(5)


In [None]:

# Print the frame's structural summary (info() writes straight to stdout)
df.info()

# Summary statistics restricted to the object-dtype columns; as the last
# expression of the cell this table is what the notebook displays
df.describe(include=['object'])


In [None]:

# Clean warranty: encode it as a binary flag.
#   1 -> the value reads "yes" or "true" (case-insensitive), or is already 1
#   0 -> anything else, including missing values
# Accepting '1' keeps the cell idempotent: on a re-run the column already holds
# 0/1, and without it every flag would silently be reset to 0.
warranty_text = df['warranty'].astype(str).str.lower()
df['warranty'] = warranty_text.isin(['yes', 'true', '1']).astype(int)

# Convert coordinates to numbers; values that cannot be parsed become NaN
for coord_col in ('seller_address_latitude', 'seller_address_longitude'):
    df[coord_col] = pd.to_numeric(df[coord_col], errors='coerce')


In [None]:

# Seller-address columns that are removed from the frame before modelling
drop_cols = [
    'seller_address_comment', 'seller_address_address_line',
    'seller_address_zip_code', 'seller_address_id', 'seller_address_city_id',
    'seller_address_state_id', 'seller_address_country_id',
    'seller_address_search_location_neighborhood_id',
    'seller_address_search_location_city_id',
    'seller_address_search_location_state_id'
]

# Reassign instead of mutating in place, and skip columns that are already
# gone, so re-running this cell does not fail with a KeyError.
df = df.drop(columns=drop_cols, errors='ignore')


In [None]:

# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


In [None]:

# Columns that get one-hot encoded
categorical_cols = [
    'seller_address_city_name',
    'seller_address_state_name',
    'seller_address_country_name',
    'seller_address_search_location_neighborhood_name',
    'seller_address_search_location_city_name',
    'seller_address_search_location_state_name',
    'sub_status',
]

# Swap each listed column for indicator columns; drop_first=True omits the
# first level of every column
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)


In [None]:

# Define X and y: every remaining column is a feature, `condition` is the target
X = df.drop(columns=['condition'])
y = df['condition']

# Split data (80% train / 20% test). Stratifying on y keeps the class
# proportions of `condition` equal in both parts, so the reported metrics do
# not depend on how the random split happened to distribute the classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# NOTE(review): X may still contain NaN (the coordinate columns are converted
# with errors='coerce') -- confirm the installed scikit-learn accepts missing
# values in RandomForestClassifier, or impute before fitting.

# Train model (fixed seed for reproducible results)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate on the held-out set
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Pass the label order explicitly and print it, so the rows/columns of the
# confusion matrix can be read without guessing which class comes first.
class_labels = clf.classes_
print("Labels (row/column order):", list(class_labels))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=class_labels))
print("Classification Report:\n", classification_report(y_test, y_pred))
