# Binary classifiers for sold Ebay shoe listings

## Connect to database and retrieve data

In [1]:
from sqlalchemy import create_engine
import pandas as pd
from decouple import config

DATABASE_URL = config('DATABASE_URL')
engine = create_engine(DATABASE_URL)

In [2]:
df = pd.read_sql_query('select * from "shoes"',con=engine)

## Data Cleaning

### Replace missing values with average value

In [3]:
price_fillna_value = round(df["price"].mean(),2)
free_shipping_fillna_value = int(df["free_shipping"].mean())
total_images_fillna_value = int(df["total_images"].mean())
seller_rating_fillna_value = int(df["seller_rating"].mean())
shoe_size_fillna_value = int(df["shoe_size"].mean())

df["price"].fillna(price_fillna_value,inplace=True)
df["free_shipping"].fillna(free_shipping_fillna_value,inplace=True)
df["total_images"].fillna(total_images_fillna_value,inplace=True)
df["seller_rating"].fillna(seller_rating_fillna_value,inplace=True)
df["shoe_size"].fillna(shoe_size_fillna_value,inplace=True)

## Define input and output features

In [10]:
from sklearn.model_selection import train_test_split
import numpy as np

features = ['price','free_shipping', 'total_images', 'seller_rating', 'shoe_size', 'desc_fre_score', 'desc_avg_grade_score']

X = np.array(df[features])
y = np.array(df['sold'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Train Classification Models

### Logisitic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score, f1_score


reg_log = LogisticRegression()
reg_log.fit(X_train, y_train)
y_pred = reg_log.predict(X_test)

print(metrics.classification_report(y_test, y_pred))
print("roc_auc_score: ", roc_auc_score(y_test, y_pred))
print("f1 score: ", f1_score(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.70      1.00      0.83        45
        True       0.00      0.00      0.00        19

    accuracy                           0.70        64
   macro avg       0.35      0.50      0.41        64
weighted avg       0.49      0.70      0.58        64

roc_auc_score:  0.5
f1 score:  0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

reg_rf = RandomForestClassifier()
reg_rf.fit(X_train, y_train)
y_pred = reg_rf.predict(X_test)

print(metrics.classification_report(y_test, y_pred))
print("roc_auc_score: ", roc_auc_score(y_test, y_pred))
print("f1 score: ", f1_score(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.79      0.82      0.80        45
        True       0.53      0.47      0.50        19

    accuracy                           0.72        64
   macro avg       0.66      0.65      0.65        64
weighted avg       0.71      0.72      0.71        64

roc_auc_score:  0.647953216374269
f1 score:  0.5


In [20]:
feature_df = pd.DataFrame({'Importance':reg_rf.feature_importances_, 'Features': features })
print(feature_df)

   Importance              Features
0    0.172318                 price
1    0.028320         free_shipping
2    0.177162          total_images
3    0.181760         seller_rating
4    0.130010             shoe_size
5    0.174685        desc_fre_score
6    0.135746  desc_avg_grade_score


Given these feature importance values, a seller's rating has the most influence on the whether a shoe will sell, while free shipping has the least influence.

### SVM

In [21]:
from sklearn.svm import SVC

reg_svc = SVC()
reg_svc.fit(X_train, y_train)
y_pred = reg_svc.predict(X_test)

print(metrics.classification_report(y_test, y_pred))
print("roc_auc_score: ", roc_auc_score(y_test, y_pred))
print("f1 score: ", f1_score(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.70      1.00      0.83        45
        True       0.00      0.00      0.00        19

    accuracy                           0.70        64
   macro avg       0.35      0.50      0.41        64
weighted avg       0.49      0.70      0.58        64

roc_auc_score:  0.5
f1 score:  0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### K-Nearest Neighbors

In [22]:
from sklearn.neighbors import KNeighborsClassifier

reg_knn = KNeighborsClassifier()
reg_knn.fit(X_train, y_train)
y_pred = reg_knn.predict(X_test)

print(metrics.classification_report(y_test, y_pred))
print("roc_auc_score: ", roc_auc_score(y_test, y_pred))
print("f1 score: ", f1_score(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.75      0.91      0.82        45
        True       0.56      0.26      0.36        19

    accuracy                           0.72        64
   macro avg       0.65      0.59      0.59        64
weighted avg       0.69      0.72      0.68        64

roc_auc_score:  0.5871345029239766
f1 score:  0.35714285714285715
