In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder



from sklearn.ensemble import RandomForestRegressor

# define dataset
df = pd.read_csv('C:/Users/jack/Documents/EbayMarketScraping/Scraping_bs4_ebay-main/02-11-23_clean_output8719row.csv')

# Remove duplicate rows
df.drop_duplicates(inplace=True)

# Remove irrelevant columns
df.drop(['title', 'sold_date', 'link', 'seller notes', 'series', 'operating system', 'type', 'condition', 'processor', 'features', 'item number'], axis=1, inplace=True)

# Handle missing values
df.dropna(inplace=True)

# Split the dataset into training and testing sets
X = df.drop('price', axis=1)
y = df['price']

# Convert the 'price' variable to a binary variable
threshold = 50
y = np.where(y > threshold, 1, 0)

# summarize class distribution
print(Counter(y))
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')
# fit and apply the transform
X_over, y_over = oversample.fit_resample(X, y)
# summarize class distribution
print(Counter(y_over))


# Convert categorial variables with label encoder
categorical_cols = ['brand', 'processor i series', 'processor generation', 'storage type','gpu', 'model']
le = LabelEncoder()

# apply label encoder on categorical feature columns
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')

# fit and apply the transform
X_over, y_over = oversample.fit_resample(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=1, stratify=y_over)

# Create a random forest regressor
reg = RandomForestRegressor(random_state=42)

# Train the regressor
reg.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = reg.predict(X_test)

# Calculate the metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print the metrics
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2 score: {r2}')

# Add the results to the dataframe
results = {'Model': 'Random Forest Regressor',
           'MSE': mse,
           'RMSE': rmse,
           'R^2': r2}


# Create a dataframe from the dictionary
output = pd.DataFrame([results])
print(output.head())

Counter({1: 8574, 0: 141})
Counter({1: 8574, 0: 8574})
MSE: 0.0041415782288882754
RMSE: 0.06435509481686959
R^2 score: 0.9834336870844469
                     Model       MSE      RMSE       R^2
0  Random Forest Regressor  0.004142  0.064355  0.983434


# RandomizedSearchCV RFR model

In [2]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, explained_variance_score
from sklearn.metrics import mean_squared_error, r2_score
import json
import numpy as np
import os

# Define the hyperparameters to tune
param_dist = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}



# Define custom scoring functions
def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def r2(y_true, y_pred):
    return r2_score(y_true, y_pred)

# Define the scoring metrics
scoring = {
    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
    'EVS': make_scorer(explained_variance_score),
    'MSE': make_scorer(mse),
    'RMSE': make_scorer(rmse),
    'R2': make_scorer(r2)
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=reg, param_distributions=param_dist, scoring=scoring, refit='MAE', n_iter=10, cv=5, verbose=2, random_state=42)

# Fit the random search model
random_search.fit(X_train, y_train)

# Get the results of the best model
best_results = random_search.cv_results_

# Convert numpy arrays to lists
for key in best_results.keys():
    if isinstance(best_results[key], np.ndarray):
        best_results[key] = best_results[key].tolist()

# Log the performance of each model
log_file = 'model_performance_log.json'
data = []

if os.path.exists(log_file):
    try:
        with open(log_file, 'r') as file:
            data = json.load(file)
    except json.JSONDecodeError:
        print("JSONDecodeError encountered. Initialized data as an empty list.")

data.append(best_results)
with open(log_file, 'w') as file:
    json.dump(data, file)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   7.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   6.5s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   6.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   7.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   6.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   2.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   2.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estim

# Display results of training

In [1]:
import plotly.express as px

# Create a dataframe with the actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Create a scatter plot
fig = px.scatter(results_df, x='Actual', y='Predicted', title='Actual vs Predicted')

# Add a diagonal line for reference
fig.add_shape(type='line', x0=results_df['Actual'].min(), y0=results_df['Actual'].min(),
              x1=results_df['Actual'].max(), y1=results_df['Actual'].max(),
              line=dict(color='red', width=2, dash='dash'))

# Show the plot
fig.show()


ModuleNotFoundError: No module named 'plotly'

# Train and save best performing model as checkpoint

In [None]:
from sklearn.ensemble import RandomForestRegressor
from joblib import dump

# Load the log file
with open('model_performance_log.json', 'r') as file:
    data = json.load(file)

# Get the best hyperparameters
best_hyperparameters = random_search.best_params_

# Initialize a new model with the best hyperparameters
model = RandomForestRegressor(**best_hyperparameters)

# Fit the model on your data
model.fit(X_train, y_train)

# Display the performance metrics
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2 score: {r2}')


# Save the model
dump(model, 'model_checkpoint.joblib')

MSE: 0.0036185125148577133
RMSE: 0.06015407313605383
R^2 score: 0.9855259499405692


['model_checkpoint.joblib']