In [1]:
# install missing packages
%pip install nltk
%pip install plotly
%pip install py-cpuinfo
%pip install pandas-datareader

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3 (from nltk)
  Obtaining dependency information for regex>=2021.8.3 from https://files.pythonhosted.org/packages/79/33/67c4ed826f5227655225c3feaaecd15afb8453e827334ddae95a7fba07ac/regex-2023.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached regex-2023.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Using cached regex-2023.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (776 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.10.3
Note: you may need to restart the kernel to use updated packages.
Collecting plotly
  Obtaining dependency information for plotly from https://files.pythonhosted.org/packages/df/79/c80174d711ee26ee5da55a9cc3e248f1ec7a0188b5e4d6bbbbcd09b974b0/plotly-5.17.0-py2.py3-none-any.whl.metadata
  Using cached plotly-5.17.0-py2.py3-none-any.whl.metadata (7.0 k

In [2]:
# check system details
import os
import psutil
import cpuinfo

BYTES_PER_GIB = 1024 ** 3  # same scale as dividing by 1024 three times


def _gib(n_bytes):
    """Convert a byte count to gibibytes (printed with a "GB" label below)."""
    return n_bytes / BYTES_PER_GIB


# memory report
ram_info = psutil.virtual_memory()
print(f"Total RAM: {_gib(ram_info.total):.2f} GB")
print(f"Available RAM: {_gib(ram_info.available):.2f} GB")
print(f"Used RAM: {_gib(ram_info.used):.2f} GB")
print(f"Percentage Usage Of RAM: {ram_info.percent}%")

# processor report
print(f"CPU Cores: {os.cpu_count()}")
print(f"CPU Speed: {cpuinfo.get_cpu_info()['hz_actual_friendly']}")

# disk report for the volume holding the current working directory
disk_info = psutil.disk_usage(os.getcwd())
print(f"Total Disk: {_gib(disk_info.total):.2f} GB")
print(f"Available Disk: {_gib(disk_info.free):.2f} GB")
print(f"Used Disk: {_gib(disk_info.used):.2f} GB")
print(f"Percentage Usage Of Disk: {disk_info.percent}%")

Total RAM: 15.47 GB
Available RAM: 14.15 GB
Used RAM: 1.06 GB
Percentage Usage Of RAM: 8.5%
CPU Cores: 4
CPU Speed: 2.5000 GHz
Total Disk: 24.99 GB
Available Disk: 16.74 GB
Used Disk: 8.25 GB
Percentage Usage Of Disk: 33.0%


In [3]:
# import requirements
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from wrangle import prepare
from gbm_regressor import Regression

[nltk_data] Downloading package vader_lexicon to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# get the data
sales = pd.read_csv("Amazon.csv")

In [5]:
# prepare the data for machine learning
sales = prepare(df=sales, name="Data Preparation", path=None, plots=True)

Data Wrangling:
> Removing Unnecessary Columns
> Transforming Product Name
> Transforming Category
> Transforming Actual Price
> Transforming Discount Percentage
> Transforming Rating
> Transforming Rating Count
> Computing About Product Positivity
> Computing Review Title Positivity
> Computing Review Content Positivity
> Aggregating By Product ID
> Shuffling The Data
6.21 Seconds
Plotting:
> Plotting Correlations
> actual_price vs. discount_percentage
> actual_price vs. rating
> actual_price vs. rating_count
> actual_price vs. about_product_positivity
> actual_price vs. review_title_positivity
> actual_price vs. review_content_positivity
> discount_percentage vs. rating
> discount_percentage vs. rating_count
> discount_percentage vs. about_product_positivity
> discount_percentage vs. review_title_positivity
> discount_percentage vs. review_content_positivity
> rating vs. rating_count
> rating vs. about_product_positivity
> rating vs. review_title_positivity
> rating vs. review_conten

In [6]:
# get the testing data: the last 20% of the prepared rows, re-indexed from 0
y = sales[["actual_price"]]
X = sales.drop(columns="actual_price")
n_test = int(0.2 * len(sales))  # X and y both have one row per row of `sales`
testX, testy = (part.tail(n_test).reset_index(drop=True) for part in (X, y))

In [7]:
# build the model
print("\n---- Amazon Sales Regression Analysis ----\n")
model = Regression(
    name="XGBoost Without Feature Engineering",
    path=None,
    rename=False,
    time=False,
    binary=True,
    imputation=True,
    variance=True,
    atwood=False,
    binning=False,
    reciprocal=False,
    interaction=False,
    selection=False,
    plots=True,
)
try:
    # first try to reuse a previously saved pipeline
    model.load()  # load the machine learning pipeline
    predictions = model.predict(testX)
except Exception as error:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit still stop the cell instead of silently starting a retrain.
    # The reason for the fallback is printed so a broken saved pipeline is visible.
    print(f"\nCould not reuse a saved pipeline ({error!r}); building a new one.")
    model.validate(X, y)  # build the machine learning pipeline
    predictions = model.predict(testX)
    print("\nModel Performance:")
    print(f"> R2: {model.r2}")
    print(f"> RMSE: {model.rmse}")
    print(f"> In Control: {model.in_control}")


---- Amazon Sales Regression Analysis ----

Model Training:
> Transforming The Training Data
> Transforming Categorical Features
> Filling In Missing Values
> Removing Constant Features
> Training XGBoost
1.06 Seconds
Model Performance:
> Transforming The Testing Data
> Scoring The Model
1.84 Seconds
Model Indicators:
> Extracting Important Features
0.08 Seconds
Model Prediction:
> Transforming The New Data
> Getting Predictions
0.02 Seconds
Model Monitoring:
> Computing Feature Drift
0.36 Seconds

Model Performance:
> R2: 0.38338389829530023
> RMSE: 9149.020772725184
> In Control: 94.8%


In [8]:
# score the model on the held-out rows
# testy is a one-column frame; take that column as a 1-D array of true prices
test_actuals = testy.iloc[:, 0].to_numpy()

# The `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly.
# This form works on both older and newer scikit-learn releases.
rmse = mean_squared_error(y_true=test_actuals, y_pred=predictions) ** 0.5
r2 = r2_score(y_true=test_actuals, y_pred=predictions)

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

RMSE: 9295.156400993468
R2: 0.43234084618227797


In [9]:
# model diagnostics
# first list: the leading (up to 10) entries of the model's "Indicator" column
print("Model Indicators:")
for rank, indicator in enumerate(model.indicators["Indicator"][:10], start=1):
    print(f"{rank}. {indicator}")
print(" ")

# second list: up to 10 features whose drift p-value is below 0.05
print("Feature Drift:")
drift = model.drift
drifted_features = drift.loc[drift["pvalue"] < 0.05, "Feature"]
for rank, feature in enumerate(drifted_features[:10], start=1):
    print(f"{rank}. {feature}")
if drifted_features.shape[0] == 0:
    print("None")

Model Indicators:
1. category_Electronics
2. product_name_ZORBES®
3. product_name_LG
4. product_name_Sony
5. product_name_Samsung
6. review_content_positivity
7. product_name_TCL
8. product_name_Amozo
9. product_name_OnePlus
10. product_name_VU
 
Feature Drift:
1. actual_price


In [10]:
# save the machine learning pipeline
# NOTE(review): presumably this is what model.load() picks up on a later run of
# the build cell; the model was created with path=None, so confirm where dump()
# writes.
model.dump()

In [11]:
# refit the model to include the test data
# NOTE(review): this runs after model.dump() and no later dump is visible here,
# so the refitted model does not appear to be saved -- confirm that is intended.
model.refit(testX, testy)

Model Retraining:
> Transforming The Updated Data
> Transforming Categorical Features
> Filling In Missing Values
> Removing Constant Features
> Training XGBoost
1.67 Seconds
Model Indicators:
> Extracting Important Features
0.13 Seconds
