In [74]:
import streamlit as st
import pandas as pd
from sqlalchemy import create_engine
import plotly.graph_objects as go
import os
import json
import plotly.express as px
import numpy as np

def load_credentials(path = "aws_rds_credentials.json"):
     with open(path, 'r') as file:
          config = json.load(file)

     # set up credentials
     for key in config.keys():
          os.environ[key] = config[key]

     return


load_credentials()

aws_rds_url = f"postgresql://{os.environ['user']}:{os.environ['password']}@{os.environ['host']}:{os.environ['port']}/{os.environ['database']}?sslmode=require"

# Load a sample dataset
def load_data():
    engine = create_engine(aws_rds_url)
    sql_query = f"""SELECT * 
                    FROM public.tracking_staging 
                    WHERE date >= CURRENT_DATE - INTERVAL '30 days'
                    LIMIT 20000
                        """
    df = pd.read_sql(sql_query, engine)
    return (df)

data = load_data()
data = data[["size_title", "color1_id", "brand_title", "price_numeric", "status"]]
data

Unnamed: 0,size_title,color1_id,brand_title,price_numeric,status
0,S / 36 / 8,23.0,,3,Muito bom
1,S / 36 / 8,1.0,Even&Odd,3,Muito bom
2,L / 40 / 12,3.0,Zara Basic,8,Novo sem etiquetas
3,L,9.0,Piazza Italia,1,Bom
4,L,1.0,Specchio,2,Muito bom
...,...,...,...,...,...
19995,XXL / 44 / 16,1.0,Jennyfer,10,Novo sem etiquetas
19996,S / 36 / 8,1.0,Bershka,7,Muito bom
19997,XS / 34 / 6,3.0,Jennyfer,6,Muito bom
19998,XS / 34 / 6,20.0,Bershka,5,Novo sem etiquetas


In [75]:
data.isnull().sum()

size_title          0
color1_id        2701
brand_title         0
price_numeric       0
status              0
dtype: int64

In [76]:
data = data.fillna(value = -100)
data.isnull().sum()

size_title       0
color1_id        0
brand_title      0
price_numeric    0
status           0
dtype: int64

In [77]:
from sklearn.model_selection import train_test_split
# LightGBM also can handle categorical data directly We go to probe its inner method

# Transform categorical features into the appropriate type that is expected by LightGBM
for c in data.columns:
    col_type = data[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        data[c] = data[c].astype('category')


In [78]:
X_train, X_test, y_train, y_test = train_test_split(data, 
                                                    data["price_numeric"], 
                                                    test_size=0.2, 
                                                    random_state=42)
X_train, y_train

(                 size_title color1_id  ... price_numeric              status
 5894            L / 40 / 12      20.0  ...             5           Muito bom
 3728                      L       4.0  ...           100           Muito bom
 8958            M / 38 / 10       1.0  ...             8                 Bom
 7671            L / 40 / 12      27.0  ...            12  Novo com etiquetas
 5999             S / 36 / 8       9.0  ...             4                 Bom
 ...                     ...       ...  ...           ...                 ...
 11284       8 anos / 128 cm       3.0  ...             4  Novo sem etiquetas
 11964  Prematuro, até 44 cm      -100  ...            19                 Bom
 5390          Tamanho único      27.0  ...             4  Novo sem etiquetas
 860             M / 38 / 10       9.0  ...            20  Novo sem etiquetas
 15795         Tamanho único      -100  ...            10           Muito bom
 
 [16000 rows x 5 columns],
 5894       5
 3728     100
 8958  

In [85]:
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

d_train=lgb.Dataset(X_train, label=y_train)

# Define parameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'rmse',  # Root Mean Squared Error
}

clf=lgb.train(params,
              d_train) 

# Prediction on the valid set
y_pred=clf.predict(X_test)

y_pred = clf.predict(X_test)
y_pred = y_pred.astype(np.int32)
errors = y_pred - y_test


mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 980
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 5
[LightGBM] [Info] Start training from score 16.412500
Mean Squared Error: 213.88975


In [86]:
pd.DataFrame([y_pred, y_test]).T

Unnamed: 0,0,1
0,69,70
1,10,10
2,49,50
3,11,12
4,7,7
...,...,...
3995,17,18
3996,14,15
3997,5,5
3998,17,17
