### Predict Quantity with 0.3 Test Size

In [9]:
import pandas as pd

Load the Final Cleaned Dataset

In [10]:
# Load the dataset from a CSV file
df = pd.read_csv('cleaned_data_final.csv')

# Display the first few rows of the DataFrame
print(df.head())

   Customer Id  Order Customer Id  Order Id  Order Item Cardprod Id  \
0        20755              20755     77202                    1360   
1        19492              19492     75939                    1360   
2        19491              19491     75938                    1360   
3        19490              19490     75937                    1360   
4        19489              19489     75936                    1360   

   Product Card Id  Product Price   Category Name Department city  \
0             1360         327.75  Sporting Goods          Caguas   
1             1360         327.75  Sporting Goods          Caguas   
2             1360         327.75  Sporting Goods        San Jose   
3             1360         327.75  Sporting Goods     Los Angeles   
4             1360         327.75  Sporting Goods          Caguas   

  Department country Customer Segment Department state  \
0        Puerto Rico         Consumer               PR   
1        Puerto Rico         Consumer     

In [11]:
print(df.dtypes)

Customer Id                  int64
Order Customer Id            int64
Order Id                     int64
Order Item Cardprod Id       int64
Product Card Id              int64
Product Price              float64
Category Name               object
Department city             object
Department country          object
Customer Segment            object
Department state            object
Customer Street             object
Order City                  object
Order Country               object
Order Region                object
Order State                 object
order date (DateOrders)     object
Product Name                object
Order Item Quantity          int64
Customer Zipcode           float64
dtype: object


## Predicting the Sales

In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Select relevant features and target variable
features = ['Category Name','Department city','Department country', 'Customer Id','Customer Segment',
            'Department state','Customer Street','Order City','Order Country','Order Customer Id','order date (DateOrders)','Order Id','Order Item Cardprod Id',
            'Order Region','Order State','Product Card Id','Product Name','Product Price','Customer Zipcode']
target = 'Order Item Quantity'

# Split the data into features (X) and target variable (y)
X = df[features]
y = df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define preprocessing steps (one-hot encoding for categorical variables)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', ['Customer Id','Order Customer Id','Order Id','Order Item Cardprod Id','Product Card Id','Product Price','Customer Zipcode']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Category Name', 'Department city', 'Department country','Customer Segment','Department state','Customer Street','Order City','Order Country','order date (DateOrders)','Order Region','Order State','Product Name'])
    ])


### Linear Regression 

In [13]:
from sklearn.linear_model import LinearRegression

# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Absolute Error: 67.04523508925801
Mean Squared Error: 10292.36922110403
R-squared: 0.39309844649869563


### Random Forest 

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rn = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_rn)
mse = mean_squared_error(y_test, y_pred_rn)
r2_rn = r2_score(y_test, y_pred_rn)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2_rn}')

Mean Absolute Error: 57.90017743979721
Mean Squared Error: 10492.20040557668
R-squared: 0.38131516767443996


Save Random Forest Model

In [15]:
import pickle

# Save the trained model to a file using pickle
model_filename_pickle = 'randomforest_model_final.pkl'
with open(model_filename_pickle, 'wb') as file:
    pickle.dump(model, file)

print(f'Model saved to {model_filename_pickle}')

Model saved to randomforest_model_final.pkl


### Decision Trees

In [16]:
from sklearn.tree import DecisionTreeClassifier


# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', DecisionTreeClassifier(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_rf)
mse = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2_rf}')

Mean Absolute Error: 71.60456273764258
Mean Squared Error: 18893.78960709759
R-squared: -0.11409433705141914


### XGBoost 

In [17]:
import xgboost as xgb


# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', xgb.XGBRegressor(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_n = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_n)
mse = mean_squared_error(y_test, y_pred_n)
r2_n = r2_score(y_test, y_pred_n)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2_n}')

Mean Absolute Error: 58.19190518049233
Mean Squared Error: 9553.513766383256
R-squared: 0.43666591999775184


Save XGBoost Model

In [18]:
import pickle

# Save the trained model to a file using pickle
model_filename_pickle = 'xgbooost_model_final.pkl'
with open(model_filename_pickle, 'wb') as file:
    pickle.dump(model, file)

print(f'Model saved to {model_filename_pickle}')

Model saved to xgbooost_model_final.pkl


In [24]:
import xgboost as xgb


# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', xgb.XGBRegressor(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_n = model.predict(X_test)

# Round the predictions to the nearest integer
rounded_predictions = [round(pred) for pred in y_pred_n]

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_n)
mse = mean_squared_error(y_test, y_pred_n)
r2_n = r2_score(y_test, y_pred_n)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2_n}')


Mean Absolute Error: 58.19190518049233
Mean Squared Error: 9553.513766383256
R-squared: 0.43666591999775184


Save XGBoost Model

In [25]:
import pickle

# Save the trained model to a file using pickle
model_filename_pickle = 'xgboost_model_final.pkl'
with open(model_filename_pickle, 'wb') as file:
    pickle.dump(model, file)

print(f'Model saved to {model_filename_pickle}')

Model saved to xgboost_model_final.pkl


 ### Logistic Regression 

In [19]:
from sklearn.linear_model import LogisticRegression


# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LogisticRegression(max_iter=5000))])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_n = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_n)
mse = mean_squared_error(y_test, y_pred_n)
r2_n = r2_score(y_test, y_pred_n)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2_n}')

Mean Absolute Error: 76.26869455006337
Mean Squared Error: 22787.325728770596
R-squared: -0.3436812348874956


### Ridge Regression 

In [None]:
from sklearn.linear_model import Ridge
# from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Create a Ridge Regression model
alpha = 1.0  # Regularization strength, adjust as needed
ridge_model = Ridge(alpha=alpha)

# Create a pipeline with preprocessing and the Ridge Regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', ridge_model)])


# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)


# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')


## Inference

In [21]:
import pickle
import pandas as pd

# Load the model
with open('randomforest_model_final.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

df_test = pd.read_csv('test_dataset_final.csv')

# Make predictions using the loaded model
new_predictions = loaded_model.predict(df_test)

# Make predictions and print each prediction on a new line
for index, row in df_test.iterrows():
    # Assuming your model expects a 2D array, reshape the row
    # prediction = loaded_model.predict(row.values.reshape(1, -1))
    print(f'Prediction for row {index}: {new_predictions[index]}')

Prediction for row 0: 136.0
Prediction for row 1: 100.0
Prediction for row 2: 100.0
Prediction for row 3: 251.0
Prediction for row 4: 100.0
Prediction for row 5: 171.0
Prediction for row 6: 304.0
Prediction for row 7: 237.0
Prediction for row 8: 100.0
Prediction for row 9: 140.0
Prediction for row 10: 234.0
Prediction for row 11: 380.0
Prediction for row 12: 225.0
Prediction for row 13: 322.0
Prediction for row 14: 295.0
Prediction for row 15: 100.0
Prediction for row 16: 204.0
Prediction for row 17: 100.0
Prediction for row 18: 313.0
Prediction for row 19: 190.0


In [23]:
import pickle
import pandas as pd

# Load the model
with open('xgbooost_model_final.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

df_test = pd.read_csv('test_dataset_final.csv')

# Make predictions using the loaded model
new_predictions = loaded_model.predict(df_test)

# Make predictions and print each prediction on a new line
for index, row in df_test.iterrows():
    # Assuming your model expects a 2D array, reshape the row
    # prediction = loaded_model.predict(row.values.reshape(1, -1))
    print(f'Prediction for row {index}: {new_predictions[index]}')

Prediction for row 0: 255.23097229003906
Prediction for row 1: 100.27078247070312
Prediction for row 2: 100.27078247070312
Prediction for row 3: 274.0972595214844
Prediction for row 4: 101.22743225097656
Prediction for row 5: 236.32937622070312
Prediction for row 6: 286.96917724609375
Prediction for row 7: 302.649658203125
Prediction for row 8: 100.27078247070312
Prediction for row 9: 223.7647247314453
Prediction for row 10: 281.6730651855469
Prediction for row 11: 300.88470458984375
Prediction for row 12: 252.06251525878906
Prediction for row 13: 293.0944519042969
Prediction for row 14: 270.6311340332031
Prediction for row 15: 100.27078247070312
Prediction for row 16: 262.7432861328125
Prediction for row 17: 99.94042205810547
Prediction for row 18: 275.80047607421875
Prediction for row 19: 249.7384796142578


In [27]:
import pickle
import pandas as pd

# Load the model
with open('xgboost_model_final.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

df_test = pd.read_csv('test_dataset_final.csv')

# Make predictions using the loaded model
new_predictions = loaded_model.predict(df_test)

# Round the predictions to the nearest integer
rounded_predictions = [round(pred) for pred in new_predictions]

# Make predictions and print each prediction on a new line
for index, row in df_test.iterrows():
    # Assuming your model expects a 2D array, reshape the row
    # prediction = loaded_model.predict(row.values.reshape(1, -1))
    print(f'Prediction for row {index}: {rounded_predictions[index]}')

Prediction for row 0: 255
Prediction for row 1: 100
Prediction for row 2: 100
Prediction for row 3: 274
Prediction for row 4: 101
Prediction for row 5: 236
Prediction for row 6: 287
Prediction for row 7: 303
Prediction for row 8: 100
Prediction for row 9: 224
Prediction for row 10: 282
Prediction for row 11: 301
Prediction for row 12: 252
Prediction for row 13: 293
Prediction for row 14: 271
Prediction for row 15: 100
Prediction for row 16: 263
Prediction for row 17: 100
Prediction for row 18: 276
Prediction for row 19: 250
