### Predict Quantity with 0.2 Test Size

In [1]:
import pandas as pd

Load the Final Cleaned Dataset

In [2]:
# Load the dataset from a CSV file
df = pd.read_csv('cleaned_data_final.csv')

# Display the first few rows of the DataFrame
print(df.head())

   Customer Id  Order Customer Id  Order Id  Order Item Cardprod Id  \
0        20755              20755     77202                    1360   
1        19492              19492     75939                    1360   
2        19491              19491     75938                    1360   
3        19490              19490     75937                    1360   
4        19489              19489     75936                    1360   

   Product Card Id  Product Price   Category Name Department city  \
0             1360         327.75  Sporting Goods          Caguas   
1             1360         327.75  Sporting Goods          Caguas   
2             1360         327.75  Sporting Goods        San Jose   
3             1360         327.75  Sporting Goods     Los Angeles   
4             1360         327.75  Sporting Goods          Caguas   

  Department country Customer Segment Department state  \
0        Puerto Rico         Consumer               PR   
1        Puerto Rico         Consumer     

In [3]:
print(df.dtypes)

Customer Id                  int64
Order Customer Id            int64
Order Id                     int64
Order Item Cardprod Id       int64
Product Card Id              int64
Product Price              float64
Category Name               object
Department city             object
Department country          object
Customer Segment            object
Department state            object
Customer Street             object
Order City                  object
Order Country               object
Order Region                object
Order State                 object
order date (DateOrders)     object
Product Name                object
Order Item Quantity          int64
Customer Zipcode           float64
dtype: object


## Predicting the Sales

In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Select relevant features and target variable
features = ['Category Name','Department city','Department country', 'Customer Id','Customer Segment',
            'Department state','Customer Street','Order City','Order Country','Order Customer Id','order date (DateOrders)','Order Id','Order Item Cardprod Id',
            'Order Region','Order State','Product Card Id','Product Name','Product Price','Customer Zipcode']
target = 'Order Item Quantity'

# Split the data into features (X) and target variable (y)
X = df[features]
y = df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps (one-hot encoding for categorical variables)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', ['Customer Id','Order Customer Id','Order Id','Order Item Cardprod Id','Product Card Id','Product Price','Customer Zipcode']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Category Name', 'Department city', 'Department country','Customer Segment','Department state','Customer Street','Order City','Order Country','order date (DateOrders)','Order Region','Order State','Product Name'])
    ])


### Linear Regression 

In [5]:
from sklearn.linear_model import LinearRegression

# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Absolute Error: 66.33492183841227
Mean Squared Error: 10238.743827446096
R-squared: 0.3989884172067957


### Random Forest 

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rn = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_rn)
mse = mean_squared_error(y_test, y_pred_rn)
r2_rn = r2_score(y_test, y_pred_rn)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2_rn}')

Mean Absolute Error: 58.16965779467681
Mean Squared Error: 10563.54927756654
R-squared: 0.37992242229895745


Save Random Forest Model

In [7]:
import pickle

# Save the trained model to a file using pickle
model_filename_pickle = 'random_forest_model_final.pkl'
with open(model_filename_pickle, 'wb') as file:
    pickle.dump(model, file)

print(f'Model saved to {model_filename_pickle}')

Model saved to random_forest_model_final.pkl


### Decision Trees

In [8]:
from sklearn.tree import DecisionTreeClassifier


# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', DecisionTreeClassifier(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_rf)
mse = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2_rf}')

Mean Absolute Error: 71.33840304182509
Mean Squared Error: 18770.342205323195
R-squared: -0.10181417452313646


### XGBoost 

In [9]:
import xgboost as xgb


# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', xgb.XGBRegressor(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_n = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_n)
mse = mean_squared_error(y_test, y_pred_n)
r2_n = r2_score(y_test, y_pred_n)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2_n}')

Mean Absolute Error: 58.64066216559465
Mean Squared Error: 9628.820126336634
R-squared: 0.4347907788211426


Save XGBoost Model

In [10]:
import pickle

# Save the trained model to a file using pickle
model_filename_pickle = 'xg_boost_model_final.pkl'
with open(model_filename_pickle, 'wb') as file:
    pickle.dump(model, file)

print(f'Model saved to {model_filename_pickle}')

Model saved to xg_boost_model_final.pkl


 ### Logistic Regression 

In [11]:
from sklearn.linear_model import LogisticRegression


# Create a pipeline with preprocessing and the linear regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LogisticRegression(max_iter=5000))])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_n = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_n)
mse = mean_squared_error(y_test, y_pred_n)
r2_n = r2_score(y_test, y_pred_n)

# Print evaluation metrics
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2_n}')

Mean Absolute Error: 76.49429657794677
Mean Squared Error: 22898.09885931559
R-squared: -0.34411240971786894


### Ridge Regression 

In [14]:
from sklearn.linear_model import Ridge
# from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Create a Ridge Regression model
alpha = 1.0  # Regularization strength, adjust as needed
ridge_model = Ridge(alpha=alpha)

# Create a pipeline with preprocessing and the Ridge Regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', ridge_model)])


# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)


# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')


Mean Squared Error: 10953.723654914102
Mean Absolute Error: 74.64213303298928
R-squared: 0.35701928847248066


## Inference

In [14]:

import pickle
import pandas as pd

# Load the model
with open('random_forest_model_final.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

df_test = pd.read_csv('test_dataset_final.csv')

# Make predictions using the loaded model
new_predictions = loaded_model.predict(df_test)

# Make predictions and print each prediction on a new line
for index, row in df_test.iterrows():
    # Assuming your model expects a 2D array, reshape the row
    # prediction = loaded_model.predict(row.values.reshape(1, -1))
    print(f'Prediction for row {index}: {new_predictions[index]}')


Prediction for row 0: 143.0
Prediction for row 1: 100.0
Prediction for row 2: 100.0
Prediction for row 3: 243.0
Prediction for row 4: 100.0
Prediction for row 5: 179.0
Prediction for row 6: 351.0
Prediction for row 7: 265.0
Prediction for row 8: 100.0
Prediction for row 9: 143.0
Prediction for row 10: 241.0
Prediction for row 11: 349.0
Prediction for row 12: 213.0
Prediction for row 13: 330.0
Prediction for row 14: 233.0
Prediction for row 15: 100.0
Prediction for row 16: 200.0
Prediction for row 17: 100.0
Prediction for row 18: 333.0
Prediction for row 19: 227.0


In [16]:

import pickle
import pandas as pd

# Load the model
with open('xg_boost_model_final.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

df_test = pd.read_csv('test_dataset_final.csv')

# Make predictions using the loaded model
new_predictions = loaded_model.predict(df_test)

# Make predictions and print each prediction on a new line
for index, row in df_test.iterrows():
    # Assuming your model expects a 2D array, reshape the row
    # prediction = loaded_model.predict(row.values.reshape(1, -1))
    print(f'Prediction for row {index}: {new_predictions[index]}')

Prediction for row 0: 251.1293182373047
Prediction for row 1: 100.12848663330078
Prediction for row 2: 100.55577087402344
Prediction for row 3: 272.1960144042969
Prediction for row 4: 101.29574584960938
Prediction for row 5: 238.38914489746094
Prediction for row 6: 286.91607666015625
Prediction for row 7: 275.3374328613281
Prediction for row 8: 100.55577087402344
Prediction for row 9: 226.0751190185547
Prediction for row 10: 281.60845947265625
Prediction for row 11: 302.4366760253906
Prediction for row 12: 253.9191436767578
Prediction for row 13: 296.2969665527344
Prediction for row 14: 271.0020446777344
Prediction for row 15: 100.10680389404297
Prediction for row 16: 262.846435546875
Prediction for row 17: 99.60985565185547
Prediction for row 18: 280.8624267578125
Prediction for row 19: 295.2327880859375
