In [2]:
import pickle

import pandas as pd
import scipy.stats
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder


In [3]:
# Load the sales CSV into `data`; the path is relative, so the file must sit
# in the notebook's working directory.
data = pd.read_csv('sales_sample_cleaned.csv')

In [4]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,region,country,item_type,sales_channel,order_priority,order_date,ship_date,units_sold,unit_price,unit_cost,total_revenue,total_cost,total_profit,shipping_duration_in_days,shipping_duration_category
0,Australia and Oceania,Federated States of Micronesia,Meat,Online,L,2020-05-08,2020-05-28,1836,421.89,364.69,774590.04,669570.84,105019.2,20,more than 5 days
1,Sub-Saharan Africa,Tanzania,Office Supplies,Online,H,2019-03-11,2019-04-10,3388,651.21,524.96,2206299.48,1778564.48,427735.0,30,more than 5 days
2,Europe,United Kingdom,Beverages,Online,H,2014-05-07,2014-05-18,6381,47.45,31.79,302778.45,202851.99,99926.46,11,more than 5 days
3,Sub-Saharan Africa,Mozambique,Cosmetics,Offline,H,2016-10-18,2016-11-19,8696,437.2,263.33,3801891.2,2289917.68,1511973.52,32,more than 5 days
4,Sub-Saharan Africa,Tanzania,Meat,Online,L,2021-05-14,2021-06-20,2486,421.89,364.69,1048818.54,906619.34,142199.2,37,more than 5 days


In [5]:
# List the column names available before any columns are dropped.
data.columns

Index(['region', 'country', 'item_type', 'sales_channel', 'order_priority',
       'order_date', 'ship_date', 'units_sold', 'unit_price', 'unit_cost',
       'total_revenue', 'total_cost', 'total_profit',
       'shipping_duration_in_days', 'shipping_duration_category'],
      dtype='object')

In [6]:
# Column groups, one per preprocessing treatment.
# NOTE: the misspelled name `catergorical_columns` is kept on purpose; the
# one-hot encoding cell refers to it by this exact name.
catergorical_columns = [
    'region',
    'country',
    'item_type',
    'sales_channel',
    'shipping_duration_category',
]

# Encoded as integers rather than indicator columns.
ordinal_columns = ['order_priority']

# Columns handed to the scaler.
numerical_columns = [
    'units_sold',
    'unit_cost',
    'unit_price',
    'total_revenue',
    'total_cost',
]

# NOTE(review): 'shipping_duration_in_days' appears in none of the lists
# above, so no transformer selects it by name -- confirm that is intended.
target = 'total_profit'

In [7]:
# Remove the raw date columns from the working frame.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time is a no-op rather
# than a KeyError for columns that are already gone.
data = data.drop(columns=['order_date', 'ship_date'], errors='ignore')

In [8]:
# Re-list the columns to confirm 'order_date' and 'ship_date' are no longer present.
data.columns

Index(['region', 'country', 'item_type', 'sales_channel', 'order_priority',
       'units_sold', 'unit_price', 'unit_cost', 'total_revenue', 'total_cost',
       'total_profit', 'shipping_duration_in_days',
       'shipping_duration_category'],
      dtype='object')

In [9]:
# Summarize dtypes and non-null counts for the remaining columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   region                      10000 non-null  object 
 1   country                     10000 non-null  object 
 2   item_type                   10000 non-null  object 
 3   sales_channel               10000 non-null  object 
 4   order_priority              10000 non-null  object 
 5   units_sold                  10000 non-null  int64  
 6   unit_price                  10000 non-null  float64
 7   unit_cost                   10000 non-null  float64
 8   total_revenue               10000 non-null  float64
 9   total_cost                  10000 non-null  float64
 10  total_profit                10000 non-null  float64
 11  shipping_duration_in_days   10000 non-null  int64  
 12  shipping_duration_category  10000 non-null  object 
dtypes: float64(5), int64(2), object(6)

### Feature Scaling

In [11]:
# Min-max scale the columns listed in `numerical_columns`; every other column
# is passed through untouched.
scaler = MinMaxScaler()
scaling_preprocessor = ColumnTransformer(
    transformers=[('min_max', scaler, numerical_columns)],
    remainder='passthrough',
    # Keep the original column names (no transformer-name prefix).
    verbose_feature_names_out=False,
)
# Emit a pandas DataFrame so downstream steps can still select columns by name.
scaling_preprocessor = scaling_preprocessor.set_output(transform='pandas')

### Dummy Variables and Ordinal Encoding

In [12]:
# One-hot encode the columns in `catergorical_columns`, dropping the first
# indicator of each feature; all other columns pass through unchanged.
oh_encoder = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
)
onehot_preprocessor = ColumnTransformer(
    transformers=[('onehot', oh_encoder, catergorical_columns)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
onehot_preprocessor = onehot_preprocessor.set_output(transform='pandas')

# Integer-encode the columns in `ordinal_columns`; values not seen during fit
# are encoded as -1 (unknown_value).
# NOTE(review): no explicit `categories` order is supplied, so the integer
# codes presumably follow the encoder's automatic ordering rather than the
# business priority order -- confirm this is acceptable.
ordinal_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
ordinal_preprocessor = ColumnTransformer(
    transformers=[('ordinal', ordinal_encoder, ordinal_columns)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
ordinal_preprocessor = ordinal_preprocessor.set_output(transform='pandas')
  

### Training and testing data

In [13]:
# Split the frame into features and the column named by `target`, reusing
# that variable instead of repeating the literal column name.
# y is deliberately kept as a single-column DataFrame (double brackets).
X = data.drop(columns=[target])
y = data[[target]]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


#### Linear Regression

In [None]:
# Run the three preprocessing stages in sequence, then fit a linear regression
# on their output. Each stage is configured to return a pandas DataFrame, so
# the following stage can still pick its columns by name.
reg_pipe = Pipeline(
    steps=[
        ('scaling', scaling_preprocessor),
        ('onehot', onehot_preprocessor),
        ('ordinal', ordinal_preprocessor),
        ('model', LinearRegression()),
    ]
)

# Fit preprocessing and model on the training split only.
reg_pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Score the fitted pipeline on the held-out rows.
y_pred = reg_pipe.predict(X_test)

# NOTE(review): in the rows shown by data.head(), total_profit equals
# total_revenue - total_cost, and both of those columns are model inputs.
# A near-zero error here likely reflects the model recovering that identity
# rather than genuine predictive skill -- confirm the feature set is intended.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error:", mse)

Mean Squared Error: 3.128045623591764e-17


In [17]:
### Save the model
# Serialize the fitted pipeline (preprocessing steps plus regressor) to disk.
# `pickle` is imported in the notebook's top import cell.
# NOTE: unpickling executes arbitrary code, so only load this file from a
# trusted source.
with open('linear_regression_pipeline.pkl', 'wb') as f:
    pickle.dump(reg_pipe, f)