In [33]:
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [34]:
df = pd.read_csv("Chocolate_Sales.csv")
df.head()

Unnamed: 0,Sales Person,Country,Product,Date,Amount,Boxes Shipped
0,Jehu Rudeforth,UK,Mint Chip Choco,04/01/2022,"$5,320.00",180
1,Van Tuxwell,India,85% Dark Bars,01/08/2022,"$7,896.00",94
2,Gigi Bohling,India,Peanut Butter Cubes,07/07/2022,"$4,501.00",91
3,Jan Morforth,Australia,Peanut Butter Cubes,27/04/2022,"$12,726.00",342
4,Jehu Rudeforth,UK,Peanut Butter Cubes,24/02/2022,"$13,685.00",184


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3282 entries, 0 to 3281
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Sales Person   3282 non-null   object
 1   Country        3282 non-null   object
 2   Product        3282 non-null   object
 3   Date           3282 non-null   object
 4   Amount         3282 non-null   object
 5   Boxes Shipped  3282 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 154.0+ KB


In [36]:
df['Amount'].unique()

array(['$5,320.00', '$7,896.00', '$4,501.00', ..., '$616.09', '$2,504.62',
       '$5,915.87'], shape=(3013,), dtype=object)

In [37]:
df['Amount'] = (df['Amount'].astype(str)
                .str.replace(",", "", regex=False)
                .str.replace("$", "", regex=False)
                .str.strip())
df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')

In [38]:
df['Amount'].dtype

dtype('float64')

In [39]:
df['Amount']

0        5320.00
1        7896.00
2        4501.00
3       12726.00
4       13685.00
          ...   
3277     5303.58
3278     7339.32
3279      616.09
3280     2504.62
3281     5915.87
Name: Amount, Length: 3282, dtype: float64

In [40]:
df.isna().sum()

Sales Person     0
Country          0
Product          0
Date             0
Amount           0
Boxes Shipped    0
dtype: int64

In [41]:
df["Date"] = pd.to_datetime(df['Date'], errors='coerce')


In [42]:
df = df.dropna(subset=["Date"])

In [43]:
df["Date"]

0      2022-04-01
1      2022-01-08
2      2022-07-07
5      2022-06-06
9      2022-04-07
          ...    
3266   2024-12-08
3267   2024-11-03
3269   2024-12-05
3270   2024-12-01
3278   2024-07-06
Name: Date, Length: 1290, dtype: datetime64[ns]

In [44]:
df['Date'].isna().sum()

np.int64(0)

In [45]:
threshold = df["Amount"].quantile(0.70)
df['Big_Sale'] = (df['Amount'] >= threshold).astype(int)

In [46]:
df.head()

Unnamed: 0,Sales Person,Country,Product,Date,Amount,Boxes Shipped,Big_Sale
0,Jehu Rudeforth,UK,Mint Chip Choco,2022-04-01,5320.0,180,0
1,Van Tuxwell,India,85% Dark Bars,2022-01-08,7896.0,94,1
2,Gigi Bohling,India,Peanut Butter Cubes,2022-07-07,4501.0,91,0
5,Van Tuxwell,India,Smooth Sliky Salty,2022-06-06,5376.0,38,0
9,Brien Boise,Australia,99% Dark & Pure,2022-04-07,2835.0,102,0


In [47]:
df = df.drop('Amount', axis=1)

In [48]:
df.head()

Unnamed: 0,Sales Person,Country,Product,Date,Boxes Shipped,Big_Sale
0,Jehu Rudeforth,UK,Mint Chip Choco,2022-04-01,180,0
1,Van Tuxwell,India,85% Dark Bars,2022-01-08,94,1
2,Gigi Bohling,India,Peanut Butter Cubes,2022-07-07,91,0
5,Van Tuxwell,India,Smooth Sliky Salty,2022-06-06,38,0
9,Brien Boise,Australia,99% Dark & Pure,2022-04-07,102,0


In [49]:
df["Date"].dtype

dtype('<M8[ns]')

In [50]:
df['Month'] = df['Date'].dt.month
df['Weekday'] = df['Date'].dt.weekday

In [51]:
df = df.drop("Date", axis=1)

In [52]:
df.head()

Unnamed: 0,Sales Person,Country,Product,Boxes Shipped,Big_Sale,Month,Weekday
0,Jehu Rudeforth,UK,Mint Chip Choco,180,0,4,4
1,Van Tuxwell,India,85% Dark Bars,94,1,1,5
2,Gigi Bohling,India,Peanut Butter Cubes,91,0,7,3
5,Van Tuxwell,India,Smooth Sliky Salty,38,0,6,0
9,Brien Boise,Australia,99% Dark & Pure,102,0,4,3


In [53]:
df = pd.get_dummies(df, columns=['Sales Person', 'Country', 'Product'], drop_first=True)

In [54]:
df.head()

Unnamed: 0,Boxes Shipped,Big_Sale,Month,Weekday,Sales Person_Barr Faughny,Sales Person_Beverie Moffet,Sales Person_Brien Boise,Sales Person_Camilla Castle,Sales Person_Ches Bonnell,Sales Person_Curtice Advani,...,Product_Manuka Honey Choco,Product_Milk Bars,Product_Mint Chip Choco,Product_Orange Choco,Product_Organic Choco Syrup,Product_Peanut Butter Cubes,Product_Raspberry Choco,Product_Smooth Sliky Salty,Product_Spicy Special Slims,Product_White Choc
0,180,0,4,4,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,94,1,1,5,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,91,0,7,3,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
5,38,0,6,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
9,102,0,4,3,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [55]:
X = df.drop('Big_Sale', axis=1)

joblib.dump(X.columns.tolist(), "feature_names.pkl")
y = df['Big_Sale']

In [56]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [58]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [59]:
y_pred = model.predict(X_test)

print('Accuracy: ', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.686046511627907
              precision    recall  f1-score   support

           0       0.71      0.89      0.79       174
           1       0.54      0.26      0.35        84

    accuracy                           0.69       258
   macro avg       0.63      0.58      0.57       258
weighted avg       0.66      0.69      0.65       258



In [60]:
joblib.dump(model, "big_sale_model.pkl")

['big_sale_model.pkl']

In [61]:
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']