In [60]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
import pickle

In [61]:
# Location of the source workbook. This is an absolute path on a local Windows
# drive, so it has to be adjusted when the notebook runs anywhere else.
file_path = 'C:/Users/user/Untitled Folder 2/Data_Mitra_Kel_8.xlsx'

# Read the workbook into a DataFrame (read_excel loads the first sheet by default).
df = pd.read_excel(io=file_path)

In [62]:
# Columns that get standardized. The trailing space in 'Unit Price ' is part
# of the column name used by this notebook, so it must be kept.
numerical_features = [
    'Order Quantity',
    'Unit Price ',
]

# Columns that get one-hot encoded.
categorical_features = [
    'Category',
    'Delivery Location',
    'Product Detail (type, material, color, size)',
    'Price Category',
    'Customer Name',
    'Gender',
]

In [63]:
# Separate the label from the inputs: 'Buy Decision' is the target column and
# every remaining column of df is kept as a candidate feature.
y = df['Buy Decision']
X = df.drop('Buy Decision', axis=1)

In [64]:
# Hold out 20% of the rows for testing; random_state=0 keeps the shuffle
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [65]:
# Stack the training rows on top of the testing rows (row-wise concatenation),
# so the first len(X_train) rows of the result are the training rows.
X_concatenated = pd.concat(objs=[X_train, X_test], axis='index')

In [66]:
# Initialize the scaler for the numerical columns.
scaler = StandardScaler()

# Initialize the one-hot encoder for the categorical columns.
# drop='first' removes one dummy column per feature to avoid multicollinearity.
# The dense-output switch was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current
# spelling first and fall back to the original one on older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    # Older scikit-learn: `sparse_output` is not a known keyword argument.
    onehot_encoder = OneHotEncoder(sparse=False, drop='first')

In [67]:
# Fit the scaler and the encoder on the TRAINING rows only, then apply the
# fitted transforms to the concatenated frame. Fitting on the concatenated
# data would let the test rows influence the scaling statistics and the
# category vocabulary (data leakage).
#
# Categories that occur only in the test rows (or later at inference time,
# since the encoder is pickled) are unknown to an encoder fitted on the
# training rows. handle_unknown='ignore' encodes them as all zeros instead of
# raising; combined with drop='first' scikit-learn emits a warning when this
# happens, because the all-zero row is indistinguishable from the dropped
# first category.
onehot_encoder.set_params(handle_unknown='ignore')

scaler.fit(X_train[numerical_features])
onehot_encoder.fit(X_train[categorical_features])

# The outputs keep the row order of X_concatenated (training rows first).
X_concatenated_numerical = scaler.transform(X_concatenated[numerical_features])
X_concatenated_categorical = onehot_encoder.transform(X_concatenated[categorical_features])



In [68]:
# Undo the concatenation for the scaled numerical array: the training rows
# were stacked first, so they occupy the first len(X_train) positions and the
# testing rows take up the remainder.
X_train_numerical, X_test_numerical = (
    X_concatenated_numerical[:len(X_train)],
    X_concatenated_numerical[len(X_train):],
)

In [69]:
# Undo the concatenation for the one-hot encoded array, using the same
# boundary (the number of training rows) as for the numerical array.
X_train_categorical, X_test_categorical = (
    X_concatenated_categorical[:len(X_train)],
    X_concatenated_categorical[len(X_train):],
)

In [70]:
# Column labels produced by the fitted encoder for the categorical inputs.
encoded_columns = onehot_encoder.get_feature_names_out(categorical_features)

# Rebuild labelled frames from the raw arrays and place the scaled numerical
# columns next to the one-hot columns. Both frames get a fresh default index,
# so the join lines rows up by position.
X_train_processed = pd.DataFrame(
    X_train_numerical,
    columns=numerical_features,
).join(
    pd.DataFrame(X_train_categorical, columns=encoded_columns)
)
X_test_processed = pd.DataFrame(
    X_test_numerical,
    columns=numerical_features,
).join(
    pd.DataFrame(X_test_categorical, columns=encoded_columns)
)


In [71]:
# Train the model: a random forest classifier with default hyperparameters.
# random_state is pinned (to the same seed used for the train/test split) so
# that re-running the notebook produces the same forest and therefore the
# same pickled model; without it every run trains a different model.
best_model_simplified = RandomForestClassifier(random_state=0)
best_model_simplified.fit(X_train_processed, y_train)

In [72]:
# Persist the trained classifier to 'best_model.pkl' in the working directory.
with open('best_model.pkl', mode='wb') as model_file:
    # Pickler(file).dump(obj) is the documented equivalent of pickle.dump(obj, file).
    pickle.Pickler(model_file).dump(best_model_simplified)

In [73]:
# Save the fitted scaler and encoder next to the model so the same
# preprocessing can be re-applied wherever the model is loaded.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Open the encoder file exactly once. The earlier version opened
# 'onehot_encoder.pkl' twice in a single `with` statement and bound both
# handles to the same name, leaving a redundant second handle on the file.
with open('onehot_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(onehot_encoder, encoder_file)

In [74]:
# Print the loaded DataFrame as plain text. pandas abbreviates the middle rows
# (the `..` row in the output) and wraps wide frames across several column groups.
# NOTE(review): the printed frame presumably includes the 'Customer Name'
# column — confirm the output is safe to share before committing it.
print(df)

     Order Quantity Category                      Delivery Location  \
0                20      Oil                  Batam, Kepulauan Riau   
1                 2      Oil             Palembang, Sumatra Selatan   
2                 1      Oil  Tanjung Balai Karimun, Kepulauan Riau   
3                 1      Oil                  Batam, Kepulauan Riau   
4                 4      Oil                        Pekanbaru, Riau   
..              ...      ...                                    ...   
291               9      Oil                  Batam, Kepulauan Riau   
292               1      Oil                        Pekanbaru, Riau   
293               9  Battery                  Batam, Kepulauan Riau   
294              62      Oil                  Batam, Kepulauan Riau   
295              27     Tyre  Tanjung Balai Karimun, Kepulauan Riau   

    Product Detail (type, material, color, size)  Unit Price  Price Category  \
0                                 Oli Mobil @Dus        85000      