Get data

In [4]:
import pandas as pd

# Load a CSV file
data = pd.read_csv('D:/ecommerce_model/final_df.csv')

# Display the first few rows

data.head()

Unnamed: 0,event_type,category_code,brand,user_session,product_id,category_id,price,user_id
0,view,appliances.environment.water_heater,aqua,9333dfbd-b87a-4708-9857-6336556b0fcc,-0.175876,-0.172473,-0.838759,1.120309
1,view,computers.notebook,lenovo,7c90fc70-0e80-4590-96f3-13c02c18c713,-0.486412,-0.172472,-0.265366,0.867643
2,view,electronics.smartphone,apple,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,-0.522669,-0.172472,1.91297,0.105015
3,view,computers.desktop,pulser,0d0d91c2-c9c2-4e81-90a5-86594dec0db9,-0.465635,-0.172472,1.458118,-1.138904
4,view,apparel.shoes.keds,baden,ac1cd4e5-a3ce-4224-a2d7-ff660a105880,2.795478,-0.172472,-0.656383,-0.717832


Split Data Into Train and Test Set

In [5]:

X = data.drop(columns = 'event_type', axis=1)
Y = data['event_type']

In [6]:
data.shape

(26534109, 8)

In [7]:
X.head()

Unnamed: 0,category_code,brand,user_session,product_id,category_id,price,user_id
0,appliances.environment.water_heater,aqua,9333dfbd-b87a-4708-9857-6336556b0fcc,-0.175876,-0.172473,-0.838759,1.120309
1,computers.notebook,lenovo,7c90fc70-0e80-4590-96f3-13c02c18c713,-0.486412,-0.172472,-0.265366,0.867643
2,electronics.smartphone,apple,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,-0.522669,-0.172472,1.91297,0.105015
3,computers.desktop,pulser,0d0d91c2-c9c2-4e81-90a5-86594dec0db9,-0.465635,-0.172472,1.458118,-1.138904
4,apparel.shoes.keds,baden,ac1cd4e5-a3ce-4224-a2d7-ff660a105880,2.795478,-0.172472,-0.656383,-0.717832


In [8]:
Y.head()

0    view
1    view
2    view
3    view
4    view
Name: event_type, dtype: object

In [9]:
X_train, X_test, y_train, y_test = X[:20000000], X[20000000:], Y[:20000000], Y[20000000:]

In [10]:
print(X.shape, X_train.shape, X_test.shape)

(26534109, 7) (20000000, 7) (6534109, 7)


Training Models

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

final_df = data.copy()

X = data.drop(columns=['event_type'])
y = data['event_type']


categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['number']).columns


numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(
        sparse_output=True,
        handle_unknown='ignore',
        dtype=np.float32  
    ))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=0.0001,
        max_iter=1000,
        random_state=42
    ))
])


sample_size = min(10000, len(X))
svm_pipeline.fit(X[:sample_size], y[:sample_size])

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(
        loss='hinge',           
        alpha=0.0001,           
        max_iter=1000,          
        random_state=42,
        tol=1e-3                
    ))
])

sample_size = min(10000, len(X)) 
X_sample = X.sample(sample_size, random_state=42)
y_sample = y.loc[X_sample.index]

sgd_pipeline.fit(X_sample, y_sample)  

In [None]:
from sklearn.linear_model import LogisticRegression

logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        penalty='l2',           
        C=1.0,                 
        solver='lbfgs',        
        max_iter=1000,         
        random_state=42,
        class_weight='balanced' 
    ))
])

sample_size = min(10000, len(X))
X_sample = X.sample(sample_size, random_state=42)
y_sample = y.loc[X_sample.index]

logreg_pipeline.fit(X_sample, y_sample)

Performance Measures

Measuring Accuracy Score

In [None]:
from sklearn.metrics import accuracy_score

X_train_prediction = sgd_pipeline.predict(X_sample)
training_data_accuracy_sgd = accuracy_score(X_train_prediction, y_sample)
print('Accuracy score of the train data : ', training_data_accuracy_sgd)

Accuracy score of the train data :  0.9543


In [None]:

X_train_prediction = sgd_pipeline.predict(X_sample)  
training_data_accuracy_svm = accuracy_score(y_sample, X_train_prediction)  
print('Accuracy score of the train data:', training_data_accuracy_svm)

Accuracy score of the train data: 0.9543


In [None]:
from sklearn.metrics import accuracy_score
X_train_prediction = logreg_pipeline.predict(X_sample)
training_data_accuracy_log_reg= accuracy_score(X_train_prediction, y_sample)
print('Accuracy score of the train data : ', training_data_accuracy_log_reg)

Accuracy score of the train data :  0.9998


Measuring Accuracy Using Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_pipeline,  
                X_sample, 
                y_sample,
                cv=3,    
                scoring="accuracy" 
                )

array([0.95110978, 0.95109511, 0.95139514])

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(svm_pipeline,  
                X_sample,  
                y_sample,
                cv=3,     
                scoring="accuracy" 
                )

array([0.95110978, 0.95109511, 0.95139514])

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(logreg_pipeline,  
                X_sample,  
                y_sample,
                cv=3,    
                scoring="accuracy" 
                )

array([0.94661068, 0.94389439, 0.94659466])

Test Best Model

In [None]:

X_test_prediction = sgd_pipeline.predict(X_sample)
test_data_accuracy_sgd = accuracy_score(X_test_prediction, y_sample)
print('Accuracy score of the test data : ', test_data_accuracy_sgd)

Accuracy score of the test data :  0.9543
