In [14]:
#Basic Libraries
import pandas as pd
import numpy as np

#Libraries for Data Viz
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for Data Preprocessing
from category_encoders import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#For Model Building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

#For obtaining performance metrics
from sklearn.metrics import mean_squared_error, accuracy_score,r2_score, f1_score,confusion_matrix, roc_curve, auc,recall_score, precision_score

import warnings
warnings.filterwarnings("ignore")


def label_encode(df, columns_to_le):
    """
    Label-encode the listed columns of ``df`` in place.

    One LabelEncoder instance is re-fitted on each column in turn, and the
    column is overwritten with the codes returned by ``fit_transform``.
    Nothing is returned; the caller's DataFrame is modified directly.
    """
    encoder = LabelEncoder()
    for column in columns_to_le:
        codes = encoder.fit_transform(df[column])
        df[column] = codes

def onehot_encode(df, oe_cols):
    """
    One-hot encode the columns named in ``oe_cols``.

    Builds a OneHotEncoder restricted to those columns, fits it on ``df`` and
    returns whatever ``fit_transform`` produces for the whole frame.
    """
    encoder = OneHotEncoder(cols=oe_cols)
    encoded = encoder.fit_transform(df)
    return encoded

def replace_with_mode(df, col_name, old_val):
    """
    Replace a placeholder value in one column with that column's mode.

    The mode is computed only over entries that differ from ``old_val``, so
    the placeholder can never be picked as its own replacement (which would
    leave the column unchanged when the placeholder is the most frequent
    value). The result is assigned back to ``df[col_name]`` rather than
    mutated through a chained ``inplace=True`` call, which does not update
    the parent frame when pandas Copy-on-Write is active.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; its ``col_name`` column is overwritten.
    col_name : str
        Name of the column to clean.
    old_val : object
        Placeholder value to be replaced.

    Returns
    -------
    None
        ``df`` is modified in place. If the column holds no value other than
        ``old_val`` (or missing values) it is left unchanged.
    """
    column = df[col_name]

    # Candidate replacements: the most frequent value(s) among real entries.
    candidates = column[column != old_val].mode()
    if candidates.empty:
        # No valid value exists to substitute; keep the data as it is.
        print('column: {}, value {} has no replacement candidate'.format(col_name, old_val))
        return

    mode = candidates[0]
    print('column: {}, value {} replacing with {}'.format(col_name, old_val, mode))
    df[col_name] = column.replace(old_val, mode)


def cols_value_count(df, cols):
    """
    Print the value counts of each column named in ``cols``.

    For every column a divider line, a ``Columns: <name>`` header and the
    result of ``value_counts()`` are written to stdout. Returns nothing.
    """
    divider = '==========================================='
    for name in cols:
        header = 'Columns: {}'.format(name)
        print(divider, header, df[name].value_counts(), sep='\n')

def print_unique_val(df):
    """
    Print the distinct values of every object-dtype column in ``df``.

    For each such column the column name is printed followed by ``': '``,
    then the array returned by ``unique()`` on a new line, then a blank
    line. Columns of any other dtype are skipped. Returns nothing.
    """
    for col in df.columns:
        # Compare against `object` directly. `type(object)` evaluates to
        # `type`, which matches object columns only through numpy's implicit
        # coercion of arbitrary classes to the object dtype.
        if df[col].dtype == object:
            print(col, end=': ')
            print('\n', df[col].unique())
            print()

#####################################################
# Clean / pre-process data
# Remove or impute missing values
# Check for outliers and take corrective action
# Check for any incorrect values
# Encode categorical / scale numerical features
#####################################################
def data_clean(df):
    """
    Clean and encode the e-commerce session data.

    Steps, in order:
      1. In ``operatingsystems``, ``browser`` and ``traffictype`` the
         incorrect value ``'*'`` is replaced with the column mode (this
         modifies the frame that was passed in).
      2. Those three columns are cast to ``np.int64``.
      3. ``visitortype`` and ``weekend`` are one-hot encoded.
      4. ``month`` is label encoded.

    Returns the cleaned and encoded DataFrame.
    """
    placeholder_cols = ('operatingsystems', 'browser', 'traffictype')

    # '*' marks an incorrect entry in these columns; swap it for the mode.
    for name in placeholder_cols:
        replace_with_mode(df, name, '*')

    # With the placeholder gone, convert the columns from string to integer.
    df = df.astype({name: np.int64 for name in placeholder_cols})

    # Categorical encoding: one-hot for visitortype/weekend, label for month.
    df = onehot_encode(df, ['visitortype', 'weekend'])
    label_encode(df, ['month'])

    return df

# File names of the training and test CSVs (resolved against the working directory).
train_file, test_file = 'ecommerce_train.csv', 'ecommerce_test.csv'


In [7]:
# Read the training CSV and pass it straight through the cleaning/encoding step.
df = data_clean(pd.read_csv(train_file))

# Column names, non-null counts and dtypes of the cleaned frame.
df.info()

column: operatingsystems, value * replacing with 2
column: browser, value * replacing with 2
column: traffictype, value * replacing with 2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   session_id               40000 non-null  int64  
 1   administrative           40000 non-null  int64  
 2   administrative_duration  40000 non-null  float64
 3   informational            40000 non-null  int64  
 4   informational_duration   40000 non-null  float64
 5   productrelated           40000 non-null  int64  
 6   productrelated_duration  40000 non-null  float64
 7   bouncerates              40000 non-null  float64
 8   exitrates                40000 non-null  float64
 9   pagevalues               40000 non-null  float64
 10  specialday               40000 non-null  float64
 11  month                    40000 non-null  int3

In [5]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,session_id,administrative,administrative_duration,informational,informational_duration,productrelated,productrelated_duration,bouncerates,exitrates,pagevalues,...,operatingsystems,browser,region,traffictype,visitortype_1,visitortype_2,visitortype_3,weekend_1,weekend_2,revenue
0,139957,2,47.5,0,0.0,19,1203.480714,0.0,0.027421,6.748509,...,2,2,3,2,1,0,0,1,0,0
1,496587,0,0.0,0,0.0,7,152.0,0.0,0.033333,0.0,...,3,2,6,3,1,0,0,1,0,0
2,440268,7,206.5,1,389.9,95,4415.889952,0.000133,0.03375,2.676062,...,2,2,1,20,1,0,0,1,0,0
3,294672,1,18.258571,0,0.0,103,8305.048706,0.004871,0.034746,6.824908,...,2,2,1,2,1,0,0,0,1,1
4,497475,0,0.0,0,0.0,42,1305.108333,0.016068,0.039742,0.0,...,3,2,5,2,1,0,0,1,0,0


In [11]:
# Hold out 20% of the labeled data for testing the model and measuring its
# performance; the remaining 80% is used for training. The split is seeded
# and stratified on the target column.
# NOTE(review): every non-target column stays in X, including session_id —
# confirm that an identifier column is intended to be a model feature.
y = df['revenue']
X = df.drop(columns='revenue')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Shapes of the four pieces, one per line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep='\n')

(32000, 21)
(8000, 21)
(32000,)
(8000,)


In [12]:
# Fit a logistic-regression classifier with default settings on the training
# split, then predict labels for the held-out split.
# NOTE(review): the features are passed in unscaled and warnings are silenced
# globally in the import cell, so a convergence warning would not be visible
# here — TODO confirm the solver converges.
lr = LogisticRegression().fit(X_train, y_train)
y_predict_lr = lr.predict(X_test)


In [17]:
# Accuracy of the logistic-regression predictions on the held-out split.
acc_logreg = accuracy_score(y_true=y_test, y_pred=y_predict_lr)
acc_logreg

0.87175