In [2]:
# Import dataset and describe characteristics such as dimensions, data types, file types, and import methods used


import pandas as pd

# Load the raw shopper-session table from the CSV in the working directory.
df = pd.read_csv("online_shoppers_intention.csv")

# Cell output: for every column, how many entries are missing.
df.isna().sum()


Administrative               0
Administrative_Duration      0
Informational              128
Informational_Duration       0
ProductRelated               0
ProductRelated_Duration      0
BounceRates                  0
ExitRates                    0
PageValues                 135
SpecialDay                   0
Month                        0
OperatingSystems           123
Browser                      0
Region                       0
TrafficType                  0
VisitorType                  0
Weekend                      0
Revenue                      0
dtype: int64

In [3]:
import pandas as pd
import numpy as np

# Re-reading the file resets `df` to the raw data, so this overview does not
# depend on whether later cells have already modified the frame.
df = pd.read_csv("online_shoppers_intention.csv")

# Basic structure: column names, non-null counts and dtypes.
# info() writes its report directly to stdout, so it needs no print().
df.info()

# A notebook cell renders only its LAST expression; the summaries below were
# previously computed and silently discarded. Print them explicitly.

# Summary statistics for every column (numeric and non-numeric alike)
print(df.describe(include="all"))

# Missing value summary: count of nulls per column
print(df.isnull().sum())

# Number of rows with at least one missing value.
# Left as the final bare expression so it is rendered as the cell's output
# in the form (rows_with_missing, n_columns).
df[df.isnull().any(axis=1)].shape



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12202 non-null  float64
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12195 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12207 non-null  float64
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

(382, 18)

In [4]:
# Clean, wrangle, and handle missing data


import pandas as pd
# Numeric columns with gaps: fill each one with its own median,
# computed for both columns in a single pass.
missing_numerical = ['Informational', 'PageValues']
column_medians = df[missing_numerical].median()
df[missing_numerical] = df[missing_numerical].fillna(column_medians)

# OperatingSystems is treated as categorical: fill gaps with the most
# frequent value (first entry of the mode result).
os_mode = df['OperatingSystems'].mode().iloc[0]
df['OperatingSystems'] = df['OperatingSystems'].fillna(os_mode)

# Persist the imputed table, without the index column, to a new CSV.
df.to_csv("online_shoppers_intention_cleaned.csv", index=False)

# To confirm no nulls remain, run in a separate cell:
# df.isna().sum()


In [5]:
#Transform data appropriately using techniques such as aggregation, normalization, and feature construction


import pandas as pd


# combine _duration columns (aggregation)
# Row-wise total of the time recorded on the three page categories.
df['Total_Duration'] = (
    df['Administrative_Duration'] +
    df['Informational_Duration'] +
    df['ProductRelated_Duration']
)


# z scaling (normalization)
standard_cols = ['BounceRates', 'ExitRates', 'PageValues']

means = df[standard_cols].mean()
stds = df[standard_cols].std(ddof=0)  # ddof=0 matches sklearn's StandardScaler

# apply Z-score scaling: (x - mean) / std
# NOTE: this overwrites the three columns in place with their scaled values.
df[standard_cols] = (df[standard_cols] - means) / stds

# verify z scaling
#check = pd.DataFrame({
#    'mean': df[standard_cols].mean(),
#    'std': df[standard_cols].std(ddof=0)
#})
#check

# feature construction

# classify engagement level based on newly aggregated duration value
# Bin edges are in the same units as the duration columns (presumably
# seconds -- confirm against the dataset description).
# The top edge is open-ended (inf) rather than the observed maximum: using
# df['Total_Duration'].max() makes pd.cut raise ValueError whenever that
# maximum is <= 1200, because the edges are then no longer strictly
# increasing. With labels supplied, the resulting categories are unchanged
# for data whose maximum exceeds 1200.
engagement_bins = [0, 60, 300, 1200, float("inf")]
df['Engagement_Level'] = pd.cut(
    df['Total_Duration'],
    bins=engagement_bins,
    labels=['Very Low', 'Low', 'Medium', 'High'],
    include_lowest=True  # keep sessions with a total of exactly 0 in 'Very Low'
)



