In [1]:
# Import libraries
import numpy as np
import pandas as pd

In [2]:
# Specify file to csv file
file_path = 'datasets/1. online_shoppers_intention.csv'

In [3]:
# Load dataset
data = pd.read_csv(file_path)

# Display the head of the dataframe
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
data.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


## Let's make the dataset "dirty" by introducing missing values, duplicate rows, inconsistent data types, inconsistent categorical data

In [6]:
# Set seed for reproducability
np.random.seed(42)

In [7]:
# Missing Values

# Randomly set 5% of the values in column 'PageValues" as NaN
data['PageValues'] = data['PageValues'].mask(np.random.random(data.shape[0]) < 0.05)

In [8]:
# Duplcate rows

# Randomly set the number of rows to duplicate (from 1 to 100 rows)
n_rows_to_duplicate = np.random.randint(1, 101)

# Randomly select indices to duplicate (while ensuring that it does not exceed the number of available rows in the dataset)
rows_to_duplicate = np.random.choice(data.index, size=min(n_rows_to_duplicate, len(data)), replace=False)

# Create an empty list to hold the duplicate rows
duplicates = []

for index in rows_to_duplicate:
    # Randomly decide how many times to duplicate the current row (between 1 to 3 times)
    n_duplicates = np.random.randint(1, 4)
    # Append duplicates to the duplicates list
    duplicates.extend([data.loc[index]] * n_duplicates)

# Convert the list of DataFrames to a single DataFrame
duplicates = pd.DataFrame(duplicates).reset_index(drop=True)

# Concatenate the original data with the duplicates
data = pd.concat([data, duplicates], ignore_index=True)

# Display DataFrame
data

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12488,0,0.0,0,0.0,56,4377.983333,0.004286,0.022296,0.0,0.8,May,4,1,1,4,Returning_Visitor,False,False
12489,0,0.0,0,0.0,56,4377.983333,0.004286,0.022296,0.0,0.8,May,4,1,1,4,Returning_Visitor,False,False
12490,5,77.0,0,0.0,94,3210.848218,0.002128,0.020922,0.0,0.0,Oct,2,2,3,2,Returning_Visitor,True,False
12491,5,77.0,0,0.0,94,3210.848218,0.002128,0.020922,0.0,0.0,Oct,2,2,3,2,Returning_Visitor,True,False


In [9]:
# Inconsistent data types

# Dictionary to map numbers to words (up to 20)
number_to_word = {
    1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
    6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
    11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen', 15: 'fifteen',
    16: 'sixteen', 17: 'seventeen', 18: 'eighteen', 19: 'nineteen', 20: 'twenty'
}

# Randomly select 5% of the indices from the DataFrame
indices_to_modify = np.random.choice(data.index, size=int(len(data) * 0.05), replace=False)

# Replace the numeric values with their word equivalents for randomly selected indices
for idx in indices_to_modify:
    if data.loc[idx, 'TrafficType'] in number_to_word:
        data.at[idx, 'TrafficType'] = number_to_word[data.loc[idx, 'TrafficType']]

# Display the modified DataFrame
data

  data.at[idx, 'TrafficType'] = number_to_word[data.loc[idx, 'TrafficType']]


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.0,0.0,Feb,3,3,1,four,Returning_Visitor,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12488,0,0.0,0,0.0,56,4377.983333,0.004286,0.022296,0.0,0.8,May,4,1,1,4,Returning_Visitor,False,False
12489,0,0.0,0,0.0,56,4377.983333,0.004286,0.022296,0.0,0.8,May,4,1,1,4,Returning_Visitor,False,False
12490,5,77.0,0,0.0,94,3210.848218,0.002128,0.020922,0.0,0.0,Oct,2,2,3,2,Returning_Visitor,True,False
12491,5,77.0,0,0.0,94,3210.848218,0.002128,0.020922,0.0,0.0,Oct,2,2,3,2,Returning_Visitor,True,False


In [20]:
# Inconsistent categorical data

# Dictionary for month transformations
month_transform = {
    'Feb': 'february',
    'May': 'Mayy',
    'Oct': 'october'
}

# Filter for rows where 'Month' is 'Feb', 'May', or 'Oct'
eligible_indices = data[data['Month'].isin(['Feb', 'May', 'Oct'])].index

# Randomly select 5% of the eligible indices
indices_to_modify = np.random.choice(eligible_indices, size=int(len(eligible_indices) * 0.05), replace=False)

# Apply the changes
for idx in indices_to_modify:
    current_month = data.loc[idx, 'Month']
    if current_month in month_transform:
        data.at[idx, 'Month'] = month_transform[current_month]

# Display the modified DataFrame
data

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.0,0.0,Feb,3,3,1,four,Returning_Visitor,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12488,0,0.0,0,0.0,56,4377.983333,0.004286,0.022296,0.0,0.8,May,4,1,1,4,Returning_Visitor,False,False
12489,0,0.0,0,0.0,56,4377.983333,0.004286,0.022296,0.0,0.8,May,4,1,1,4,Returning_Visitor,False,False
12490,5,77.0,0,0.0,94,3210.848218,0.002128,0.020922,0.0,0.0,Oct,2,2,3,2,Returning_Visitor,True,False
12491,5,77.0,0,0.0,94,3210.848218,0.002128,0.020922,0.0,0.0,october,2,2,3,2,Returning_Visitor,True,False


In [22]:
# check dataframe
data.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region
count,12493.0,12493.0,12493.0,12493.0,12493.0,12493.0,12493.0,12493.0,11867.0,12493.0,12493.0,12493.0,12493.0
mean,2.309773,80.349079,0.502601,34.338417,31.701273,1192.826452,0.022215,0.04314,5.850627,0.061522,2.126231,2.35748,3.154166
std,3.325365,175.923745,1.267123,140.065823,44.351149,1907.068647,0.048426,0.048567,18.494055,0.199092,0.909706,1.716813,2.402753
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.333333,0.0,0.014286,0.0,0.0,2.0,2.0,1.0
50%,1.0,7.0,0.0,0.0,18.0,598.778571,0.003175,0.025309,0.0,0.0,2.0,2.0,3.0
75%,4.0,92.5,0.0,0.0,38.0,1464.209618,0.017123,0.05,0.0,0.0,3.0,2.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0


In [28]:
data.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                 object
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

In [30]:
for col in data:
    print(data[col].unique())

[ 0  1  2  4 12  3 10  6  5  9  8 16 13 11  7 18 14 17 19 15 24 22 21 20
 23 27 26]
[  0.         53.         64.6       ... 167.9107143 305.125
 150.3571429]
[ 0  1  2  4 16  5  3 14  6 12  7  9 10  8 11 24 13]
[  0.   120.    16.   ... 547.75 368.25 211.25]
[  1   2  10  19   0   3  16   7   6  23  13  20   8   5  32   4  45  14
  52   9  46  15  22  11  12  36  42  27  90  18  38  17 128  25  30  21
  51  26  28  31  24  50  96  49  68  98  67  55  35  37  29  34  71  63
  87  40  33  54  64  75  39 111  81  61  47  44  88 149  41  79  66  43
 258  80  62  83 173  48  58  57  56  69  82  59 109 287  53  84  78 137
 113  89  65  60 104 129  77  74  93  76  72 194 140 110 132 115  73 328
 160  86 150  95 130 151 117 124 127 125 116 105  92 157 154 220 187 112
 131 159  94 204 142 206 102 313 145  85  97 198 181 126 106 101 108 119
  70 122  91 276 100 291 114 172 217 141 133 156 136 180 135 195  99 362
 179 118 175 148 440 103 178 184 705 134 176 146 189 120 193 222 121 107
 305 199 4

In [38]:
# output dataframe to csv
data.to_csv('datasets/2. online_shoppers_intention_dirty.csv', sep=',', index=False, encoding='utf-8')