In [26]:
## Import dependencies - pyscopg2 is a Python-PostgreSQL database adapter 

import psycopg2
import pandas as pd

In [27]:
# Establish connection to PostgreSQL database 

conn = psycopg2.connect(
    host="localhost",
    database="user_session",
    user="postgres",
    password="AAA009wn73ed")

conn.autocommit = True

In [28]:
# The curosor class allows Python code to execute PostgreSQL commands in a database session

cursor = conn.cursor()

In [29]:
# Create SQL query to select all data from an existing PostgreSQL table within the user_session database

selectQuery = ('''SELECT * FROM customer_session_data''')

# Execute the database operation through cursor.execute 

cursor.execute(selectQuery)

# cursor.fetchall returns all the records within the table, assign results to DataFrame & set column headers

userSession_Data =  pd.DataFrame(cursor.fetchall(), 
                                 columns=['Session Timestamp', 'Event Type', 'Product ID', 
                                          'Category ID', 'Category Code', 'Brand', 
                                          'Price', 'User ID', 'User Session ID'])

# Print first 5 rows of DataFrame by using ['DataFrame'].head() to check the data 

userSession_Data.head()

Unnamed: 0,Session Timestamp,Event Type,Product ID,Category ID,Category Code,Brand,Price,User ID,User Session ID
0,2019-01-10,view,44600062,2.10381e+18,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-01-10,view,3900821,2.05301e+18,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-01-10,view,17200506,2.05301e+18,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-01-10,view,1307067,2.05301e+18,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-01-10,view,1004237,2.05301e+18,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [30]:
# Check number of rows & columns by using ['DataFrame'].shape method:

# 1,048,575 ROWS
# 9 COLUMNS 

userSession_Data.shape

(1048575, 9)

In [31]:
# Check data types in each column of the DataFrame with the ['DataFrame'].dtypes method 

userSession_Data.dtypes

Session Timestamp    datetime64[ns]
Event Type                   object
Product ID                    int64
Category ID                  object
Category Code                object
Brand                        object
Price                       float64
User ID                       int64
User Session ID              object
dtype: object

In [33]:
# Change format of the Session Timestamp column to include H, M & S timeframes as the original format included Y, M & D

userSession_Data['Session Timestamp'] = pd.to_datetime(userSession_Data['Session Timestamp'],
                                                format="%Y-%m-%d %H:%M:%S")

# Print DataFrame to vaidate change in date format 

userSession_Data.head()

Unnamed: 0,Session Timestamp,Event Type,Product ID,Category ID,Category Code,Brand,Price,User ID,User Session ID
0,2019-01-10,view,44600062,2.10381e+18,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-01-10,view,3900821,2.05301e+18,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-01-10,view,17200506,2.05301e+18,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-01-10,view,1307067,2.05301e+18,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-01-10,view,1004237,2.05301e+18,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [36]:
## Split Category Code column into multiple columns based on '.' delimiter. 
# There are multiple levels of category within the data, but they are separated by full stops in the Category Code column
# Splitting the column into multiple category levels increases granularity of the data 


# Create 3 new columns to hold each Category Code Level & split the Category Code column based on the '.' delimiter
# Where Category Codes do not have additional categories, 'None' is returned 

userSession_Data[['Category Code L1', 'Category Code L2', 'Category Code L3']] = userSession_Data['Category Code'].str.split('.', n=2, expand=True)


userSession_Data.drop(['Category Code'], axis=1).head()


Unnamed: 0,Session Timestamp,Event Type,Product ID,Category ID,Brand,Price,User ID,User Session ID,Category Code L1,Category Code L2,Category Code L3
0,2019-01-10,view,44600062,2.10381e+18,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,,,
1,2019-01-10,view,3900821,2.05301e+18,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,appliances,environment,water_heater
2,2019-01-10,view,17200506,2.05301e+18,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,furniture,living_room,sofa
3,2019-01-10,view,1307067,2.05301e+18,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,computers,notebook,
4,2019-01-10,view,1004237,2.05301e+18,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,electronics,smartphone,


In [37]:
## Capitalize each string within the DataFrame, as they have been presented all in lower case

# Apply a lambda function to each value in the DataFrame to capitalize the first letter 
# The function converts the dtype to string and applies the capitalize argument 
userSession_Data = userSession_Data.apply(lambda x: x.astype(str).str.capitalize())

# Print results 
userSession_Data.head()

Unnamed: 0,Session Timestamp,Event Type,Product ID,Category ID,Category Code,Brand,Price,User ID,User Session ID,Category Code L1,Category Code L2,Category Code L3
0,2019-01-10 00:00:00,View,44600062,2.10381e+18,Nan,Shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,Nan,Nan,Nan
1,2019-01-10 00:00:00,View,3900821,2.05301e+18,Appliances.environment.water_heater,Aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,Appliances,Environment,Water_heater
2,2019-01-10 00:00:00,View,17200506,2.05301e+18,Furniture.living_room.sofa,Nan,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,Furniture,Living_room,Sofa
3,2019-01-10 00:00:00,View,1307067,2.05301e+18,Computers.notebook,Lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,Computers,Notebook,Nan
4,2019-01-10 00:00:00,View,1004237,2.05301e+18,Electronics.smartphone,Apple,1081.98,535871217,C6bd7419-2748-4c56-95b4-8cec9ff8b80d,Electronics,Smartphone,Nan


In [39]:
## Calculate times of the day with high and low relative user session volume 

# Count the number of events per each timeframe with the value_counts function, referencing Session Timestamp column
timeData = pd.DataFrame(userSession_Data.value_counts('Session Timestamp'), 
                  columns=['Count of Events']).reset_index()

timeData['Session Timestamp'] = 

# Sort data based on earliest session timestamp (00:00:00)
timeData.sort_values(by='Session Timestamp', inplace=True)

# Print DataFrame and sort descending by Count of Events column
timeData.sort_values(by='Count of Events', ascending=False)

Unnamed: 0,Session Timestamp,Count of Events
0,2019-01-10 16:42:00,1756
1,2019-01-10 16:36:00,1749
2,2019-01-10 16:23:00,1736
3,2019-01-10 16:18:00,1727
4,2019-01-10 16:43:00,1718
...,...,...
1019,2019-01-10 00:31:00,1
1023,2019-01-10 01:07:00,1
1021,2019-01-10 00:28:00,1
1022,2019-01-10 00:27:00,1


In [11]:
## Calculate instances of each Event Type: View, Purchase & Cart 
# This data can be used to calculate the overall conversion rate of the website

# Create new DataFrame and use the .value_counts method to count the instances of each unique Event Type
eventTypes = pd.DataFrame(userSession_Data.value_counts('Event Type'), columns=['Count of Events']).reset_index()

# The Event Types were in lower case, so str.capitalize is used to tidy the text data & amend to capital for first letter 
eventTypes['Event Type'] = eventTypes['Event Type'].str.capitalize()

# Print results 
eventTypes

Unnamed: 0,Event Type,Count of Events
0,View,1016239
1,Purchase,17296
2,Cart,15040


In [75]:
## Calculate total number of events (all categories) per brand

brandVolume = pd.DataFrame(userSession_Data.value_counts('Brand'), columns=['Count of Events']).reset_index()

# Print results

brandVolume

Unnamed: 0,Brand,Count of Events
0,Nan,155682
1,Samsung,126027
2,Apple,107634
3,Xiaomi,72507
4,Huawei,29360
...,...,...
2240,Lava,1
2241,Mstar,1
2242,Musclepharm,1
2243,Floresan,1


In [77]:
## Calculate brands with the highest purchase volume 

# groupby Brand and Event Type and use size() which returns the number of elements within an object 
# Create a new DataFrame which will show the count of each event type per unique brand name

brandEventData = pd.DataFrame(userSession_Data.groupby(["Brand", "Event Type"]).size(), columns=['Count of Events']).reset_index()

## Filter the Event Type column for 'purchase' using .loc, assign to variable named brandPurchaseData

brandPurchaseData = brandEventData.loc[brandEventData['Event Type'] == 'Purchase'] 

# Sort DataFrame from highest number of purchase events to lowest, print results 

brandPurchaseData.sort_values(by='Count of Events', ascending=False)

Unnamed: 0,Brand,Event Type,Count of Events
2393,Samsung,Purchase,4053
149,Apple,Purchase,3565
1897,Nan,Purchase,1388
2967,Xiaomi,Purchase,1337
1271,Huawei,Purchase,617
...,...,...,...
1003,Forza,Purchase,1
2167,Polti,Purchase,1
1001,Forward,Purchase,1
996,Forlux,Purchase,1


In [78]:
# Count the instances of each category code by using the value_counts() method
# This will return the count of instances for each unique category code 

categoryCodes = pd.DataFrame(userSession_Data['Category Code'].value_counts()).reset_index()

# Rename columns following the index reset, as by default it will be renamed 'index' 

categoryCodes.columns = ['Category L1', 'Count of Events']

# Extract top 100 category codes, using nlargest & assigning 100 to 'n' - This number can be amended if required
# The below DataFrame displays the best performing category codes in terms of event count

topCategoryCodes = categoryCodes.nlargest(n=100, columns=['Count of Events'])

topCategoryCodes

Unnamed: 0,Category L1,Count of Events
0,Nan,334761
1,Electronics.smartphone,286986
2,Electronics.clocks,37264
3,Computers.notebook,30108
4,Electronics.audio.headphone,27567
...,...,...
95,Auto.accessories.parktronic,337
96,Apparel.shoes.sandals,307
97,Apparel.trousers,290
98,Country_yard.lawn_mower,270
