In [14]:
## Import dependencies - psycopg2 is a Python-PostgreSQL database adapter

import os

import pandas as pd
import psycopg2

In [15]:
# Establish a connection to the PostgreSQL database.
#
# Connection settings are read from the standard libpq environment variables so
# that no credential is stored in the notebook. Host, database and user fall
# back to the local-development values used previously; the password has no
# fallback and must be supplied via PGPASSWORD (a missing variable raises
# KeyError on the line below, before any connection attempt is made).
#
# NOTE(review): the password that used to be written in this cell should be
# treated as exposed and rotated.

conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "user_session"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ["PGPASSWORD"])

# Run every statement in autocommit mode instead of inside an open transaction
conn.autocommit = True

In [16]:
# Open a cursor on the connection; the cursor is the object used below to execute SQL in this database session

cursor = conn.cursor()

In [35]:
# Query text: read every row and column of the customer_session_data table

selectQuery = "SELECT * FROM customer_session_data"

# Send the query to the server through the open cursor

cursor.execute(selectQuery)

# Display labels for the nine result columns.
# NOTE(review): these are applied by position, so they are assumed to match the
# column order of the table - confirm against the table definition.

sessionColumnLabels = [
    'Session Timestamp', 'Event Type', 'Product ID',
    'Category ID', 'Category Code', 'Brand',
    'Price', 'User ID', 'User Session ID',
]

# fetchall() returns all rows of the executed query; load them straight into a labelled DataFrame

userSession_Data = pd.DataFrame(cursor.fetchall(), columns=sessionColumnLabels)

# Preview the first 5 rows as a quick sanity check of the loaded data

userSession_Data.head()

Unnamed: 0,Session Timestamp,Event Type,Product ID,Category ID,Category Code,Brand,Price,User ID,User Session ID
0,2019-01-10,view,44600062,2.10381e+18,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-01-10,view,3900821,2.05301e+18,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-01-10,view,17200506,2.05301e+18,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-01-10,view,1307067,2.05301e+18,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-01-10,view,1004237,2.05301e+18,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [45]:
# Check the number of rows & columns with the DataFrame.shape attribute, which returns (rows, columns)

# Result recorded for this run:
# 1,048,575 rows x 9 columns

userSession_Data.shape

(1048575, 9)

In [46]:
# Check the data type of each column with the DataFrame.dtypes attribute

userSession_Data.dtypes

Session Timestamp    datetime64[ns]
Event Type                   object
Product ID                    int64
Category ID                  object
Category Code                object
Brand                        object
Price                       float64
User ID                       int64
User Session ID              object
dtype: object

In [51]:
# Pass the Session Timestamp column through pd.to_datetime using a
# year-month-day hour:minute:second format string.
# NOTE(review): the dtypes check earlier in the notebook already reports
# datetime64[ns] for this column, so this call probably leaves the values
# unchanged (the first rows simply all fall at midnight) - confirm whether this
# cell is still needed.

timestampFormat = "%Y-%m-%d %H:%M:%S"

userSession_Data['Session Timestamp'] = pd.to_datetime(
    userSession_Data['Session Timestamp'],
    format=timestampFormat,
)

In [58]:
# Display the DataFrame (the notebook shows a truncated preview of the first and last rows) to check the Session Timestamp values

userSession_Data

Unnamed: 0,Session Timestamp,Event Type,Product ID,Category ID,Category Code,Brand,Price,User ID,User Session ID
0,2019-01-10 00:00:00,view,44600062,2.10381E+18,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-01-10 00:00:00,view,3900821,2.05301E+18,appliances.environment.water_heater,aqua,33.20,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-01-10 00:00:00,view,17200506,2.05301E+18,furniture.living_room.sofa,,543.10,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-01-10 00:00:00,view,1307067,2.05301E+18,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-01-10 00:00:00,view,1004237,2.05301E+18,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d
...,...,...,...,...,...,...,...,...,...
1048570,2019-01-10 17:27:00,view,1005105,2.05301E+18,electronics.smartphone,apple,1415.48,537482499,60b0e052-920c-4469-9627-952aa88d0b16
1048571,2019-01-10 17:27:00,view,2601292,2.05301E+18,,gefest,47.62,553278643,97ca518d-44df-4fca-8081-4d2d95b85607
1048572,2019-01-10 17:27:00,view,3601241,2.05301E+18,appliances.kitchen.washer,lg,350.70,552637214,18f1b24c-dae4-4d7a-a470-3f78c6b15533
1048573,2019-01-10 17:27:00,view,1004754,2.05301E+18,electronics.smartphone,honor,257.38,542200836,fe582251-252a-4b79-af15-7c5c5ce8c6f1


In [119]:
## Count the occurrences of each Event Type: View, Purchase & Cart
# These counts can be used to calculate the overall conversion rate of the website

# Count the rows for each unique Event Type (most frequent first).
# The output labels are set explicitly with rename_axis / reset_index(name=...)
# instead of relying on the name pandas gives the value_counts result: that name
# differs between pandas 1.x and 2.x, and wrapping the result in
# pd.DataFrame(..., columns=[...]) yields an all-NaN column when it does not match.
eventTypes = (
    userSession_Data['Event Type']
    .value_counts()
    .rename_axis('Event Type')
    .reset_index(name='Count of Events')
)

# The Event Types are stored in lower case, so str.capitalize is used to upper-case the first letter for presentation
eventTypes['Event Type'] = eventTypes['Event Type'].str.capitalize()

# Display the result
eventTypes

Unnamed: 0,Event Type,Count of Events
0,View,1016239
1,Purchase,17296
2,Cart,15040


In [120]:
# Sum the Count of Events column to get the total number of recorded events across all Event Types.
# Note: this is a count of event rows, not a count of distinct User IDs.

totalVisits = eventTypes['Count of Events'].sum()

In [144]:
# Count how many rows carry each unique category code using value_counts()
# (rows with a missing category code are excluded by value_counts' default dropna=True).
#
# The counts column is named 'Category Code' explicitly via to_frame(name=...), and the
# index name is cleared with rename_axis(None), so the frame has the same layout on every
# pandas version. Without this, pandas 2.x names the counts column 'count' and the
# nlargest call below fails with a KeyError.
# NOTE: the 'Category Code' column holds the COUNT for each code; the codes themselves are the index.

categoryCodes = (
    userSession_Data['Category Code']
    .value_counts()
    .rename_axis(None)
    .to_frame(name='Category Code')
)

# Extract the top 20 category codes by count, using nlargest & assigning 20 to 'n' - This number can be amended if required

topCategoryCodes = categoryCodes.nlargest(n=20, columns=['Category Code'])