# Customer segmentation
Dataset from Kaggle: https://www.kaggle.com/code/fabiendaniel/customer-segmentation/

In [None]:
%%capture
%pip install graphdatascience pandas ipython numpy

In [None]:
import pandas as pd
from neo4j import GraphDatabase
from graphdatascience import GraphDataScience
import numpy as np

In [None]:
# Neo4j connection details
DB_URL = 'neo4j://localhost:7687'
DB_ULR = DB_URL  # alias for the original (misspelled) name so existing references keep working
DB_USER = 'neo4j'
# NOTE(review): the password is hard-coded; acceptable for a throwaway local
# instance, otherwise read it from the environment instead.
DB_PASS = 'test1234'
DB_NAME = 'custseg'

# Connect by URI through the regular constructor: from_neo4j_driver() is
# documented to take an already constructed neo4j Driver object, not a URI string.
gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS))
# Displays the GDS library version, which doubles as a connectivity check.
gds.version()

In [None]:
# Create the DB_NAME database unless it already exists; the statement is sent to the "system" database
create_db_statement = f"create database {DB_NAME} if not exists wait"
gds.run_cypher(create_db_statement, database="system")

In [None]:
# Switch the client to the DB_NAME database so the following queries run against it
gds.set_database(DB_NAME)

In [None]:
# Read the csv file with pandas.
# The identifier columns are forced to str so numeric-looking ids are not parsed
# as numbers (which would mix int and str ids within one column). read_csv
# ignores dtype keys that do not match a column header, so the keys have to be
# the headers used by the cells below: 'InvoiceNo' (not 'InvoiceID') and 'StockCode'.
raw_csv = pd.read_csv(
    './datasets/data.csv',
    encoding="ISO-8859-1",
    dtype={'CustomerID': str, 'InvoiceNo': str, 'StockCode': str},
)
raw_csv.head()

In [None]:
# Customers: distinct (CustomerID, Country) pairs, without rows that have a missing value
customer_columns = ['CustomerID', 'Country']
distinct_customers = raw_csv[customer_columns].drop_duplicates()
df_customers = distinct_customers.dropna()
df_customers.head()

In [None]:
# Create Customer nodes.
# The node key constraint is created first (if missing) so Customer.id is
# unique and always present before the merge below runs.
gds.run_cypher('create constraint if not exists for (n:Customer) require (n.id) is node key')
# One merge per input row: an existing Customer with the same id is reused and
# its country overwritten. Note that count(*) counts processed rows, which is
# not necessarily the number of nodes that were newly created.
create_customer_res = gds.run_cypher('''
    unwind $data as row
    merge (n:Customer{id: row.CustomerID})
        set n.country = row.Country
    return count(*) as customers_created
''', params = {'data': df_customers.to_dict('records')})
create_customer_res.head()

In [None]:
# Select products: distinct (StockCode, UnitPrice) pairs without missing values
df_products = raw_csv[['StockCode','UnitPrice']].drop_duplicates().dropna()

# We want to store prices in cents so we can represent them as
# integer values to avoid rounding errors later on.
# Binary floats make e.g. 2.55 * 100 evaluate to 254.99999999999997, and the
# toInteger() call in the load query truncates that to 254, so round to the
# nearest whole cent here before the values are sent to the database.
df_products['UnitPrice'] = (df_products['UnitPrice'] * 100).round()
df_products.head()

In [None]:
# Product nodes: make sure the node key on Product.id exists, then merge one node
# per input row and store the price as an integer property
gds.run_cypher('create constraint if not exists for (n:Product) require (n.id) is node key')
product_rows = df_products.to_dict('records')
product_query = '''
    unwind $data as row
    merge (n:Product{id: row.StockCode})
        set n.price = toInteger(row.UnitPrice)
    return count(*) as products_created
'''
create_product_res = gds.run_cypher(product_query, params={'data': product_rows})
create_product_res.head()

In [None]:
# Invoices: distinct (InvoiceNo, InvoiceDate) pairs, without rows that have a missing value
invoice_columns = ['InvoiceNo', 'InvoiceDate']
distinct_invoices = raw_csv[invoice_columns].drop_duplicates()
df_invoices = distinct_invoices.dropna()
df_invoices.head()

In [None]:
# Invoice nodes: make sure the node key on Invoice.id exists, then merge one node
# per input row. The date string is parsed to epoch milliseconds with
# apoc.date.parse (so the APOC plugin must be installed) and stored as a datetime.
# NOTE(review): the pattern 'dd/MM/yyyy HH:mm' reads InvoiceDate as day-first;
# confirm this matches the CSV, since a month-first file would be mis-parsed.
gds.run_cypher('create constraint if not exists for (n:Invoice) require (n.id) is node key')
invoice_rows = df_invoices.to_dict('records')
invoice_query = '''
    unwind $data as row
    with row,  apoc.date.parse(row.InvoiceDate, 'ms', 'dd/MM/yyyy HH:mm') as ms
    merge (n:Invoice{id: row.InvoiceNo})
        set n.invoice_date = datetime( { epochmillis: ms } )
    return count(*) as invoices_created
'''
create_invoice_res = gds.run_cypher(invoice_query, params={'data': invoice_rows})
create_invoice_res.head()

In [None]:
# Customer - invoice pairs: distinct (CustomerID, InvoiceNo) rows without missing values
billed_to_columns = ['CustomerID', 'InvoiceNo']
distinct_billed_to = raw_csv[billed_to_columns].drop_duplicates()
df_billed_to = distinct_billed_to.dropna()
df_billed_to.head()

In [None]:
# billed_to relationships: for every row, look up the existing Invoice and Customer
# nodes by id and merge (Invoice)-[:billed_to]->(Customer); rows whose nodes are
# not found produce no relationship because of the match
billed_to_rows = df_billed_to.to_dict('records')
billed_to_query = '''
    unwind $data as row
    match (i:Invoice{id: row.InvoiceNo}), (c:Customer{id: row.CustomerID})
    merge (i)-[:billed_to]->(c)
    return count(*) as bill_to_rels_created
'''
create_bill_to_res = gds.run_cypher(billed_to_query, params={'data': billed_to_rows})
create_bill_to_res.head()

In [None]:
# Invoice - product rows: distinct (InvoiceNo, StockCode, Quantity) triples without missing values
line_item_columns = ['InvoiceNo', 'StockCode', 'Quantity']
distinct_line_items = raw_csv[line_item_columns].drop_duplicates()
df_line_item = distinct_line_items.dropna()
df_line_item.head()

In [None]:
# line_item relationships: for every row, look up the existing Invoice and Product
# nodes by id, merge (Product)-[:line_item]->(Invoice) and store the quantity on it.
# NOTE(review): the merge keeps a single relationship per (product, invoice) pair,
# so if an invoice lists the same product on several rows only the qty written
# last survives - confirm that is intended.
line_item_rows = df_line_item.to_dict('records')
line_item_query = '''
    unwind $data as row
    match (i:Invoice{id: row.InvoiceNo}), (p:Product{id: row.StockCode})
    merge (p)-[li:line_item]->(i)
        set li.qty = toInteger(row.Quantity)
    return count(*) as line_item_rels_created
'''
create_line_item_res = gds.run_cypher(line_item_query, params={'data': line_item_rows})
create_line_item_res.head()

## Graph model so far
![](./images/graph_model.png)