# Upload the dataset on neo4j

how to connect to your database: https://neo4j.com/docs/getting-started/languages-guides/neo4j-python/

how to import data: https://neo4j.com/docs/python-manual/current/query-simple/

best practices: https://neo4j.com/developer-blog/neo4j-driver-best-practices/

In [2]:
!pip install neo4j



In [49]:
database = "mb50"             # mb50  mb100  mb200

# Queries

### Query a

For each customer checks that the spending frequency and the spending amounts of the last
month is under the usual spending frequency and the spending amounts for the same
period.

is session.run faster? no, even slower!

In [57]:
from datetime import datetime, timedelta

from neo4j import GraphDatabase
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "12345678")

# compare last month with the same month of the last year. es: gen 2024 and gen 2023
now = datetime.strptime("2024-03-08 12:00:00", "%Y-%m-%d %H:%M:%S")
start_of_current_month = datetime(now.year, now.month, 1)
end_of_last_month = start_of_current_month - timedelta(days=1)
start_of_last_month = datetime(end_of_last_month.year, end_of_last_month.month, 1)
print(start_of_last_month)
print(start_of_current_month)

end_of_last_year = start_of_current_month - timedelta(days=365)
start_of_last_year = start_of_last_month - timedelta(days=365)
print(start_of_last_year)
print(end_of_last_year)

parameters = {
    "start_of_last_month": start_of_last_month.strftime("%Y-%m-%dT%H:%M:%S"),
    "start_of_current_month": start_of_current_month.strftime("%Y-%m-%dT%H:%M:%S"),
    
    "start_of_last_year": start_of_last_year.strftime("%Y-%m-%dT%H:%M:%S"),
    "end_of_last_year": end_of_last_year.strftime("%Y-%m-%dT%H:%M:%S"),
}
    
query = """
        MATCH (c:Customer)-[t:MADE_TRANSACTION]->(:Terminal)
        WHERE (t.tx_datetime >= datetime($start_of_last_month) AND t.tx_datetime < datetime($start_of_current_month)) OR (t.tx_datetime >= datetime($start_of_last_year) AND t.tx_datetime < datetime($end_of_last_year))
        WITH c,
           COUNT(CASE WHEN t.tx_datetime >= datetime($start_of_last_month) AND t.tx_datetime < datetime($start_of_current_month) THEN 1 ELSE NULL END) AS lastMonthFrequency,
           SUM(CASE WHEN t.tx_datetime >= datetime($start_of_last_month) AND t.tx_datetime < datetime($start_of_current_month) THEN toFloat(t.tx_amount) ELSE 0.0 END) AS lastMonthAmount,
           
           COUNT(CASE WHEN t.tx_datetime >= datetime($start_of_last_year) AND t.tx_datetime < datetime($end_of_last_year) THEN 1 ELSE NULL END) AS lastYearFrequency,
           SUM(CASE WHEN t.tx_datetime >= datetime($start_of_last_year) AND t.tx_datetime < datetime($end_of_last_year) THEN toFloat(t.tx_amount) ELSE 0.0 END) AS lastYearAmount
        
        RETURN c.customer_id AS customerId,
               CASE
                  WHEN lastMonthFrequency < lastYearFrequency
                  THEN "under the usual"
                  ELSE "over the usual"
               END AS spending_frequency,
               CASE
                  WHEN lastMonthAmount < lastYearAmount
                  THEN "under the usual"
                  ELSE "over the usual"
               END AS spending_amounts
        """

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database=database) as session:
        result = session.run(query, parameters)
        
        # Loop on result
        for record in result:
            print(record)
            
        summary = result.consume()
        records_count = summary.counters.nodes_created

        print("The query returned {records_count} records.".format(records_count=records_count))

2024-02-01 00:00:00
2024-03-01 00:00:00
2023-02-01 00:00:00
2023-03-02 00:00:00
<Record customerId=0 spending_frequency='over the usual' spending_amounts='over the usual'>
<Record customerId=1 spending_frequency='under the usual' spending_amounts='over the usual'>
<Record customerId=2 spending_frequency='under the usual' spending_amounts='under the usual'>
<Record customerId=3 spending_frequency='over the usual' spending_amounts='over the usual'>
<Record customerId=4 spending_frequency='under the usual' spending_amounts='under the usual'>
<Record customerId=5 spending_frequency='over the usual' spending_amounts='under the usual'>
<Record customerId=6 spending_frequency='under the usual' spending_amounts='under the usual'>
<Record customerId=7 spending_frequency='over the usual' spending_amounts='over the usual'>
<Record customerId=8 spending_frequency='under the usual' spending_amounts='under the usual'>
<Record customerId=9 spending_frequency='over the usual' spending_amounts='under t

### Query b

For each terminal identify the possible fraudulent transactions. The fraudulent transactions
are those whose import is higher than 20% of the maximal import of the transactions
executed on the same terminal in the last month.

In [73]:
from neo4j import GraphDatabase

URI = "bolt://localhost:7687"
AUTH = ("neo4j", "12345678")
 
query = """
        MATCH (:Customer)-[t:MADE_TRANSACTION]->(tm:Terminal)<-[fraud:MADE_TRANSACTION]-(:Customer)
        WHERE t.tx_datetime >= fraud.tx_datetime - duration({months: 1}) AND t.tx_datetime < fraud.tx_datetime
        WITH tm, fraud, MAX(t.tx_amount) AS max_amount
        WHERE fraud.tx_amount > max_amount*1.2
        RETURN DISTINCT tm.terminal_id as terminalId,
               COLLECT(fraud.transaction_id) as transactionsId
        ORDER BY tm.terminal_id
        """

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database=database) as session:
        result = session.run(query)
        
        # Loop on result
        for record in result:
            print(record)
            
        summary = result.consume()
        records_count = summary.counters.nodes_created

        print("The query returned {records_count} records.".format(records_count=records_count))

TransientError: {code: Neo.TransientError.General.MemoryPoolOutOfMemoryError} {message: The allocation of an extra 2.0 MiB would use more than the limit 716.8 MiB. Currently using 716.0 MiB. dbms.memory.transaction.total.max threshold reached}

### Query c

Given a user u, determine the “co-customer-relationships CC of degree k”. A user u’ is a co-
customer of u if you can determine a chain “u1-t1-u2-t2-…tk-1-uk“ such that u1=u, uk=u’, and for
each 1<=I,j<=k, ui <> uj, and t1,..tk-1 are the terminals on which a transaction has been
executed. Therefore, CCk(u)={u’| a chain exists between u and u’ of degree k}. Please, note
that depending on the adopted model, the computation of CCk(u) could be quite
complicated. Consider therefore at least the computation of CC3(u) (i.e. the co-costumer
relationships of degree 3).

grado tre significa:
(u:Customer)-[:MADE_TRANSACTION]->(:Terminal)<-[:MADE_TRANSACTION]-(:Customer)-[:MADE_TRANSACTION]->(:Terminal)<-[:MADE_TRANSACTION]-(u3:Customer)

contiamo 4 relationship di tipo transaction
            
MATCH p=(c1:Customer {CUSTOMER_ID: toInteger($parameters.user_id)})-[:MADE_TRANSACTION*2]-(c2:Customer)
            WHERE c1 <> c2
            UNWIND nodes(p) as n
            WITH p, c2, count(DISTINCT n) as num
            RETURN DISTINCT c2.CUSTOMER_ID as customerId, min(num)
            LIMIT 1000
            
non so perchè ma senza limit carica all'infinito, anche se ritorna 40 tuple in ogni caso...

In [65]:
from neo4j import GraphDatabase

URI = "bolt://localhost:7687"
AUTH = ("neo4j", "12345678")

parameters = {
            "user_id": "390",
        }
 
query = """
        MATCH p=(c1:Customer {customer_id: toInteger($user_id)})-[:MADE_TRANSACTION*4]-(c2:Customer)
        WHERE c1 <> c2
        RETURN DISTINCT c2.customer_id as customerId
        LIMIT 1000
        """

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database=database) as session:
        result = session.run(query, parameters)
        
        # Loop on result
        for record in result:
            print(record)
            
        summary = result.consume()
        records_count = summary.counters.nodes_created

        print("The query returned {records_count} records.".format(records_count=records_count))

<Record customerId=167>
<Record customerId=128>
<Record customerId=1360>
<Record customerId=208>
<Record customerId=926>
<Record customerId=907>
<Record customerId=11>
<Record customerId=1258>
<Record customerId=927>
<Record customerId=633>
<Record customerId=1131>
<Record customerId=1383>
<Record customerId=894>
<Record customerId=1344>
<Record customerId=1498>
<Record customerId=624>
<Record customerId=1215>
<Record customerId=1018>
<Record customerId=154>
<Record customerId=1376>
<Record customerId=9>
<Record customerId=1063>
<Record customerId=1259>
<Record customerId=679>
<Record customerId=479>
<Record customerId=34>
<Record customerId=134>
<Record customerId=657>
<Record customerId=959>
<Record customerId=0>
<Record customerId=476>
<Record customerId=846>
<Record customerId=764>
<Record customerId=341>
<Record customerId=1158>
<Record customerId=547>
<Record customerId=677>
<Record customerId=64>
<Record customerId=72>
<Record customerId=1446>
<Record customerId=885>
<Record cus

### Query d.i

Extend the logical model that you have stored in the NOSQL database by introducing the
following information:

  i. Each transaction should be extended with:
      1. The period of the day {morning, afternoon, evening, night} in which the
         transaction has been executed.
      2. The kind of products that have been bought through the transaction {high-
         tech, food, clothing, consumable, other}
      3. The feeling of security expressed by the user. This is an integer value
         between 1 and 5 expressed by the user when conclude the transaction.
  The values can be chosen randomly.

In [ ]:
from neo4j import GraphDatabase
import random

URI = "bolt://localhost:7687"
AUTH = ("neo4j", "12345678")
 
query1 = """
        MATCH (:Customer)-[t:MADE_TRANSACTION]->(:Terminal) 
        RETURN t.transaction_id
        """

query2 = """
        UNWIND  $batch AS trans
        WITH collect(trans) AS transactionList
        MATCH (:Customer)-[t:MADE_TRANSACTION]->(:Terminal)  WHERE t.transaction_id IN transactionList
        SET
            t.period_of_day = 
                CASE
                    WHEN datetime(t.tx_datetime).hour >= 6 AND datetime(t.tx_datetime).hour < 12 THEN "morning"
                    WHEN datetime(t.tx_datetime).hour >= 12 AND datetime(t.tx_datetime).hour < 18 THEN "afternoon"
                    WHEN datetime(t.tx_datetime).hour >= 18 AND datetime(t.tx_datetime).hour < 24 THEN "evening"
                    ELSE "night"
                END,
                
            t.product_type = 
                CASE
                  WHEN $rand < 0.2 THEN "high-tech"
                  WHEN $rand < 0.4 THEN "food"
                  WHEN $rand < 0.6 THEN "clothing"
                  WHEN $rand < 0.8 THEN "consumable"
                  ELSE "other"
                END,
                
            t.security_feeling = toInteger(rand() * 5) + 1
        """

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database=database) as session:
        result = session.run(query1)
        
        transaction_ids = [r["t.transaction_id"] for r in result]
        BATCH_SIZE = 1000
        
        for i in range(0, len(transaction_ids), BATCH_SIZE):
            batch = transaction_ids[i:i+BATCH_SIZE]
            result = session.run(query2, batch=batch, rand=random.random())
            
        summary = result.consume()
        records_count = summary.counters.nodes_created

        print("The query returned {records_count} records.".format(records_count=records_count))

 ### Query d.ii
 
ii.  Customers that make more than three transactions from the same terminal
      expressing a similar average feeling of security should be connected as
      “buying_friends”. Therefore also this kind of relationship should be explicitly stored
      in the NOSQL database and can be queried. Note, two average feelings of security
      are considered similar when their difference is lower than 1.

In [ ]:
from neo4j import GraphDatabase
import random

URI = "bolt://localhost:7687"
AUTH = ("neo4j", "12345678")

parameters = {
            "user_id": "390",
        }
 
query1 = """
        MATCH (c1:Customer)-[t1:MADE_TRANSACTION]->(t:Terminal)<-[t2:MADE_TRANSACTION]-(c2:Customer)
        WITH c1, c2, COUNT(DISTINCT t1) as count1, COUNT(DISTINCT t2) as count2, AVG(t1.security_feeling) as avg1, AVG(t2.security_feeling) as avg2
        WHERE c1<>c2 AND count1 > 3 AND count2 > 3 AND ABS(avg1-avg2)<1
        RETURN c1.customer_id as c1, c2.customer_id as c2
        """

query2 = """
        UNWIND $batch AS customers
        MATCH (c1:Customer {customer_id: toInteger(customers[0])})
        MATCH (c2:Customer {customer_id: toInteger(customers[1])})
        MERGE (c1)-[:BUYING_FRIENDS]-(c2)
        """

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database=database) as session:
        result = session.run(query1)
        
        customers_ids = list((record["c1"], record["c2"]) for record in result)
        BATCH_SIZE = 1000
        
        for i in range(0, len(customers_ids), BATCH_SIZE):
            batch = transaction_ids[i:i+BATCH_SIZE]
            result = session.run(query2, batch=batch)
            
        summary = result.consume()
        records_count = summary.counters.nodes_created

        print("The query returned {records_count} records.".format(records_count=records_count))

### Query e

For each period of the day identifies the number of transactions that occurred in that period,
and the average number of fraudulent transactions.

In [66]:
from neo4j import GraphDatabase

URI = "bolt://localhost:7687"
AUTH = ("neo4j", "12345678")

parameters = {
            "user_id": "390",
        }
 
query = """
        MATCH (:Customer)-[t:MADE_TRANSACTION]->(:Terminal)
        WITH DISTINCT t.period_of_day as period, count(t) as num, COUNT(CASE WHEN t.TX_FRAUD > 0 THEN 1 END) as numFraud
        RETURN period, num, round(toFloat(numFraud)/toFloat(num)*100, 2) as fraudAVG
        """

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database=database) as session:
        result = session.run(query, parameters)
        
        # Loop on result
        for record in result:
            print(record)
            
        summary = result.consume()
        records_count = summary.counters.nodes_created

        print("The query returned {records_count} records.".format(records_count=records_count))

<Record period='afternoon' num=426053 fraudAVG=0.0>
<Record period='evening' num=147405 fraudAVG=0.0>
<Record period='night' num=146907 fraudAVG=0.0>
<Record period='morning' num=426703 fraudAVG=0.0>
The query returned 0 records.
