In [1]:
from neo4j import GraphDatabase

uri = "bolt://localhost:7687"
username = "neo4j"
password = "password"
driver = GraphDatabase.driver(uri, auth=(username, password))

In [2]:
try:
    with driver.session() as session:
        result = session.run("RETURN 1 AS test")
    print("Connection successful")
except Exception as e:
    print(f"Connection failed: {e}")

Connection successful


In [3]:
import os
with open('./pathCSV.txt','r') as f:
    path = f.read()
os.path.exists(path)

True

In [4]:
business_path = "business.csv"
review_path = "review.csv"
# tip_path = "yelp_academic_dataset_tip.json"
user_path = "users.csv"

business_path, review_path, user_path = list(map(lambda y: path + '/' + y,[business_path, review_path, user_path]))
# business_path, review_path, user_path = list(map(lambda y:'"'+ path + '/' + y+'"',[business_path, review_path, user_path]))
print(business_path)

../filteredCSV/business.csv


In [11]:
import pandas as pd
import neo4j

users=pd.read_csv("usersAllFriends.csv")
# print(users)
users['friends']=users['friends'].fillna('').str.split(',')
# print(type(users['friends'][0]))
users['friends']=users['friends'].apply(set)
# print(type(users['friends'][0]))
# users=pd.read_csv(user_path)
# users['friends'].str.split(',')

n=50
selectRandom="""
        MATCH (a)-[:REVIEWS]->()
        RETURN a.user_id, rand() as r
        ORDER BY r Limit $n
    """
userList=driver.execute_query(selectRandom,n=n,database_="neo4j",result_transformer_=neo4j.Result.to_df)['a.user_id']
# print(userList)
accuracy_tot=0
total_tot=0
for user in userList:
	trueSet=users[users['user_id']==user]['friends'].iloc[0]
	query = """
        MATCH (n1:User {user_id:$user})-[r1:REVIEWS]->(b:Business)<-[r2:REVIEWS]-(n2:User)
        WHERE -2 < (r1.stars - r2.stars) < 2 AND NOT (n1)-[:HAS_FRIEND]->(n2)
        WITH n1, n2, b
        match  (b)--()
        with n1,n2, count(*) AS bDegree
        WITH n1, n2, SUM(1.0 / log(bDegree)) AS adamicScore

        MATCH (n1)-[:REVIEWS]->(:Business)-[:IN_CATEGORY]->(c:Category)<-[:IN_CATEGORY]-(:Business)<-[:REVIEWS]-(n2)
        WITH n1, n2, c, adamicScore, count(*) as sharedReviewCount
        match (c)--()
        WITH n1, n2, c, adamicScore, sharedReviewCount,count(*) AS cDegree

        WITH n1, n2, adamicScore, SUM(1.0*sharedReviewCount / cDegree) AS categoryBonus
        
        RETURN adamicScore + categoryBonus AS totalScore, n2.user_id order by totalScore desc limit 10
                """
	result=driver.execute_query(query,user=user,database_="neo4j",result_transformer_=neo4j.Result.to_df)['n2.user_id']
	total=len(result)
	# print(total)
	correct=0
	# print("pred",result)
	for prediction in result:
		if prediction in trueSet:
			correct+=1
	if total!=0:
		accuracy=100*correct/total
	else:
		accuracy= 0	
	accuracy_tot+=correct
	total_tot+=total
	# print(f"total:{total}, accuracy:{accuracy}%")

print(f"Average accuracy: {100*accuracy_tot/total_tot:.2f}%")

users['friends'].map(len).describe()

Average accuracy: 50.56%


count    50241.000000
mean        78.115145
std        181.774114
min          1.000000
25%         14.000000
50%         29.000000
75%         69.000000
max       5670.000000
Name: friends, dtype: float64

In [None]:
import plotly.graph_objects as go

n = 50
selectRandom = """
        MATCH (a)-[:REVIEWS]->()
        RETURN a.user_id, rand() as r
        ORDER BY r Limit $n
    """
userList = driver.execute_query(selectRandom, n=n, database_="neo4j", result_transformer_=neo4j.Result.to_df)['a.user_id']

accuracy_tot = []
reviews = []
stars = []

for user in userList:
    trueSet = users[users['user_id'] == user]['friends'].iloc[0]
    query = """
        MATCH (n1:User {user_id:$user})-[r1:REVIEWS]->(b:Business)<-[r2:REVIEWS]-(n2:User)
        WHERE -2 < (r1.stars - r2.stars) < 2 AND NOT (n1)-[:HAS_FRIEND]->(n2)
        WITH n1, n2, b
        match  (b)--()
        with n1,n2, count(*) AS bDegree
        WITH n1, n2, SUM(1.0 / log(bDegree)) AS adamicScore

        MATCH (n1)-[:REVIEWS]->(:Business)-[:IN_CATEGORY]->(c:Category)<-[:IN_CATEGORY]-(:Business)<-[:REVIEWS]-(n2)
        WITH n1, n2, c, adamicScore, count(*) as sharedReviewCount
        match (c)--()
        WITH n1, n2, c, adamicScore,count(*) AS cDegree,sharedReviewCount

        WITH n1, n2, adamicScore, SUM(1.0 * sharedReviewCount / cDegree) AS categoryBonus
        
        RETURN adamicScore + categoryBonus AS totalScore, n2.user_id order by totalScore desc limit 10
    """
    result = driver.execute_query(query, user=user, database_="neo4j", result_transformer_=neo4j.Result.to_df)['n2.user_id']
    total = len(result)
    correct = 0

    for prediction in result:
        if prediction in trueSet:
            correct += 1

    if total != 0:
        accuracy = 100 * correct / total
    else:
        accuracy = 0

    findRevs = """
        MATCH (n1:User {user_id:$user})-[r1:REVIEWS]->()
        WITH COUNT(*) AS numRevs, AVG(r1.stars) AS stars
        RETURN numRevs, stars
    """
    RevInfo = driver.execute_query(findRevs, user=user, database_="neo4j", result_transformer_=neo4j.Result.to_df)
    accuracy_tot.append(accuracy)
    reviews.append(RevInfo['numRevs'][0])
    stars.append(RevInfo['stars'][0])

# Create scatter plot for review count vs accuracy
fig1 = go.Figure()
fig1.add_trace(go.Scatter(
    x=reviews,
    y=accuracy_tot,
    mode='markers',
    name='Review Count vs Accuracy',
    marker=dict(size=8, color='blue', opacity=0.7)
))
fig1.update_layout(
    title="Review Count vs Accuracy",
    xaxis_title="Review Count",
    yaxis_title="Accuracy (%)",
    height=600,
    width=800
)

# Create scatter plot for stars vs accuracy
fig2 = go.Figure()
fig2.add_trace(go.Scatter(
    x=stars,
    y=accuracy_tot,
    mode='markers',
    name='Stars vs Accuracy',
    marker=dict(size=8, color='red', opacity=0.7)
))
fig2.update_layout(
    title="Stars vs Accuracy",
    xaxis_title="Stars",
    yaxis_title="Accuracy (%)",
    height=600,
    width=800
)

# Show the plots
fig1.show()
fig2.show()

In [14]:
from scipy.stats import pearsonr
x = users['review_count']
y = users['friends'].map(len)

corr, p_value = pearsonr(x, y)
print(f"Pearson correlation coefficient: {corr}")
print(f"P-value: {p_value}") # p calue is probability that correlation is insigificant

alpha = 0.01  # Significance level
if p_value < alpha:
    print("The correlation is statistically significant.")
else:
    print("The correlation is not statistically significant.")

Pearson correlation coefficient: 0.40555235045679794
P-value: 0.0
The correlation is statistically significant.
