In [1]:
from py2neo import Graph
import pandas as pd

ModuleNotFoundError: No module named 'py2neo'

In [None]:
graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo"))

# Discrete Features

Let's start by predicting reviews based only on discrete features.

In [None]:
query = """
MATCH (:Category {name: $category})<-[:IN_CATEGORY]-(business:Business)-[:IN_CITY]->(:City {name: $city})
MATCH (business:Business)<-[:REVIEWS]-(review:Review)<-[:WROTE]-(user:User)
WITH business, user,
     size((user)-[:FRIENDS]->()) AS userFriends,
     review.stars AS stars,
     review
LIMIT 1000
OPTIONAL MATCH (business)<-[:REVIEWS]-(otherRev)<-[:WROTE]-(other:User)-[:FRIENDS]->(user) WHERE otherRev.date < review.date
WITH  user, business, userFriends, avg(otherRev.stars) AS aveOtherStars, stars
OPTIONAL MATCH (user)-[:WROTE]->(review) WHERE not((review)-[:REVIEWS]->(business))
RETURN user.id AS userId,
       userFriends,
       aveOtherStars,
       avg(review.stars) AS aveMyStars,
       CASE WHEN stars > 3 THEN "true" ELSE "false" END as stars
"""

df = graph.run(query, {"city": "Las Vegas", "category": "Restaurants"}).to_data_frame()
display(df.head())

# How many tips has the user given?
# How many tips does the business have? 
# Features to add
# Is the business in the same cluster as the user?
# Influential friend ranked it highly? (PR bigger than $score)
# Influential friends who ranked it lowly?
# Photos and tips?
# Triadic balance / Triangle count? 
# Cluster based on people reviewing the same places

Unnamed: 0,aveMyStars,aveOtherStars,stars,userFriends,userId
0,5.0,,True,0,vefBMC37_FtVXA6gdGcRKg
1,3.6,,True,44,5gKZq8-yNns_z5UBlRQggQ
2,3.821918,4.0,True,149,v3YOx9T4jRSw8XP9Rd9H4g
3,3.304348,,True,17,8bRfSEzLoDoa8cjHOWOljw
4,3.25,,False,1,jbn7XQV7CngRu0sA6cZN1Q


In [None]:
columns = ['userFriends', "aveOtherStars", "aveMyStars"]
X = df[columns]
y = df[['stars']]

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [None]:
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier

t0 = time.clock()

random_forest = RandomForestClassifier(n_estimators=30, max_depth=10, random_state=1)
random_forest.fit(X_train, y_train.values.ravel())

y_predict = random_forest.predict(X_test)

display(accuracy_score(y_test, y_predict))
display(precision_score(y_test, y_predict, average="binary"))
display(recall_score(y_test, y_predict, average="binary"))

run_time = time.clock() - t0
print('RFC max-depth=10 and n-estimator=30 run in %.3f s' % run_time)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
predictions_df = pd.DataFrame({"predict": y_predict, "actual": y_test["stars"]})
display(predictions_df.head(10))

Unnamed: 0,predict,actual
44,True,True
172,True,True
163,False,True
35,False,True
136,True,True
11,True,True
123,True,True
82,True,True
175,True,False
102,True,True


In [None]:
for score, feature in zip(random_forest.feature_importances_, columns):
    print(feature, score)

userFriends 0.3543850096691566
aveOtherStars 0.1958516374111908
aveMyStars 0.44976335291965264
