In [26]:
# pip install hdfs
# ./hdfscli.cfg

#     [global]
#     default.alias = dev

#     [dev.alias]
#     url = http://localhost:9870

# from hdfs import Config

# client = Config().get_client('dev')
# test = client.list('/test')
# print(test)

# with client.read('/test/movies.csv') as reader:
#     movies = reader.read()

In [27]:
# pip install findspark
import findspark
findspark.init()

In [28]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Spark basic example").getOrCreate()

In [29]:
ratings = spark.read.csv('hdfs://localhost:9001/test/ratings.csv', header=True, inferSchema=True)

In [30]:
movies = spark.read.csv('hdfs://localhost:9001/test/movies.csv', header=True, inferSchema=True)

In [31]:
user_movie_matrix = ratings.join(movies, on="movieId", how="inner")

In [32]:
mapping_score = {
    0.5: -1,
    1: -1,
    1.5: -0.5,
    2: 0,
    2.5: 0,
    3: 0,
    3.5: 0.5,
    4: 1,
    4.5: 1.1,
    5: 1.2
}

In [33]:
from pyspark.sql.types import IntegerType, FloatType, StringType
from pyspark.sql.functions import col

map_score_udf = spark.udf.register("map_score", lambda x: mapping_score.get(x, 0), FloatType())
user_movie_matrix = user_movie_matrix.withColumn("weight", map_score_udf(col("rating")))

In [34]:
edges = user_movie_matrix.select(
    col("userId").cast(StringType()).alias("src"),
    col("movieId").alias("dst"),
    col("weight")
)

In [35]:
from pyspark.sql.functions import lit

user_vertices = user_movie_matrix.select(col("userId").cast(StringType()).alias("id")).distinct().withColumn("bipartite", lit(0))
movie_vertices = user_movie_matrix.select(col("movieId").alias("id")).distinct().withColumn("bipartite", lit(1))
vertices = user_vertices.union(movie_vertices)

In [36]:
# https://stackoverflow.com/questions/39261370/unable-to-run-a-basic-graphframes-example

# Depending on your spark version, all you have to do is download the graphframe jar corresponding to your version
# of spark here https://spark-packages.org/package/graphframes/graphframes.

# Then you'll have to copy the jar downloaded to your spark jar directory and rename it to graphframes.jar.

# # # pyspark --packages graphframes:graphframes-0.8.4-spark3.5-s_2.12 --jars graphframes-0.8.4-spark3.5-s_2.12.jar
from graphframes import GraphFrame

user_movie_graph = GraphFrame(vertices, edges)



In [37]:
user_user_edges = user_movie_graph.edges.alias("e1") \
    .join(user_movie_graph.edges.alias("e2"), col("e1.dst") == col("e2.dst")) \
    .select(
        col("e1.src").alias("src"),
        col("e2.src").alias("dst"),
        (col("e1.weight") + col("e2.weight")).alias("weight")
    ).filter(col("src") != col("dst"))

user_user_graph = GraphFrame(user_movie_graph.vertices, user_user_edges)

In [38]:
# Project movie-movie graph
movie_movie_edges = user_movie_graph.edges.alias("e1") \
    .join(user_movie_graph.edges.alias("e2"), col("e1.src") == col("e2.src")) \
    .select(
        col("e1.dst").alias("src"),
        col("e2.dst").alias("dst"),
        (col("e1.weight") + col("e2.weight")).alias("weight")
    ).filter(col("src") != col("dst"))

movie_movie_graph = GraphFrame(user_movie_graph.vertices, movie_movie_edges)

In [39]:
def create_preference_vector(user_id, user_movie_graph):
    edges = user_movie_graph.edges.filter(col("src") == user_id).rdd.map(lambda row: (row["dst"], row["weight"])).collect()
    print(f"Preference vector for user {user_id}: {edges}")
    
    #
    tot = sum([weight for _, weight in edges])
    #
    
    print(f"Total weight for user {user_id}: {tot}")
    if tot > 0:
        return {movie: weight / tot for movie, weight in edges}
    else:
        movies = user_movie_graph.vertices.filter(col("bipartite") == 1).select("id").rdd.map(lambda row: row[0]).collect()
        return {movie: 1 / len(movies) for movie in movies}

In [40]:
def predict_user(user_id, user_movie_graph, movie_movie_graph):
    p_vec = create_preference_vector(user_id, user_movie_graph)
    # print(f"Preference vector for user {user_id}: {p_vec}")
    already_seen = [movie for movie, weight in p_vec.items() if weight > 0]
    # print(f"Already seen movies for user {user_id}: {already_seen}")
    if len(already_seen) == len(p_vec):
        return []
    pagerank_results = movie_movie_graph.pageRank(resetProbability=0.95, maxIter=20)
    item_rank = pagerank_results.vertices.select("id", "pagerank").rdd.map(lambda row: (row["id"], row["pagerank"])).collectAsMap()
    recommendations = sorted(
        (movie for movie in item_rank if movie not in already_seen),
        key=lambda x: item_rank[x], reverse=True
    )
    return recommendations

In [41]:
user = "10"
s_t = predict_user(user, user_movie_graph, movie_movie_graph)
print(f"Predicted movies for user {user}: {s_t[:10]}")

Preference vector for user 10: [(296, None), (356, 0.5), (588, None), (597, 0.5), (912, None), (1028, None), (1088, None), (1247, None), (1307, None), (1784, 0.5), (1907, None), (2571, None), (2671, 0.5), (2762, None), (2858, None), (2959, None), (3578, None), (3882, None), (4246, 0.5), (4306, 1.100000023841858), (4447, 1.100000023841858), (4993, None), (4995, None), (5066, None), (5377, 0.5), (5620, None), (5943, None), (5952, None), (5957, None), (6155, None), (6266, None), (6377, 0.5), (6535, None), (6942, None), (7149, None), (7151, None), (7153, None), (7154, None), (7169, 1.100000023841858), (7293, 0.5), (7375, None), (7451, 0.5), (7458, 1.2000000476837158), (8529, 0.5), (8533, 1.2000000476837158), (8636, 0.5), (8665, 0.5), (8808, None), (8869, 1.2000000476837158), (8961, None), (8969, 0.5), (8970, None), (30749, 0.5), (31433, None), (31685, 1.100000023841858), (33145, None), (33679, None), (33794, 1.2000000476837158), (40629, None), (40819, None), (41285, None), (47099, None), (

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'