<a href="https://colab.research.google.com/github/megmenegazzi/AMD-project/blob/main/AMD_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Install pyspark packages**

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 47.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=fc53cce7006a350990e67317e694ce2ed7726a5d6df1753dd8483637044f0d51
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [2]:
!pip install -q findspark

## **Instancing pyspark rdd**

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [4]:
import pyspark
type(spark)

pyspark.sql.session.SparkSession

In [5]:
sc = spark.sparkContext

In [6]:
import os
import findspark
import pandas as pd
import numpy as np

## **Load Data**

In [7]:
os.environ["KAGGLE_USERNAME"] = "margheritamenegazzi"
os.environ["KAGGLE_KEY"] = "75953089094982034e32cf970ea2d0e2"

In [8]:
!kaggle datasets download ashirwadsangwan/imdb-dataset --unzip

Downloading imdb-dataset.zip to /content
 99% 1.02G/1.04G [00:08<00:00, 118MB/s]
100% 1.04G/1.04G [00:08<00:00, 133MB/s]


## **Clean Data**

In [9]:
# import as dataframe

data = spark.read.option("delimiter", "\t").option("header", True).csv("title.principals.tsv/data.tsv").limit(10000) 

In [10]:
# drop unwanted columns

data1 = data.drop("ordering","category","job", "characters")

In [11]:
# rename columns

data2 = data1.selectExpr("tconst as title", "nconst as actor")


In [12]:
# define funcitons to drop unwanted characters

from pyspark.sql.functions import udf,col

udf_title_change = udf(lambda title : int(title[2:]))
udf_actor_change = udf(lambda actor : int(actor[2:]))



In [13]:
# create new dataframe with clean data

data3 = data2.withColumn("title",udf_title_change(col("title")))
data4 = data3.withColumn("actor",udf_actor_change(col("actor")))

data4.show(5)

+-----+-------+
|title|  actor|
+-----+-------+
|    1|1588970|
|    1|   5690|
|    1| 374658|
|    2| 721526|
|    2|1335271|
+-----+-------+
only showing top 5 rows



## **Load and analyze Dataset on Spark**

In [14]:
# create tuples with title and actor

rdd = data4.rdd

simple_rdd = rdd.map(tuple)

simple_rdd.take(5)

[('1', '1588970'),
 ('1', '5690'),
 ('1', '374658'),
 ('2', '721526'),
 ('2', '1335271')]

In [16]:
# invert key and value, first actor then title

inverted = simple_rdd.map(lambda t : (t[1], t[0]))
inverted.take(2) 

[('1588970', '1'), ('5690', '1')]

In [17]:
# link movies with the same actor

joined = inverted.join(inverted)
joined.take(2) 

[('5690', ('1', '1')), ('5690', ('1', '5'))]

In [18]:
# remove self loops

filtered = joined.filter(lambda x : x[1][0]!= x[1][1])
filtered.take(2) 

[('5690', ('1', '5')), ('5690', ('1', '6'))]

In [19]:
# keep the link list

links = filtered.map(lambda x : x[1])
links.take(2) 

[('1', '5'), ('1', '6')]

In [20]:
# define function that computes the entries of the adjacency matrix

def adj(x,y):
  
  for elem in y:
    x.append(elem)
  return x   

In [21]:
adjacency1 = links.mapValues(lambda v: [v])
adjacency1.take(2)

[('1', ['5']), ('1', ['6'])]

In [22]:
# create the adjacency matrix

adjacency = adjacency1.reduceByKey(adj)
adjacency.take(2) 

[('1',
  ['5',
   '6',
   '7',
   '8',
   '36',
   '60',
   '76',
   '82',
   '108',
   '109',
   '110',
   '111',
   '112',
   '113',
   '135',
   '201',
   '247',
   '464',
   '7',
   '8',
   '38',
   '142',
   '192']),
 ('8',
  ['1',
   '5',
   '6',
   '7',
   '36',
   '60',
   '76',
   '82',
   '108',
   '109',
   '110',
   '111',
   '112',
   '113',
   '135',
   '201',
   '247',
   '464',
   '1',
   '7',
   '38',
   '142',
   '192'])]

In [23]:
# define function that computes the entries of the connection matrix

def conn(x):
  k, v = x
  result = []
  for vi in v:
    entry = (k, vi, 1/len(v))
    result.append(entry)
  return result



In [24]:
# create the connection matrix

connection = adjacency.flatMap(conn)
connection.take(10) 

[('1', '5', 0.043478260869565216),
 ('1', '6', 0.043478260869565216),
 ('1', '7', 0.043478260869565216),
 ('1', '8', 0.043478260869565216),
 ('1', '36', 0.043478260869565216),
 ('1', '60', 0.043478260869565216),
 ('1', '76', 0.043478260869565216),
 ('1', '82', 0.043478260869565216),
 ('1', '108', 0.043478260869565216),
 ('1', '109', 0.043478260869565216)]

In [25]:
# define the keys list

KL = sorted(connection.map(lambda x : x[0]).distinct().collect()) 

KL[:5]

['1', '10', '100', '1000', '1001']

In [26]:
# dictionary that maps every movie id to its position in sequence

dizionario = dict(zip(KL, range(len(KL)))) 


In [28]:
# define map function that takes the connection matrix and substitutes the original movie id values with the sequantial numbers from the dictionary


def remap(x):
  
  #scompose
  i,j,f=x

  #map
  i = dizionario[i]
  j = dizionario[j]

  #recompose
  nuovatupla = ((i),(j,f)) # i: arrival node, j: starting node, f: probability of going from j to i

  return nuovatupla


mapped = connection.map(remap)

In [34]:
#mapped.take(5)

[(0, (1524, 0.043478260869565216)),
 (0, (1626, 0.043478260869565216)),
 (0, (1732, 0.043478260869565216)),
 (0, (1839, 0.043478260869565216)),
 (0, (1373, 0.043478260869565216))]

In [36]:
# create the array for the page rank

n = len(KL)
page_rank = np.ones(n)/n # time t
old_page_rank = np.ones(n) # time -1

In [37]:
# define a function that measures the distance to make the page rank converge

# MSE distance between old page rank and page rank

def l2distance(v, q):
    
    if len(v) != len(q):
        raise ValueError(f'Cannot compute the distance of two vectors of size {len(v)} and {len(q)}')
    
    return sum([(q_el - v_el)**2 for v_el, q_el in zip(v, q)])

In [38]:
# compute the page rank

tolerance = 10e-70
max_iterations = 350
 
iteration = 0


while(l2distance(old_page_rank, page_rank) >= tolerance and iteration < max_iterations):
  
  
    old_page_rank = np.copy(page_rank) # at each cycle the new 'old' value becomes the old 'new' value 
    page_rank_values = (mapped
                        .mapValues(lambda v: v[1]*page_rank[v[0]]) # multiplication row-col between matrix and vector
                        .reduceByKey(lambda a, b: a+b) # sum up the results of the correct rows and columns
                        .sortByKey()
                        .collect()
                       )
  
    print(page_rank_values[:5]) # page-rank values of the first 5 movies
    page_rank = np.array([c for (i, c) in page_rank_values])
    
    print(iteration)
    iteration += 1

[(0, 0.00048638132295719845), (1, 0.00048638132295719856), (2, 0.00048638132295719796), (3, 0.0004863813229571968), (4, 0.0004863813229571879)]
0
[(0, 0.0004863813229571972), (1, 0.0004863813229571985), (2, 0.0004863813229571979), (3, 0.0004863813229571972), (4, 0.0004863813229571914)]
1
[(0, 0.0004863813229571983), (1, 0.00048638132295719856), (2, 0.00048638132295719785), (3, 0.000486381322957197), (4, 0.000486381322957191)]
2
[(0, 0.0004863813229571979), (1, 0.00048638132295719856), (2, 0.00048638132295719785), (3, 0.0004863813229571972), (4, 0.0004863813229571913)]
3
[(0, 0.00048638132295719856), (1, 0.00048638132295719856), (2, 0.00048638132295719785), (3, 0.00048638132295719704), (4, 0.0004863813229571913)]
4
[(0, 0.00048638132295719834), (1, 0.00048638132295719856), (2, 0.00048638132295719785), (3, 0.00048638132295719715), (4, 0.0004863813229571914)]
5
[(0, 0.00048638132295719867), (1, 0.00048638132295719856), (2, 0.00048638132295719785), (3, 0.0004863813229571972), (4, 0.0004863