In [63]:
%load_ext autoreload

import warnings
warnings.filterwarnings("ignore") # disable warnings

from os import listdir
from os.path import join
import csv, sys
import dateutil.parser
import pyspark as ps
from pyspark.sql import functions as SF
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import (StringType, DoubleType, TimestampType, NullType, IntegerType, StructType, StructField)

from random import choice
import names
from hdfs import InsecureClient
from functools import reduce
from IPython.core.interactiveshell import InteractiveShell

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Config settings

In [18]:
# For IPython

# Notebook-wide display setting: cells below that list several bare
# expressions (paths, counts, ...) rely on it to render every value.
InteractiveShell.ast_node_interactivity = "all" # To show all output after each cell execution (instead of the last output)

# For HDFS
HDFS_PORT = 9870  # port of the HTTP address handed to the HDFS client
HDFS_IP = "10.4.41.81"
HDFS_HOSTNAME = "alakazam.fib.upc.es"
HDFS_FS_PORT = 27000  # port of the hdfs:// URI used for the parquet reads
# Derived from HDFS_HOSTNAME so the host name is declared in exactly one place.
HDFS_DEFAULT = "hdfs://{}:{}".format(HDFS_HOSTNAME, HDFS_FS_PORT)
HDFS_ADDRESS = "http://{}:{}".format(HDFS_IP, HDFS_PORT)
HDFS_USER = "bdm"
HDFS_HOME = "/user/{}".format(HDFS_USER)

# For HDFS Path

hdfs_home = "{}{}".format(HDFS_DEFAULT, HDFS_HOME)

# HDFS paths and URIs always use "/" as separator, so they are joined
# explicitly instead of with os.path.join, whose separator depends on the
# operating system the notebook happens to run on.
FORMATTED_DATA_DIR = "formatted_data"

# For users
users_dir = "/".join((FORMATTED_DATA_DIR, "users"))  # relative path, no scheme/host
hdfs_location = "{}/{}".format(hdfs_home, users_dir)

# For events
activities_dir = "/".join((hdfs_home, FORMATTED_DATA_DIR, "activities"))
culture_dir = "/".join((hdfs_home, FORMATTED_DATA_DIR, "cultural_events"))
tourist_points_dir = "/".join((hdfs_home, FORMATTED_DATA_DIR, "touristic_points"))

In [19]:
# Sanity check: display the fully qualified HDFS locations built in the
# config cell (all four are rendered because of the "all" display setting).
activities_dir
culture_dir
tourist_points_dir
hdfs_location

'hdfs://alakazam.fib.upc.es:27000/user/bdm/formatted_data/activities'

'hdfs://alakazam.fib.upc.es:27000/user/bdm/formatted_data/cultural_events'

'hdfs://alakazam.fib.upc.es:27000/user/bdm/formatted_data/touristic_points'

'hdfs://alakazam.fib.upc.es:27000/user/bdm/formatted_data/users'

In [9]:
def get_hdfs_address() -> str:
    """Return the configured HTTP address of the HDFS service (``HDFS_ADDRESS``)."""
    return HDFS_ADDRESS

def get_hdfs_user() -> str:
    """Return the configured HDFS user name (``HDFS_USER``)."""
    return HDFS_USER

def get_hdfs_user_home() -> str:
    """Return the configured home directory of the HDFS user (``HDFS_HOME``)."""
    return HDFS_HOME

def get_hdfs_client():
    """Build an ``InsecureClient`` for the configured HDFS address and user.

    Returns:
        A new ``InsecureClient`` created from ``get_hdfs_address()`` and
        ``get_hdfs_user()``.
    """
    address = get_hdfs_address()
    user_name = get_hdfs_user()
    return InsecureClient(url=address, user=user_name)


In [24]:
def concat_dataframes(dfs):
    """
    Stack several pyspark dataframes on top of each other.

    Every dataframe after the first is re-projected onto the column order of
    the rows accumulated so far before being appended, so the inputs may list
    the same columns in a different order.

    https://www.geeksforgeeks.org/concatenate-two-pyspark-dataframes/

    Args:
        dfs: non-empty iterable of dataframes sharing the same column names.

    Returns:
        A single dataframe; when only one dataframe is given it is returned
        unchanged.

    Raises:
        TypeError: if ``dfs`` is empty (raised by ``reduce``).
    """
    def _append(accumulated, incoming):
        # Align the incoming columns with the accumulated frame, then union.
        aligned = incoming.select(accumulated.columns)
        return accumulated.union(aligned)

    return reduce(_append, dfs)

In [10]:
# Spark entry points used by the rest of the notebook: the session, its
# SparkContext, and an SQLContext wrapping that context (used for the reads).
spark = (
    SparkSession.builder
    .appName("bdm5")
    .master('local')
    .getOrCreate()
)
sc = spark.sparkContext
sqlContext = SQLContext(sc)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/13 06:32:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/13 06:32:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/06/13 06:32:52 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/06/13 06:32:52 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [11]:
# Create the HDFS client used below to list directories, and display it
# to confirm which URL it points at.
client = get_hdfs_client()
client

<InsecureClient(url='http://10.4.41.81:9870')>

In [12]:
def get_random_name() -> str:
    """
    Generate a random full name.

    Delegates to ``names.get_full_name()``; every call may return a
    different name.
    """
    return names.get_full_name()

In [14]:
# Smoke test: display one generated name.
get_random_name()

'Leola Odom'

In [16]:
# Look for already stored user data: list the users directory through the
# HDFS client and only read the parquet data when something is there.
files = client.list(users_dir)
if files:
    df = sqlContext.read.parquet(hdfs_location)
    df.show(10)
else:
    # no files
    print("No users")

No users


In [40]:
# Load the three event sources from their parquet directories, in the same
# order as the target names on the left-hand side.
df_activities, df_culture, df_tourist_points = (
    sqlContext.read.parquet(source_dir)
    for source_dir in (activities_dir, culture_dir, tourist_points_dir)
)

In [39]:
def _constant_type_column(label):
    """Return a zero-argument callable that yields ``label`` as a literal column.

    A constant value does not need a Python UDF: ``SF.lit`` produces the same
    string column natively, without shipping every row through a Python
    worker. The callable shape is kept so existing ``xxx_type()`` calls work
    unchanged.
    """
    return lambda: SF.lit(label)


# Label columns identifying which source a row came from.
activities_type = _constant_type_column("activities")
culture_type = _constant_type_column("cultural events")
tourist_points_type = _constant_type_column("tourist points")

In [None]:
# Tag every row with the source it came from ("type" column) so the origin
# is still known after the three dataframes are concatenated.
df_activities = df_activities.withColumn("type", activities_type())
df_culture = df_culture.withColumn("type", culture_type())
df_tourist_points = df_tourist_points.withColumn("type", tourist_points_type())

In [60]:
# Combine the three event sources into one dataframe; columns follow the
# order of df_activities (see concat_dataframes).
df = concat_dataframes([df_activities, df_culture, df_tourist_points])

In [45]:
# Preview the first rows of the combined events dataframe.
df.show(4)

                                                                                

+------------+--------------------+------------------+-----------------+-------------------------+---------------------------+---------------------+-----------------------+-------------------+-----------------+--------------------+----------+
| register_id|                name|   geo_epgs_4326_x|  geo_epgs_4326_y|addresses_neighborhood_id|addresses_neighborhood_name|addresses_district_id|addresses_district_name|addresses_road_name|addresses_road_id|           timestamp|      type|
+------------+--------------------+------------------+-----------------+-------------------------+---------------------------+---------------------+-----------------------+-------------------+-----------------+--------------------+----------+
|﻿99400169638|Visites guiades '...| 41.38666133472934|2.171438214409541|                        2|             el Barri Gòtic|                    1|           Ciutat Vella| Plaça de Catalunya|            74404|2022-06-11 13:26:...|activities|
|﻿99400683221|'American Spac

In [47]:
# Row counts per source followed by the combined dataframe; the first three
# numbers should add up to the last one.
df_activities.count()
df_culture.count()
df_tourist_points.count()
df.count()

                                                                                

3156

                                                                                

2364

                                                                                

855

                                                                                

6375

In [61]:
# Pool of ten randomly generated user names to assign events to.
users = [get_random_name() for _ in range(10)]

In [62]:
# Display the generated user pool.
users

['David Smith',
 'Donald Roberts',
 'Betty Mannino',
 'Mildred Chong',
 'Edith Tibbles',
 'Tony Barlow',
 'Antonia Jernigan',
 'Mickey Broadnax',
 'Francisco Wyatt',
 'Junior Fulton']

In [64]:
# UDF that picks one name from the `users` pool at random for each row.
# It must be flagged as non-deterministic: Spark treats UDFs as deterministic
# by default and may then drop duplicate calls or invoke the function more
# often than it appears in the query, which would make the assigned user
# unstable between evaluations of the same plan.
generate_random = SF.udf(lambda: choice(users), StringType()).asNondeterministic()

In [65]:
# Attach a randomly chosen user to every event row ("user" column).
df = df.withColumn("user", generate_random())

In [None]:
# Preview the events together with their assigned user.
df.show(10)

In [48]:
# Seeded sample of roughly 10% of the rows, drawn without replacement from
# the underlying RDD.
df_rdd = df.rdd.sample(withReplacement=False, fraction=0.1, seed=0)

In [53]:
# Number of rows that ended up in the sample.
df_rdd.count()

                                                                                

641

In [54]:
    # NOTE(review): this rebinds `generate_random`, which an earlier cell
    # defines as "pick from the `users` pool". When the notebook is run top to
    # bottom this version wins and produces a brand-new name per row instead
    # of one of the ten pooled users. Confirm which variant is intended.
    #
    # The function is random, so the UDF is flagged as non-deterministic to
    # stop Spark from deduplicating or re-invoking it during optimization.
    # `get_random_name` is passed directly; wrapping it in a lambda added nothing.
    generate_random = SF.udf(get_random_name, StringType()).asNondeterministic()

In [55]:
# NOTE(review): writes the "user" column a second time, using whichever
# `generate_random` is currently bound; an earlier cell already assigns
# this column. Confirm that both assignments are meant to stay.
df = df.withColumn("user", generate_random())

In [68]:
# Keep only the columns needed from here on: who, what kind of event, and
# the event name.
df = df.select(["user", "type", "name"])

In [69]:
# Seeded sample with replacement at fraction 0.5, so the same event row can
# appear more than once in the result.
df = df.sample(withReplacement=True, fraction=0.5, seed=3)

In [70]:
# For each user, gather the names of all their events into one list
# (duplicates are kept by collect_list) and show the first five users.
df.groupBy("user").agg(SF.collect_list("name")).show(5)



+----------------+--------------------+
|            user|  collect_list(name)|
+----------------+--------------------+
|Antonia Jernigan|[Torneig de video...|
| Francisco Wyatt|[Mou-te!, Mou-te!...|
|     David Smith|[Visites a la pla...|
|   Betty Mannino|['Com s'inventen ...|
|   Junior Fulton|[Patis escolars o...|
+----------------+--------------------+
only showing top 5 rows



                                                                                

In [76]:
# Aggregate all distinct event names into a single array and pull it back
# as a Python list (the whole set is returned, so the output can be long).
df.select(SF.collect_set("name").alias("name")).first()["name"]

                                                                                

["Visites guiades 'Mirades insubmises. Pensar la diversitat sexual i el desig a través de l’art'",
 'Tertulia literaria virtual filipina "Prosa selecta. Narraciones y ensayos”, de José Rizal',
 "Exposició 'Beijing 2022. junts per un futur compartit'",
 "Cicle de xerrades 'El cos humà per a adults'",
 "Projecte 'Caminades'",
 'Concert "Bandalos Chinos"',
 "Fundació Barcelona Media - Centre d'Innovació",
 "Campus Olímpia Poliesportiu 'Valldoreix FC' per a infants i adolescents de 6 a 17 anys al Centro Asturiano Barcelona",
 'Vermut + 1r Combat de glossa a Can baró',
 'Fira Artesanal "El Petit Artesà" a la Rambla Guipúscoa',
 "Exposició 'Tattoo. Art sota la pell'",
 "Campus Olímpia Específic de 'Fitness Jove' per a adolescents de 13 a 17 anys al CEM Nova Icària",
 '"Taller ""Patchwork"""',
 'Exposició "Igualtat de gènere i objectius de desenvolupament sostenible" Empoderament de la dona',
 'Exposició "De l\'hort a l\'art"',
 'Coach and clown',
 "Exposició 'Barcelona al carrer. Veure i ser

In [75]:
# Alternative kept for reference: total number of non-null names.
#df.select(SF.count('name').alias("total")).show()
# Show the distinct event names as a dataframe (default preview of 20 rows).
df.select("name").distinct().show()



+--------------------+
|                name|
+--------------------+
|Torneig de videoj...|
|Monòleg amb Marc ...|
|Casal esportiu i ...|
|Tertúlia literari...|
|Espectacle famili...|
|Xerrada 'Una obra...|
|Casal d'estiu 'Mi...|
|Taller 'Hipopress...|
|Intervenció artís...|
| Festival Ubeat Live|
|Espectacle “Bros”...|
|Exposició 'Memòri...|
|'Com s'inventen e...|
|   Teatre "Ximpanzé"|
|Taller 'Creació d...|
|Espectacle "L'oll...|
|Visita dinamitzad...|
|Concert "Jimi Som...|
|Lliure al sofà - ...|
|Visites guiades -...|
+--------------------+
only showing top 20 rows



                                                                                