In [2]:
%load_ext autoreload

import warnings
warnings.filterwarnings("ignore") # disable warnings

from os import listdir
from os.path import join
import csv, sys
import dateutil.parser
import pyspark as ps
from pyspark.sql import functions as SF
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import (StringType, DoubleType, TimestampType, NullType, IntegerType, StructType, StructField)

from random import choice
import names
from hdfs import InsecureClient
from functools import reduce
from IPython.core.interactiveshell import InteractiveShell

In [3]:
#! /home/bdm/miniconda3/bin/python
# --- HDFS connection settings ---------------------------------------------
HDFS_PORT = 9870
HDFS_IP = "10.4.41.81"
HDFS_HOSTNAME = "alakazam.fib.upc.es"
# hdfs:// URI prefix; derived from HDFS_HOSTNAME so the host is spelled only once.
HDFS_DEFAULT = "hdfs://{}:27000".format(HDFS_HOSTNAME)
# HTTP endpoint built from the IP and port above.
HDFS_ADDRESS = "http://{}:{}".format(HDFS_IP, HDFS_PORT)
HDFS_USER = "bdm"
HDFS_HOME = "/user/{}".format(HDFS_USER)


def get_hdfs_home():
    """Return the hdfs:// URI of the HDFS user's home directory.

    Returns:
        str: HDFS_DEFAULT immediately followed by HDFS_HOME.
    """
    return "{}{}".format(HDFS_DEFAULT, HDFS_HOME)


parent_dir = "formatted_data"
# For HDFS Path
hdfs_home = get_hdfs_home()
# For users
# HDFS paths always use "/" as separator, so build them with str.join instead
# of os.path.join, whose separator depends on the local operating system.
users_dir = "/".join([hdfs_home, parent_dir, "users"])


In [4]:
# Echo the users path built in the previous cell as a quick sanity check.
users_dir

'hdfs://alakazam.fib.upc.es:27000/user/bdm/formatted_data/users'

Config settings

In [18]:
# For IPython

# Make every top-level expression in a cell render its value, not only the final one.
InteractiveShell.ast_node_interactivity = "all" # To show all output after each cell execution (instead of the last output)

# For HDFS
HDFS_PORT = 9870
HDFS_IP = "10.4.41.81"
HDFS_HOSTNAME = "alakazam.fib.upc.es"
# hdfs:// URI prefix; derived from HDFS_HOSTNAME so the host is spelled only once.
HDFS_DEFAULT = "hdfs://{}:27000".format(HDFS_HOSTNAME)
# HTTP endpoint handed to the hdfs client (see get_hdfs_client).
HDFS_ADDRESS = "http://{}:{}".format(HDFS_IP, HDFS_PORT)
HDFS_USER = "bdm"
HDFS_HOME = "/user/{}".format(HDFS_USER)

# For HDFS Path

hdfs_home = "{}{}".format(HDFS_DEFAULT, HDFS_HOME)

# HDFS paths always use "/" as separator, so the locations below are joined
# with "/" explicitly instead of os.path.join (whose separator is OS-specific).

# For users
# users_dir is the path relative to the HDFS home; hdfs_location is the full URI.
users_dir = "/".join(["formatted_data", "users"])
hdfs_location = "{}/{}".format(hdfs_home, users_dir)

# For events
activities_dir = "/".join([hdfs_home, "formatted_data", "activities"])
culture_dir = "/".join([hdfs_home, "formatted_data", "cultural_events"])
tourist_points_dir = "/".join([hdfs_home, "formatted_data", "touristic_points"])

In [19]:
# Echo each resolved HDFS location; all four values render because
# ast_node_interactivity is set to "all" in the config cell.
activities_dir
culture_dir
tourist_points_dir
hdfs_location

'hdfs://alakazam.fib.upc.es:27000/user/bdm/formatted_data/activities'

'hdfs://alakazam.fib.upc.es:27000/user/bdm/formatted_data/cultural_events'

'hdfs://alakazam.fib.upc.es:27000/user/bdm/formatted_data/touristic_points'

'hdfs://alakazam.fib.upc.es:27000/user/bdm/formatted_data/users'

In [9]:
def get_hdfs_address() -> str:
    """Return the configured HTTP address (HDFS_ADDRESS) used to reach HDFS."""
    return HDFS_ADDRESS

def get_hdfs_user() -> str:
    """Return the configured HDFS user name (HDFS_USER)."""
    return HDFS_USER

def get_hdfs_user_home() -> str:
    """Return the configured home directory path of the HDFS user (HDFS_HOME)."""
    return HDFS_HOME

def get_hdfs_client():
    """Build an InsecureClient for the configured HDFS address and user.

    Returns:
        InsecureClient: client constructed with the address from
        get_hdfs_address() and the user from get_hdfs_user().
    """
    endpoint = get_hdfs_address()
    account = get_hdfs_user()
    return InsecureClient(url=endpoint, user=account)


In [24]:
def concat_dataframes(dfs):
    """
    Stack several PySpark dataframes on top of each other.

    Each subsequent dataframe is first projected onto the column order of the
    accumulated result, then appended with ``union``. The sequence is folded
    left to right, so the first dataframe dictates the final column order.

    Args:
        dfs: non-empty iterable of dataframes sharing the first one's columns.

    Returns:
        The union of all dataframes (the single input itself when only one
        dataframe is given).

    Reference: https://www.geeksforgeeks.org/concatenate-two-pyspark-dataframes/
    """
    def _append(combined, nxt):
        # Align the incoming frame to the accumulated column order first.
        aligned = nxt.select(combined.columns)
        return combined.union(aligned)

    return reduce(_append, dfs)

In [10]:
# Start (or reuse) a Spark session running on the 'local' master, then expose
# its SparkContext and a SQLContext under the names the later cells rely on.
spark = (
    SparkSession.builder
    .master('local')
    .appName("bdm5")
    .getOrCreate()
)
sc = spark.sparkContext
sqlContext = SQLContext(sc)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/13 06:32:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/13 06:32:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/06/13 06:32:52 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/06/13 06:32:52 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [11]:
# Create the HDFS client (used below to list the users directory) and echo its repr.
client = get_hdfs_client()
client

<InsecureClient(url='http://10.4.41.81:9870')>

In [12]:
def get_random_name() -> str:
    """
    Generate a random full name using the `names` package.
    """
    return names.get_full_name()

In [14]:
# Try the generator once; the name is random, so the output changes between runs.
get_random_name()

'Leola Odom'

In [16]:
# List the users directory through the HDFS client before asking Spark to read
# it, and only load the parquet data when the directory contains something.
# NOTE(review): client.list() receives users_dir while Spark reads hdfs_location;
# presumably users_dir must be the home-relative path from the config cell — confirm.
files = client.list(users_dir)
if not files:
    # no files
    print("No users")
else:
    # Read through the SparkSession; SQLContext is a deprecated entry point.
    df = spark.read.parquet(hdfs_location)
    df.show(10)

No users


In [40]:
# Load the three event sources from their parquet directories, in the order
# activities, cultural events, tourist points.
df_activities, df_culture, df_tourist_points = (
    sqlContext.read.parquet(source_dir)
    for source_dir in (activities_dir, culture_dir, tourist_points_dir)
)

In [39]:
def _constant_udf(label):
    """Build a zero-argument string UDF that always returns `label`."""
    return SF.udf(lambda : label, StringType())


# One constant-label UDF per event source; each is called with no arguments
# to produce the value of the "type" column.
activities_type = _constant_udf("activities")
culture_type = _constant_udf("cultural events")
tourist_points_type = _constant_udf("tourist points")

In [None]:
# Tag every row with the source it came from. A literal column is used rather
# than a Python UDF: the value is a constant, so there is no need to ship a
# Python function to the executors just to return a fixed string.
df_activities = df_activities.withColumn("type", SF.lit("activities"))
df_culture = df_culture.withColumn("type", SF.lit("cultural events"))
df_tourist_points = df_tourist_points.withColumn("type", SF.lit("tourist points"))

In [91]:
# Stack the three tagged event frames into one dataframe.
df = concat_dataframes([df_activities, df_culture, df_tourist_points])

In [45]:
# Preview the first 4 rows of the combined frame.
df.show(4)

                                                                                

+------------+--------------------+------------------+-----------------+-------------------------+---------------------------+---------------------+-----------------------+-------------------+-----------------+--------------------+----------+
| register_id|                name|   geo_epgs_4326_x|  geo_epgs_4326_y|addresses_neighborhood_id|addresses_neighborhood_name|addresses_district_id|addresses_district_name|addresses_road_name|addresses_road_id|           timestamp|      type|
+------------+--------------------+------------------+-----------------+-------------------------+---------------------------+---------------------+-----------------------+-------------------+-----------------+--------------------+----------+
|﻿99400169638|Visites guiades '...| 41.38666133472934|2.171438214409541|                        2|             el Barri Gòtic|                    1|           Ciutat Vella| Plaça de Catalunya|            74404|2022-06-11 13:26:...|activities|
|﻿99400683221|'American Spac

In [47]:
# Row counts per source followed by the combined frame; the last number
# should equal the sum of the first three.
df_activities.count()
df_culture.count()
df_tourist_points.count()
df.count()

                                                                                

3156

                                                                                

2364

                                                                                

855

                                                                                

6375

In [61]:
# Pool of 10 randomly generated user names.
users = [get_random_name() for _ in range(10)]

In [62]:
# Inspect the generated pool of user names.
users

['David Smith',
 'Donald Roberts',
 'Betty Mannino',
 'Mildred Chong',
 'Edith Tibbles',
 'Tony Barlow',
 'Antonia Jernigan',
 'Mickey Broadnax',
 'Francisco Wyatt',
 'Junior Fulton']

In [92]:
# UDF that picks one name from the `users` pool at random for each row.
# Spark treats UDFs as deterministic by default and may then drop duplicate
# calls or invoke the function more often than it appears in the query, so a
# random function has to be flagged as non-deterministic explicitly.
generate_random = SF.udf(lambda : choice(users), StringType()).asNondeterministic()

In [93]:
# Attach a user produced by generate_random to every row in a "user" column.
df = df.withColumn("user", generate_random())

In [None]:
# Preview the first 10 rows, now including the "user" column.
df.show(10)

In [48]:
# Sample the underlying RDD without replacement at a 0.1 fraction, using a
# fixed seed of 0.
df_rdd = df.rdd.sample(withReplacement=False, fraction=0.1, seed=0)

In [53]:
# Number of rows that ended up in the sampled RDD.
df_rdd.count()

                                                                                

641

In [54]:
    # UDF that produces a fresh random full name for each row. The function is
    # passed directly (no wrapper lambda needed) and flagged as non-deterministic
    # so Spark does not assume repeated invocations return the same value.
    generate_random = SF.udf(get_random_name, StringType()).asNondeterministic()

In [55]:
# Fill the "user" column using whichever generate_random UDF is currently defined.
df = df.withColumn("user", generate_random())

In [94]:
# Show column names and types of the combined frame.
df.printSchema()

root
 |-- register_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- geo_epgs_4326_x: string (nullable = true)
 |-- geo_epgs_4326_y: string (nullable = true)
 |-- addresses_neighborhood_id: string (nullable = true)
 |-- addresses_neighborhood_name: string (nullable = true)
 |-- addresses_district_id: string (nullable = true)
 |-- addresses_district_name: string (nullable = true)
 |-- addresses_road_name: string (nullable = true)
 |-- addresses_road_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- type: string (nullable = true)
 |-- user: string (nullable = true)



In [95]:
# Keep only the columns used by the analysis below.
df = df.select("user", "type", "name", "register_id")

In [96]:
# Sample with replacement at fraction 0.5 and a fixed seed of 3; because
# replacement is enabled, the same row can appear more than once in the result.
df = df.sample(withReplacement=True, fraction=0.5, seed=3)

In [85]:
from pyspark.ml.fpm import FPGrowth


In [97]:
# Build one transaction per user: the set of register_ids attached to that user.
# collect_set (not collect_list) is required because FPGrowth rejects
# transactions containing repeated items, and the frame was sampled with
# replacement, so the same register_id can occur several times for one user.
d = df.groupBy("user").agg(SF.collect_set("register_id").alias("items"))

In [98]:
# Keep only the "items" column, which is what gets passed to FPGrowth.fit below.
d = d.select("items")

In [99]:
# FP-Growth estimator configured with the support and confidence thresholds
# for this analysis.
fp = FPGrowth(
    minSupport=0.2,
    minConfidence=0.7,
)

In [100]:
# Fit the FP-Growth model on the per-user transactions.
fpm = fp.fit(d)

In [79]:
# Count how many rows exist for each (type, name) pair.
df1 = df.groupBy('type', 'name').agg(SF.count('name').alias('trip_count'))
# DataFrame.show() only prints and returns None, so keep the sorted frame in
# df2 and display it in a separate step.
df2 = df1.sort(df1.trip_count.desc())
df2.show()



+---------------+--------------------+----------+
|           type|                name|trip_count|
+---------------+--------------------+----------+
|cultural events|    Taller 'Pilates'|        16|
|cultural events|       Taller 'Ioga'|         9|
|cultural events|     Taller 'Teatre'|         6|
|cultural events|Taller 'Marxa nòr...|         5|
|cultural events|Taller 'Estiraments'|         5|
|     activities|     Cinema solidari|         4|
|     activities|Visita guiada 'El...|         4|
|     activities|Dates de Festa Major|         4|
|     activities|Taller 'Gipsy ori...|         4|
|cultural events|Taller híbrid 'Fe...|         4|
|cultural events|Taller 'Iniciació...|         4|
|cultural events|      Taller 'Zumba'|         4|
|cultural events|Taller 'Escriptur...|         4|
|cultural events|Taller 'Dansa con...|         4|
|cultural events|Presentació del p...|         4|
|cultural events|Taller 'Dibuix i ...|         4|
|cultural events|Taller 'Iniciació...|         4|


                                                                                

In [75]:
# List the distinct event names (show() prints the first 20 by default).
# Alternative kept for reference — total number of non-null names:
#df.select(SF.count('name').alias("total")).show()
df.select("name").distinct().show()



+--------------------+
|                name|
+--------------------+
|Torneig de videoj...|
|Monòleg amb Marc ...|
|Casal esportiu i ...|
|Tertúlia literari...|
|Espectacle famili...|
|Xerrada 'Una obra...|
|Casal d'estiu 'Mi...|
|Taller 'Hipopress...|
|Intervenció artís...|
| Festival Ubeat Live|
|Espectacle “Bros”...|
|Exposició 'Memòri...|
|'Com s'inventen e...|
|   Teatre "Ximpanzé"|
|Taller 'Creació d...|
|Espectacle "L'oll...|
|Visita dinamitzad...|
|Concert "Jimi Som...|
|Lliure al sofà - ...|
|Visites guiades -...|
+--------------------+
only showing top 20 rows



                                                                                