In [0]:
# pyspark basic 

# from python array to rdd
rdd = sc.parallelize([{"cat", 1}, {"dog", 1}, {"cat", 2}])
rdd.take(10)

Out[1]: [{1, 'cat'}, {1, 'dog'}, {2, 'cat'}]

In [0]:
# map
rdd = sc.parallelize([1, 2, 3, 4, 5])
new_rdd = rdd.map(lambda x: (x + 2))
new_rdd.collect()
# [3, 4, 5, 6, 7]

Out[2]: [3, 4, 5, 6, 7]

In [0]:
# flat map
new_rdd = rdd.flatMap(lambda x: (x - 1, x, x + 1))
new_rdd.collect()
# [0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6]

Out[3]: [0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6]

In [0]:
# filter
new_rdd = rdd.filter(lambda x: x > 3)
new_rdd.collect()
# [4, 5]

Out[4]: [4, 5]

In [0]:


# reduce by key
rdd = sc.parallelize([("cat", 1), ("dog", 1), ("cat", 2)])
rdd.reduceByKey(lambda x, y: (x + y)).collect()
# [('cat', 3), ('dog' ,1)]


Out[5]: [('dog', 1), ('cat', 3)]

In [0]:

# group by key
rdd.groupByKey().collect()
rdd.groupByKey().mapValues(list).collect()
# [('cat', [1, 2]), ('dog', [1])]

Out[6]: [('dog', [1]), ('cat', [1, 2])]

In [0]:

# join
people = sc.parallelize([(1, "Mauro"), (2, "Paola"), (3, "Claudia"), (4, "Mario")])
age = sc.parallelize([(1, 39), (2, 30), (4, 71)])
rdd = people.join(age)
rdd.collect()
# [(2, ('Paola', 30)), (4, ('Mario', 71)), (1, ('Mauro', 36))]

Out[10]: [(1, ('Mauro', 39)), (2, ('Paola', 30)), (4, ('Mario', 71))]

In [0]:
# left join
rdd1 = people.leftOuterJoin(age)
rdd1.collect()
# [(2, ('Paola', 30)), (4, ('Mario', 71)), (1, ('Mauro', 36)), (3, ('Claudia', None))]

Out[11]: [(1, ('Mauro', 39)),
 (2, ('Paola', 30)),
 (3, ('Claudia', None)),
 (4, ('Mario', 71))]

In [0]:
# filter
people = sc.parallelize([(1, "Mauro"), (2, "Paola"), (3, "Claudia"), (4, "Mario")])
age = sc.parallelize([(1, 39), (2, 30), (4, 71)])
rdd1 = people.leftOuterJoin(age)
rdd1.filter(lambda x: x[1][1] is not None and x[1][1] > 30).collect()
# [(4, ('Mario', 71)), (1, ('Mauro', 36))]

Out[13]: [(1, ('Mauro', 39)), (4, ('Mario', 71))]

In [0]:

# take
people = sc.parallelize([(1, "Mauro"), (2, "Paola"), (3, "Claudia"), (4, "Mario")])
age = sc.parallelize([(1, 38), (2, 30), (4, 71)])
rdd1 = people.leftOuterJoin(age)
rdd1_filter = rdd1.filter(lambda x: x[1][1] is not None and x[1][1] > 30)
rdd1.take(2)
# [(4, ('Mario', 71)), (1, ('Mauro', 36))]

Out[14]: [(1, ('Mauro', 38)), (2, ('Paola', 30))]

In [0]:

# count
people = sc.parallelize([(1, "Mauro"), (2, "Paola"), (3, "Claudia"), (4, "Mario")])
age = sc.parallelize([(1, 39), (2, 30), (4, 71)])
rdd1 = people.leftOuterJoin(age)
rdd1.count()
# 4


Out[15]: 4

In [0]:


# age average
people = sc.parallelize([(1, "Mauro"), (2, "Paola"), (3, "Claudia"), (4, "Mario")])
age = sc.parallelize([(1, 39), (2, 30), (4, 71)])
rdd = people.join(age)

In [0]:

# age average
people = sc.parallelize([(1, "Mauro"), (2, "Paola"), (3, "Claudia"), (4, "Mario")])
age = sc.parallelize([(1, 39), (2, 30), (4, 71)])
rdd = people.join(age)

rdd.map(lambda x: x[1][1]).reduce(lambda x, y: x + y) / rdd.count()


Out[19]: 46.666666666666664

In [0]:
rdd.map(lambda x: x[1][1]).sum() / rdd.count()
rdd.map(lambda x: x[1][1]).mean()

Out[20]: 46.666666666666664

In [0]:
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/mauropelucchi/unimib_masterbi_2022/main/datasets/Indiegogo/1_ds_project_details_full.csv", "/tmp/1_ds_project_details_full.csv")
dbutils.fs.mv("file:/tmp/1_ds_project_details_full.csv","dbfs:/data/1_ds_project_details_full.csv")
              
urllib.request.urlretrieve("https://raw.githubusercontent.com/mauropelucchi/unimib_masterbi_2022/main/datasets/Indiegogo/2_ds_project_location_full.csv", "/tmp/2_ds_project_location_full.csv")
dbutils.fs.mv("file:/tmp/2_ds_project_location_full.csv","dbfs:/data/2_ds_project_location_full.csv")

Out[21]: True

In [0]:

data = spark.read.option("header", "true").option("delimiter",",").csv("dbfs:/data/1_ds_project_details_full.csv")

data.printSchema()
data.count()
data.createOrReplaceTempView("projects")

root
 |-- _c0: string (nullable = true)
 |-- bullet_point: string (nullable = true)
 |-- category: string (nullable = true)
 |-- category_url: string (nullable = true)
 |-- clickthrough_url: string (nullable = true)
 |-- close_date: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- funds_raised_amount: string (nullable = true)
 |-- funds_raised_percent: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- is_indemand: string (nullable = true)
 |-- is_pre_launch: string (nullable = true)
 |-- offered_by: string (nullable = true)
 |-- open_date: string (nullable = true)
 |-- perk_goal_percentage: string (nullable = true)
 |-- perks_claimed: string (nullable = true)
 |-- price_offered: string (nullable = true)
 |-- price_retail: string (nullable = true)
 |-- product_stage: string (nullable = true)
 |-- project_id: string (nullable = true)
 |-- project_type: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- tags: string (nullable = t

In [0]:
display(data)

_c0,bullet_point,category,category_url,clickthrough_url,close_date,currency,funds_raised_amount,funds_raised_percent,image_url,is_indemand,is_pre_launch,offered_by,open_date,perk_goal_percentage,perks_claimed,price_offered,price_retail,product_stage,project_id,project_type,tagline,tags,title
0,,Film,/explore/film,/projects/super-troopers-2,2015-04-24T23:59:59-07:00,USD,4617223,2.081839,"https://c1.iggcdn.com/indiegogo-media-prod-cld/image/upload/c_fill,f_auto,h_273,w_273/ihngkgslmmpbwcaxurio.jpg",False,False,,2015-03-24T10:00:57-07:00,,,,,,1166581.0,campaign,"The #SuperTroopers2 campaign is over, but the movie will be out in theaters on 4/20/18!",['other'],Super Troopers 2
1,,Web Series & TV Shows,/explore/web-series-tv-shows,/projects/con-man,2015-04-10T23:59:59-07:00,USD,3156178,7.347458823529411,"https://c1.iggcdn.com/indiegogo-media-prod-cld/image/upload/c_fill,f_auto,h_273,w_273/fxniv9n1jtye9da7gyhq.jpg",False,False,,2015-03-10T14:48:01-07:00,,,,,,1143140.0,campaign,A new comedy from Alan Tudyk and Nathan Fillion produced by YOU!,['other'],Con Man
2,,Photography,/explore/photography,/projects/the-camera-pack-peter-mckinnon-x-nomatic,2019-11-14T23:59:59-08:00,USD,2677592,22.86193,"https://c1.iggcdn.com/indiegogo-media-prod-cld/image/upload/c_fill,f_auto,h_273,w_273/hywcddtxwxi88d3imwve.jpg",False,False,,2019-11-13T23:59:59-08:00,,,,,,2558245.0,campaign,"A Functional Camera Pack for all types of travelers! Just you, one bag, and the adventure!","['backpacks', 'design', 'luggage', 'professional']",The Camera Pack: Peter McKinnon X NOMATIC
3,,Writing & Publishing,/explore/writing-publishing,/projects/the-book--28,2021-01-04T23:59:59-08:00,USD,2621668,293.336625,"https://c1.iggcdn.com/indiegogo-media-prod-cld/image/upload/c_fill,f_auto,h_273,w_273/d7tln1i9lmwwpmamrla9.jpg",True,False,,2021-01-03T23:59:59-08:00,,,,,,2650630.0,campaign,The Ultimate Guide To Rebuilding A Civilization. Over 400 pages of detailed illustrations.,"['books', 'burning man', 'design']",The Book
4,,Film,/explore/film,/projects/code-8-a-film-from-robbie-stephen-amell,2016-04-23T23:59:59-07:00,USD,2501972,8.600755,"https://c1.iggcdn.com/indiegogo-media-prod-cld/image/upload/c_fill,f_auto,h_273,w_273/rzpfberfrqgm4tcznn76.jpg",False,False,,2016-03-22T08:06:31-07:00,,,,,,1676513.0,campaign,Help Robbie & Stephen Amell make their first feature film together!,['robots'],Code 8 - a film from Robbie & Stephen Amell
5,,Film,/explore/film,/projects/lazer-team-by-rooster-teeth,2014-07-06T23:59:59-07:00,USD,2480334,3.8158984615384615,"https://c1.iggcdn.com/indiegogo-media-prod-cld/image/upload/c_fill,f_auto,h_273,w_273/przv0rwynaloieza951h.jpg",False,False,,2014-06-06T16:23:33-07:00,,,,,,814501.0,campaign,Rooster Teeth is making its first feature length movie and we need your help!,['other'],Lazer Team by Rooster Teeth
6,,Video Games,/explore/video-games,/projects/genki-shadowcast-for-the-nintendo-switch-ps5-xbox,2021-01-27T23:59:59-08:00,USD,2462518,64.38603333333333,"https://c1.iggcdn.com/indiegogo-media-prod-cld/image/upload/c_fill,f_auto,h_273,w_273/lvegbqoliupkwvefaf3k.jpg",False,False,,2021-01-26T23:59:59-08:00,,,,,,2656903.0,campaign,"The easy way to play console games on your laptop. No TV needed to play, stream, or record gameplay.","['laptops', 'software', 'toys', 'design', 'vr']",GENKI: ShadowCast for the Nintendo Switch PS5 Xbox
7,,Film,/explore/film,/projects/gosnell-movie,2014-05-12T23:59:59-07:00,USD,2377647,1.0671633333333332,"https://c1.iggcdn.com/indiegogo-media-prod-cld/image/upload/c_fill,f_auto,h_273,w_273/aqsi0ssfoflcwpzg9ljv.jpg",False,False,,2014-03-28T13:37:20-07:00,,,,,,731457.0,campaign,A historic crowdfunding campaign for a movie about America's biggest serial killer Kermit Gosnell.,['other'],Gosnell Movie
8,,Video Games,/explore/video-games,/projects/indivisible-rpg-from-the-creators-of-skullgirls,2015-12-04T23:59:59-08:00,USD,2009010,1.2416133333333332,"https://c1.iggcdn.com/indiegogo-media-prod-cld/image/upload/c_fill,f_auto,h_273,w_273/jlcxf8afxuw1kzz82opb.jpg",False,False,,2015-10-05T05:59:39-07:00,,,,,,1383172.0,campaign,Indivisible - A 2D Action RPG from Lab Zero PS4/XB1/Win/Mac/Linux,['other'],Indivisible - RPG from the Creators of Skullgirls
9,,Video Games,/explore/video-games,/projects/genki-covert-dock-for-the-nintendo-switch,2019-09-13T23:59:59-07:00,USD,2002310,35.81734,"https://c1.iggcdn.com/indiegogo-media-prod-cld/image/upload/c_fill,f_auto,h_273,w_273/xqqveeiuakk8s2aspxby.jpg",False,False,,2019-09-12T23:59:59-07:00,,,,,,2543163.0,campaign,A stealth dock hidden in a portable GaN-charger. Set your dock free and make any TV your playground.,"['chargers', 'adventure', 'laptops']",GENKI: Covert Dock for the Nintendo Switch


In [0]:
locations = spark.read.option("header", "true").option("delimiter",",").csv("dbfs:/data/2_ds_project_location_full.csv")
locations.count()
locations.createOrReplaceTempView("locations")

In [0]:
display(locations)

_c0,project_id,lat,lng
0,1166581,34.052238,-118.243344
1,1143140,34.052238,-118.243344
2,2558245,40.758478,-111.888142
3,2650630,34.052238,-118.243344
4,1676513,43.651893,-79.381713
5,814501,30.264979,-97.746598
6,2656903,34.052238,-118.243344
7,731457,34.052238,-118.243344
8,1383172,34.052238,-118.243344
9,2543163,34.052238,-118.243344


In [0]:
dx = spark.sql("select * from projects where title is not null")
dy = dx.dropDuplicates()
dy.count()

Out[26]: 4769