# Wordcount analysis example (load from CSV)

This notebook shows the classic wordcount example in which we want to analyze word counting results.

In [1]:
# To find out where the pyspark
import findspark
findspark.init()

In [2]:
# Creating Spark Context
from pyspark import SparkContext
sc = SparkContext("local", "Wordcount_rdd_analysis")



With the step below we are going to read an input (local) file that will be our data source. textFile() and wholeTextFiles() methods to read into RDD that are the low level data access of Spark (there exist other method to read directly in Dataframe).

Each line of the text file is a *row*. We can apply a series of chained operation:
1. flatMap produces a new dataset <word> from the splitting
2. map produces a new dataset in the form <word, couunt>
3. reduceByKey coordinates the aggregation by summing rows with the same key

In [3]:
sc

In [4]:
# Calculating words count
csv_file = sc.textFile("output_csv/part-00000")


In [5]:
csv_file.collect()

['brand-logo,2',
 'Homepage,2',
 'Chi,5',
 'siamo,5',
 'Soluzioni,2',
 'Lavora,4',
 'con,11',
 'noi,4',
 'Contatti,2',
 'Newsroom,2',
 'Scopri,5',
 'le,9',
 'offerte,5',
 'Lo,2',
 'shopping,6',
 'è,4',
 'la,11',
 'nostra,4',
 'passione.,2',
 'Vogliamo,2',
 'arricchire,2',
 'questa,2',
 'esperienza,2',
 'dalla,2',
 'fase,2',
 'di,30',
 'preparazione,2',
 'su,3',
 'mobile,3',
 'fino,4',
 'al,5',
 'momento,2',
 'della,2',
 'visita,2',
 'in,21',
 'negozio.,5',
 ',49',
 'Shopping,1',
 'Experts,1',
 'a,7',
 'partire,1',
 'Ci,1',
 'impegniamo,1',
 'fondo,1',
 'per,7',
 'offrire,1',
 'ai,3',
 'nostri,3',
 'shopper,1',
 'sempre,2',
 'migliore,3',
 'esperienza:,1',
 'semplice,,1',
 'efficace,2',
 'e,12',
 'divertente.,1',
 'Collaboriamo,2',
 'numerosi,1',
 'retailers,4',
 'manufacturers,2',
 'fine,1',
 'condurre,1',
 'più,7',
 'gli,2',
 'shoppers,4',
 'Tech,1',
 'Lovers,1',
 'Crediamo,1',
 'che,3',
 'tecnologia,1',
 'possa,1',
 'realmente,1',
 'supportare,2',
 'i,6',
 'nel,3',
 'guidare,1',
 'il

In [13]:
def mysplit(line):
    parts = line.split(",")
    return (parts[0], parts[1])

content = csv_file.map(mysplit)


In [14]:
len(content.collect())
content.collect()

# Note the output. The value number is a string.. we want a number. You can modify mysplit func to cast second value to a float            

[('brand-logo', '2'),
 ('Homepage', '2'),
 ('Chi', '5'),
 ('siamo', '5'),
 ('Soluzioni', '2'),
 ('Lavora', '4'),
 ('con', '11'),
 ('noi', '4'),
 ('Contatti', '2'),
 ('Newsroom', '2'),
 ('Scopri', '5'),
 ('le', '9'),
 ('offerte', '5'),
 ('Lo', '2'),
 ('shopping', '6'),
 ('è', '4'),
 ('la', '11'),
 ('nostra', '4'),
 ('passione.', '2'),
 ('Vogliamo', '2'),
 ('arricchire', '2'),
 ('questa', '2'),
 ('esperienza', '2'),
 ('dalla', '2'),
 ('fase', '2'),
 ('di', '30'),
 ('preparazione', '2'),
 ('su', '3'),
 ('mobile', '3'),
 ('fino', '4'),
 ('al', '5'),
 ('momento', '2'),
 ('della', '2'),
 ('visita', '2'),
 ('in', '21'),
 ('negozio.', '5'),
 ('', '49'),
 ('Shopping', '1'),
 ('Experts', '1'),
 ('a', '7'),
 ('partire', '1'),
 ('Ci', '1'),
 ('impegniamo', '1'),
 ('fondo', '1'),
 ('per', '7'),
 ('offrire', '1'),
 ('ai', '3'),
 ('nostri', '3'),
 ('shopper', '1'),
 ('sempre', '2'),
 ('migliore', '3'),
 ('esperienza:', '1'),
 ('semplice', ''),
 ('efficace', '2'),
 ('e', '12'),
 ('divertente.', '1'),


In [23]:
# The second argument can be empty due to some formatting problem.. casting to float can rise an error.
# You can handle work values in the function, or do a filteing before..

def myfilter(tuple_rdd):
    if tuple_rdd[0] == '' or tuple_rdd[1] == '' :
        print("Skip line")
        return False
    else:
        return True

content2 = content.filter(myfilter)
len(content2.collect())
# Now you can add cast to float


470

In [24]:
def cast_rdd(tuple_rdd):
    return (tuple_rdd[0], float(tuple_rdd[1]))

content3 = content2.map(cast_rdd)
content3.collect()

[('brand-logo', 2.0),
 ('Homepage', 2.0),
 ('Chi', 5.0),
 ('siamo', 5.0),
 ('Soluzioni', 2.0),
 ('Lavora', 4.0),
 ('con', 11.0),
 ('noi', 4.0),
 ('Contatti', 2.0),
 ('Newsroom', 2.0),
 ('Scopri', 5.0),
 ('le', 9.0),
 ('offerte', 5.0),
 ('Lo', 2.0),
 ('shopping', 6.0),
 ('è', 4.0),
 ('la', 11.0),
 ('nostra', 4.0),
 ('passione.', 2.0),
 ('Vogliamo', 2.0),
 ('arricchire', 2.0),
 ('questa', 2.0),
 ('esperienza', 2.0),
 ('dalla', 2.0),
 ('fase', 2.0),
 ('di', 30.0),
 ('preparazione', 2.0),
 ('su', 3.0),
 ('mobile', 3.0),
 ('fino', 4.0),
 ('al', 5.0),
 ('momento', 2.0),
 ('della', 2.0),
 ('visita', 2.0),
 ('in', 21.0),
 ('negozio.', 5.0),
 ('Shopping', 1.0),
 ('Experts', 1.0),
 ('a', 7.0),
 ('partire', 1.0),
 ('Ci', 1.0),
 ('impegniamo', 1.0),
 ('fondo', 1.0),
 ('per', 7.0),
 ('offrire', 1.0),
 ('ai', 3.0),
 ('nostri', 3.0),
 ('shopper', 1.0),
 ('sempre', 2.0),
 ('migliore', 3.0),
 ('esperienza:', 1.0),
 ('efficace', 2.0),
 ('e', 12.0),
 ('divertente.', 1.0),
 ('Collaboriamo', 2.0),
 ('numer

In [25]:
#def myfilter_wrong(tuple_rdd):
#    if tuple_rdd[1] == '' :
#        print("Skip line")
#        return True
#    else:
#        return False
#
#content3 = content.filter(myfilter_wrong)
#content3.collect()
#content2 = content3

In [26]:
output = content3.collect()
for (word, occurs) in output:
    print("%s: %f" % (word, occurs))

brand-logo: 2.000000
Homepage: 2.000000
Chi: 5.000000
siamo: 5.000000
Soluzioni: 2.000000
Lavora: 4.000000
con: 11.000000
noi: 4.000000
Contatti: 2.000000
Newsroom: 2.000000
Scopri: 5.000000
le: 9.000000
offerte: 5.000000
Lo: 2.000000
shopping: 6.000000
è: 4.000000
la: 11.000000
nostra: 4.000000
passione.: 2.000000
Vogliamo: 2.000000
arricchire: 2.000000
questa: 2.000000
esperienza: 2.000000
dalla: 2.000000
fase: 2.000000
di: 30.000000
preparazione: 2.000000
su: 3.000000
mobile: 3.000000
fino: 4.000000
al: 5.000000
momento: 2.000000
della: 2.000000
visita: 2.000000
in: 21.000000
negozio.: 5.000000
Shopping: 1.000000
Experts: 1.000000
a: 7.000000
partire: 1.000000
Ci: 1.000000
impegniamo: 1.000000
fondo: 1.000000
per: 7.000000
offrire: 1.000000
ai: 3.000000
nostri: 3.000000
shopper: 1.000000
sempre: 2.000000
migliore: 3.000000
esperienza:: 1.000000
efficace: 2.000000
e: 12.000000
divertente.: 1.000000
Collaboriamo: 2.000000
numerosi: 1.000000
retailers: 4.000000
manufacturers: 2.000000


In [27]:
# Order by most recurrent words
content4 = content3.sortBy(lambda x: x[1], False)
content4.collect()

[('di', 30.0),
 ('in', 21.0),
 ('e', 12.0),
 ('con', 11.0),
 ('la', 11.0),
 ('le', 9.0),
 ('il', 9.0),
 ('user-photo', 8.0),
 ('&', 8.0),
 ('I', 8.0),
 ('a', 7.0),
 ('per', 7.0),
 ('più', 7.0),
 ('milioni', 7.0),
 ('shopping', 6.0),
 ('i', 6.0),
 ('del', 6.0),
 ('stars', 6.0),
 ('—', 6.0),
 ('slideshow', 6.0),
 ('Chi', 5.0),
 ('siamo', 5.0),
 ('Scopri', 5.0),
 ('offerte', 5.0),
 ('al', 5.0),
 ('negozio.', 5.0),
 ('2020', 5.0),
 ('Lavora', 4.0),
 ('noi', 4.0),
 ('è', 4.0),
 ('nostra', 4.0),
 ('fino', 4.0),
 ('retailers', 4.0),
 ('shoppers', 4.0),
 ('negozi', 4.0),
 ('nostro', 4.0),
 ('to', 4.0),
 ('Board', 4.0),
 ('nei', 4.0),
 ('it', 4.0),
 ('su', 3.0),
 ('mobile', 3.0),
 ('ai', 3.0),
 ('nostri', 3.0),
 ('migliore', 3.0),
 ('che', 3.0),
 ('nel', 3.0),
 ('delle', 3.0),
 ('cui', 3.0),
 ('La', 3.0),
 ('VP', 3.0),
 ('Marketing', 3.0),
 ('DoveConviene', 3.0),
 ('ShopFully', 3.0),
 ('the', 3.0),
 ('is', 3.0),
 ('negozio', 3.0),
 ('blog-post-thumb', 3.0),
 ('brand-logo', 2.0),
 ('Homepage', 2

In [30]:
content4.first()

('di', 30.0)

In [31]:
content4.take(10)

[('di', 30.0),
 ('in', 21.0),
 ('e', 12.0),
 ('con', 11.0),
 ('la', 11.0),
 ('le', 9.0),
 ('il', 9.0),
 ('user-photo', 8.0),
 ('&', 8.0),
 ('I', 8.0)]

In [32]:
accum_len = sc.accumulator(0)
accum_sum = sc.accumulator(0)

def set_accum(tuple_rdd):
    accum_sum.add(tuple_rdd[1])
    accum_len.add(1)

content4.foreach(set_accum)


In [35]:
accum_len.value


470

In [36]:
accum_sum.value

836.0

In [38]:
accum_sum.value/accum_len.value

1.778723404255319

In [40]:
# Plain Python

tot_words = content4.collect()
tot_len = 0
tot_sum = 0

def calculate_mean(values):
    global tot_len, tot_sum
    
    for(word, occur) in values:
        tot_sum = tot_sum + occur
        tot_len = tot_len + 1
        
calculate_mean(tot_words)
print(tot_sum/tot_len)


1.778723404255319
