In [1]:
from pyspark import SparkConf, SparkContext

# Create the Spark context used by every cell below. getOrCreate() returns the
# context that is already running when this cell is executed again in the same
# kernel, whereas calling the SparkContext constructor a second time fails.
sc = SparkContext.getOrCreate(
    SparkConf().setAppName("MY-APP-NAME").setMaster("local[*]")
)

In [101]:
# A plain Python list and the RDD built from it; printing the RDD shows its
# description, not its elements.
numbers = list(range(1, 6))
rdd_numbers = sc.parallelize(numbers)
print(numbers, rdd_numbers, sep="\n")

[1, 2, 3, 4, 5]
ParallelCollectionRDD[253] at parallelize at PythonRDD.scala:195


In [3]:
# Split the sentence on single spaces and distribute the words over 2 slices.
sentence_words = "nel mezzo del cammin di nostra vita"
words = sentence_words.split(" ")
print(words)
rdd_words = sc.parallelize(words, numSlices=2)
print(rdd_words)
del sentence_words  # keep the notebook namespace identical to the original cell

['nel', 'mezzo', 'del', 'cammin', 'di', 'nostra', 'vita']
ParallelCollectionRDD[1] at parallelize at PythonRDD.scala:195


In [2]:
# Build an RDD backed by the text file stored at this HDFS location.
shakespeare_rdd = sc.textFile("hdfs://kddrtserver11.isti.cnr.it:9000/user/hpsa00/comedies.txt")

In [4]:
# Printing an RDD shows its description (source path and RDD type), not its contents.
print(shakespeare_rdd)

hdfs://kddrtserver11.isti.cnr.it:9000/user/hpsa00/comedies.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0


In [6]:
# Peek at the first few lines only. collect() would copy the entire file to
# the driver and flood the notebook output; take(n) bounds both.
p = shakespeare_rdd.take(10)
print(p)



In [5]:
# Rebind `numbers` to an RDD with repeated values, used by the examples below.
numbers = sc.parallelize([1, 2, 2, 2, 1, 1, 4, 3, 3, 5, 5])

In [7]:
# collect() returns every element of the RDD as a Python list.
numbers.collect()

[1, 2, 2, 2, 1, 1, 4, 3, 3, 5, 5]


In [8]:
# first() returns the first element of the RDD.
numbers.first()

1

In [9]:
# take(4) returns the first four elements, in their original order.
numbers.take(4)

[1, 2, 2, 2]

In [10]:
# takeOrdered(4) returns the four smallest elements in ascending order.
numbers.takeOrdered(4)

[1, 1, 1, 2]

In [11]:
# Draw a fixed-size random sample; the explicit seed makes the draw repeatable.
withReplacement, numberToTake, randomSeed = True, 4, 123456
numbers.takeSample(withReplacement, num=numberToTake, seed=randomSeed)

[4, 2, 2, 5]

In [17]:
# Fold the RDD into a single value by summing its elements pairwise.
numbers.reduce(lambda total, value: total + value)

29

In [18]:
# distinct() keeps a single copy of each value.
distinct_numbers = numbers.distinct()

In [19]:
# Note in the output that the distinct values do not come back in sorted order.
print(distinct_numbers.collect())

[4, 1, 5, 2, 3]


In [20]:
# Keep only the values that leave no remainder when divided by two.
even_numbers = distinct_numbers.filter(lambda n: not n % 2)
print(even_numbers.collect())

[4, 2]


In [21]:
# The integers 0..19 as an RDD.
data = sc.parallelize(range(20))

In [22]:
# Sample roughly 20% of the elements without replacement. The seed (defined
# in the takeSample cell above) makes the sample identical on every run, so
# the notebook stays reproducible under Restart & Run All.
sampled_data = data.sample(withReplacement=False, fraction=0.20, seed=randomSeed)

In [26]:
# Show the elements that ended up in the sample.
print(sampled_data.collect())

[2, 10, 15]


In [27]:
# Rebind `data` to the integers 0..9 for the map/flatMap comparison below.
data = sc.parallelize(range(10))

In [38]:
# map() emits exactly one output per input: here a two-item list [v-2, v+2].
# (Despite its name, this RDD does not hold squares.)
squared_data = data.map(lambda v: [v - 2, v + 2])

In [39]:
# Input values, then the map() result: one nested list per input element.
print(data.collect())
print(squared_data.collect())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[[-2, 2], [-1, 3], [0, 4], [1, 5], [2, 6], [3, 7], [4, 8], [5, 9], [6, 10], [7, 11]]


In [40]:
# flatMap() applies the same function as the map() example but flattens the
# returned lists into a single sequence of values.
squared_data = data.flatMap(lambda v: [v - 2, v + 2])

In [41]:
# Input values, then the flatMap() result: a flat list with two values per input.
print(data.collect())
print(squared_data.collect())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[-2, 2, -1, 3, 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11]


In [47]:
# Break every element of the corpus RDD into whitespace-separated tokens and
# flatten them into one RDD of tokens.
shakespeare_split_rdd = shakespeare_rdd.flatMap(lambda line: line.split())

In [48]:
# Show a small preview of the tokens. Collecting and printing every token of
# the corpus would move the whole dataset to the driver and bloat the notebook.
print(shakespeare_split_rdd.take(20))



In [49]:
# Rebind `words` to an RDD of the words of the sentence (split on single spaces).
words = sc.parallelize("nel mezzo del cammin di nostra vita".split(" "))

In [50]:
# Order the words by their length (the built-in len is the sort key).
sorted_words = words.sortBy(len)

In [52]:
# Shortest words come first.
print(sorted_words.collect())

['di', 'nel', 'del', 'vita', 'mezzo', 'cammin', 'nostra']


In [53]:
# Two overlapping integer ranges: 0..6 and 3..9 (they share 3, 4, 5, 6).
data1 = sc.parallelize(range(7))
data2 = sc.parallelize(range(3, 10))

In [54]:
# union() concatenates the two RDDs; values present in both appear twice.
union = data1.union(data2)
print(union.collect())

[0, 1, 2, 3, 4, 5, 6, 3, 4, 5, 6, 7, 8, 9]


In [55]:
# intersection() keeps only the values found in both RDDs.
intersection = data1.intersection(data2)
print(intersection.collect())

[3, 4, 5, 6]


In [57]:
# Rebind `words` to the same sentence for the key/value examples below.
words = sc.parallelize("nel mezzo del cammin di nostra vita".split(" "))

In [59]:
# Turn each word into a (key, word) pair keyed by its upper-cased first letter.
print(words.collect())
keywords2 = words.keyBy(lambda word: word[0].upper())
print(keywords2.collect())

['nel', 'mezzo', 'del', 'cammin', 'di', 'nostra', 'vita']
[('N', 'nel'), ('M', 'mezzo'), ('D', 'del'), ('C', 'cammin'), ('D', 'di'), ('N', 'nostra'), ('V', 'vita')]


In [61]:
# lookup("N") returns the list of values whose key is "N".
look = keywords2.lookup("N")

In [64]:
# The words that start with "n".
print(look)

['nel', 'nostra']


In [72]:
# Classic word count: pair every word with 1, then add up the ones per word.
words = sc.parallelize("fare o non fare non esiste provare".split(" "))
word2 = words.map(lambda token: (token, 1))
print(word2.collect())
w = word2.reduceByKey(lambda total, increment: total + increment)
print(w.collect())

[('fare', 1), ('o', 1), ('non', 1), ('fare', 1), ('non', 1), ('esiste', 1), ('provare', 1)]
[('provare', 1), ('fare', 2), ('non', 2), ('esiste', 1), ('o', 1)]


In [74]:
# sortByKey() orders the (word, count) pairs alphabetically by word.
sorted_wordcount = w.sortByKey()
print(sorted_wordcount.collect())

[('esiste', 1), ('fare', 2), ('non', 2), ('o', 1), ('provare', 1)]


In [75]:
# Two unkeyed RDDs used for the cartesian product below.
cars = sc.parallelize(["Ferrari", "Porsche", "Mercedes"]) 
colors = sc.parallelize(["red", "black", "pink"])

In [76]:
# cartesian() pairs every car with every color (3 x 3 = 9 tuples).
joined = cars.cartesian(colors)
print(joined.collect())

[('Ferrari', 'red'), ('Ferrari', 'black'), ('Ferrari', 'pink'), ('Porsche', 'red'), ('Porsche', 'black'), ('Porsche', 'pink'), ('Mercedes', 'red'), ('Mercedes', 'black'), ('Mercedes', 'pink')]


In [78]:
# Rebind both names to (key, value) pairs so they can be joined on the key.
# Key 3 exists only in `colors`.
cars = sc.parallelize([(1,"Ferrari"), (1, "Porsche"), (2, "Mercedes")]) 
colors = sc.parallelize([(1, "red"), (2, "black"), (3, "pink")])

In [80]:
# join() keeps only keys present in both RDDs, so key 3 is absent from the result.
joined = cars.join(colors)
print(joined.collect())

[(1, ('Ferrari', 'red')), (1, ('Porsche', 'red')), (2, ('Mercedes', 'black'))]


In [82]:
# rightOuterJoin() keeps every key of `colors`; key 3 has no car, so its left value is None.
joined = cars.rightOuterJoin(colors)
print(joined.collect())

[(1, ('Ferrari', 'red')), (1, ('Porsche', 'red')), (2, ('Mercedes', 'black')), (3, (None, 'pink'))]


In [98]:
def create_bigrams(line):
    """Return the bigrams of ``line`` as ``"first_second"`` strings.

    The line is lower-cased and split on whitespace; each pair of adjacent
    tokens is joined with an underscore, in order of appearance. A line with
    fewer than two tokens yields an empty list.
    """
    tokens = line.lower().split()
    # Pair each token with its successor by zipping the list with itself shifted by one.
    return [left + "_" + right for left, right in zip(tokens, tokens[1:])]

In [99]:
# Bigram count over the comedies file: expand each element into its bigrams,
# pair every bigram with 1, sum the ones per bigram and write the result out.
text = sc.textFile("hdfs://kddrtserver11.isti.cnr.it:9000/user/hpsa00/comedies.txt")
bigrams = text.flatMap(create_bigrams)
ones = bigrams.map(lambda bigram: (bigram, 1))
counts = ones.reduceByKey(lambda total, increment: total + increment)
# NOTE(review): the output path is fixed; presumably re-running this cell
# fails if the path already exists — confirm before a second run.
counts.saveAsTextFile("comedies_bigramscount.txt")

In [None]:
# TODO: this cell also has to be added somewhere in the middle of the
# notebook, but I don't remember where.
# NOTE(review): `words` is whichever RDD an earlier cell last bound to that
# name, and `numbers` is rebound here — confirm which `words` is intended.
# NOTE(review): zip presumably requires both RDDs to have the same number of
# elements and matching partitioning — confirm for the chosen `words`.

numbers = sc.parallelize(range(7))

keywords3 = words.zip(numbers)

print(words.collect())

print(numbers.collect())

print(keywords3.collect())