In [None]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('RatingsHistogram')
sc = SparkContext(conf = conf)

lines = sc.textFile('ml-100k/u.data')#an RDD, not a Python list

ratings = lines.map(lambda x: x.split()[2])#split separates by whitespace
print(ratings)

result = ratings.countByValue()

sortedResults = sorted(result.items(), key=lambda x: x[0])

for (rating, count) in sortedResults:
    print(rating + ' ' + str(count))

In [None]:
from pyspark import SparkConf,SparkContext
conf = SparkConf().setMaster('local').setAppName('RatingsHistogram')
sc = SparkContext(conf = conf)

def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age,numFriends)

lines = sc.textFile('fakefriends.csv')

rdd = lines.map(parseLine)

totalByAge = rdd.mapValues(lambda x: (x,1)).reduceByKey(
    lambda x, y: (x[0] + y[0], x[1] + y[1])) #added the '1' in order to get the count. reduceByKey sums up based on key 'age'
    
averagesByAge = totalByAge.mapValues(lambda x: x[0] / x[1])

print(averagesByAge)

results = averagesByAge.sortByKey().collect()

for result in results:
    print(result)

In [None]:
from pyspark import SparkConf,SparkContext
conf = SparkConf().setMaster('local').setAppName('RatingsHistogram')
sc = SparkContext(conf = conf)

lines = sc.textFile('ml-100k/u.data')
movies = lines.map(lambda x: (int(x.split()[1]), 1))
moviesCounts = movies.reduceByKey(lambda x, y : x+y)

flipped = moviesCounts.map(lambda z: (z[1],z[0]))

sortedMovies = flipped.sortByKey(ascending=False)

results = sortedMovies.collect()

for result in results:
    print(result)

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

import collections

spark = SparkSession.builder.appName('SparkSQL').getOrCreate()

def mapper(line):
    fields = line.split(',')
    return Row(ID=int(fields[0]), name=str(fields[1].encode(
        'utf-8')), age=int(fields[2]), numFriends=int(fields[3]))

lines = spark.sparkContext.textFile('fakefriends.csv')
people = lines.map(mapper)

schemaPeople = spark.createDataFrame(people).cache()
schemaPeople.createOrReplaceTempView('people')

teenagers = spark.sql(
'SELECT * FROM people WHERE age >= 13 AND age <= 19')

for teen in teenagers.collect():
    print(teen)
    
schemaPeople.groupBy('age').count().orderBy('age').show()

spark.stop()

Row(ID=21, age=19, name="b'Miles'", numFriends=268)
Row(ID=52, age=19, name="b'Beverly'", numFriends=269)
Row(ID=54, age=19, name="b'Brunt'", numFriends=5)
Row(ID=106, age=18, name="b'Beverly'", numFriends=499)
Row(ID=115, age=18, name="b'Dukat'", numFriends=397)
Row(ID=133, age=19, name="b'Quark'", numFriends=265)
Row(ID=136, age=19, name="b'Will'", numFriends=335)
Row(ID=225, age=19, name="b'Elim'", numFriends=106)
Row(ID=304, age=19, name="b'Will'", numFriends=404)
Row(ID=341, age=18, name="b'Data'", numFriends=326)
Row(ID=366, age=19, name="b'Keiko'", numFriends=119)
Row(ID=373, age=19, name="b'Quark'", numFriends=272)
Row(ID=377, age=18, name="b'Beverly'", numFriends=418)
Row(ID=404, age=18, name="b'Kasidy'", numFriends=24)
Row(ID=409, age=19, name="b'Nog'", numFriends=267)
Row(ID=439, age=18, name="b'Data'", numFriends=417)
Row(ID=444, age=18, name="b'Keiko'", numFriends=472)
Row(ID=492, age=19, name="b'Dukat'", numFriends=36)
Row(ID=494, age=18, name="b'Kasidy'", numFriends=194)