# Spark and Social Media
***
<a href='https://github.com/pick1'> <img src='sparkjupyter.png' /></a>
***

## Importing Spark

In [1]:
from pyspark import SparkConf, SparkContext
import collections

## Setting up Spark

In [4]:
conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf = conf)

## Loading the Dataset

In [5]:
lines = sc.textFile("fakefriends.csv")

## Function for slicing the dataset for Ages and Friend Count

In [6]:
def lineparser(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

## Setting up new RDD (Spark Dataframe)

In [7]:
rdd = lines.map(lineparser)

## Getting Totals by Age
**.mapValues leaves the keys untouched. Transforms friends info into tuple with keeping age untouched and adding a '1' for adding in .reduceByKey.**

** Next, .reduceByKey adds the total number of friends per age & the total instances of an age.**

In [13]:
%timeit 
totalsByAge = rdd.mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

## Getting Averages by Age

In [15]:
averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])

11.5 µs ± 149 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Go Do Something Spark!

In [18]:
results = averagesByAge.collect()
for result in results:
    print(result)

(18, 343.375)
(19, 213.27272727272728)
(20, 165.0)
(21, 350.875)
(22, 206.42857142857142)
(23, 246.3)
(24, 233.8)
(25, 197.45454545454547)
(26, 242.05882352941177)
(27, 228.125)
(28, 209.1)
(29, 215.91666666666666)
(30, 235.8181818181818)
(31, 267.25)
(32, 207.9090909090909)
(33, 325.3333333333333)
(34, 245.5)
(35, 211.625)
(36, 246.6)
(37, 249.33333333333334)
(38, 193.53333333333333)
(39, 169.28571428571428)
(40, 250.8235294117647)
(41, 268.55555555555554)
(42, 303.5)
(43, 230.57142857142858)
(44, 282.1666666666667)
(45, 309.53846153846155)
(46, 223.69230769230768)
(47, 233.22222222222223)
(48, 281.4)
(49, 184.66666666666666)
(50, 254.6)
(51, 302.14285714285717)
(52, 340.6363636363636)
(53, 222.85714285714286)
(54, 278.0769230769231)
(55, 295.53846153846155)
(56, 306.6666666666667)
(57, 258.8333333333333)
(58, 116.54545454545455)
(59, 220.0)
(60, 202.71428571428572)
(61, 256.22222222222223)
(62, 220.76923076923077)
(63, 384.0)
(64, 281.3333333333333)
(65, 298.2)
(66, 276.4444444444444