In [1]:
import findspark
# Initialise findspark before the pyspark imports in the following cells.
# NOTE(review): presumably this is what makes the local Spark installation
# importable from this kernel — confirm against the findspark documentation.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [3]:
# Build (or reuse) the SparkSession for this notebook: local master 'local[4]',
# application name 'AveSalariesbyOcc', 4g executor memory and 2g driver memory.
# NOTE(review): the session object is bound to the name `pyspark`, the same name
# as the package; a later `import pyspark` in this notebook would rebind it.
pyspark = (
    SparkSession.builder
    .master('local[4]')
    .appName('AveSalariesbyOcc')
    .config('spark.executor.memory', '4g')
    .config('spark.driver.memory', '2g')
    .getOrCreate()
)

In [4]:
# Take the SparkContext from the session; the RDD-based reads below go through it.
sc = pyspark.sparkContext

In [6]:
# Load the CSV as an RDD of raw text lines (the header line is still included).
# NOTE(review): the sample outputs further down show replacement characters in
# some non-ASCII occupation names — presumably the file is not UTF-8 encoded;
# confirm the file's encoding before relying on those keys.
data2 = sc.textFile('simple_data.csv')

In [7]:
# Peek at the first three raw lines; the first one is the column header.
data2.take(3)

['sirano,isim,yas,meslek,sehir,aylik_gelir',
 '1,Cemal,35,Isci,Ankara,3500',
 '2,Ceyda,42,Memur,Kayseri,4200']

In [8]:
# Drop the CSV header row. The header is identified by comparing each line with
# the actual first line of the file rather than by searching for the substring
# 'sirano', which would also discard any data row that happens to contain it.
header = data2.first()
data = data2.filter(lambda line: line != header)

In [9]:
# Check that the header row is gone and only data records remain.
data.take(3)

['1,Cemal,35,Isci,Ankara,3500',
 '2,Ceyda,42,Memur,Kayseri,4200',
 '3,Timur,30,M�zisyen,Istanbul,9000']

In [11]:
def occupation_salary(line):
    """Convert one comma-separated record into an ``(occupation, salary)`` pair.

    The record is split on ``','``; the field at index 3 (the ``meslek``
    column of the header) is returned unchanged as the occupation, and the
    field at index 5 (``aylik_gelir``) is converted to ``float`` as the salary.

    Raises:
        IndexError: if the record has fewer than six fields.
        ValueError: if the salary field cannot be parsed as a float.
    """
    fields = line.split(',')
    return (fields[3], float(fields[5]))

In [13]:
# Parse every record into an (occupation, salary) pair, giving a pair RDD
# keyed by occupation.
occ_sal_pairRDD = data.map(occupation_salary)
# Show the first ten pairs as a quick check of the parsing.
occ_sal_pairRDD.take(10)

[('Isci', 3500.0),
 ('Memur', 4200.0),
 ('M�zisyen', 9000.0),
 ('Pazarlamaci', 4200.0),
 ('Pazarlamaci', 4800.0),
 ('Memur', 4250.0),
 ('Pazarlamaci', 7300.0),
 ('M�zisyen', 12000.0),
 ('Doktor', 18000.0),
 ('Berber', 12000.0)]

In [15]:
# Pair every salary with a count of 1 so that sums and counts can be
# accumulated together per occupation: (occupation, (salary, 1)).
# NOTE: this assignment rebinds `occupation_salary`, which until now named the
# parsing function; after this cell runs, re-running the earlier
# `data.map(occupation_salary)` cell no longer passes that function.
occupation_salary = occ_sal_pairRDD.mapValues(lambda salary: (salary, 1))
occupation_salary.take(10)

[('Isci', (3500.0, 1)),
 ('Memur', (4200.0, 1)),
 ('M�zisyen', (9000.0, 1)),
 ('Pazarlamaci', (4200.0, 1)),
 ('Pazarlamaci', (4800.0, 1)),
 ('Memur', (4250.0, 1)),
 ('Pazarlamaci', (7300.0, 1)),
 ('M�zisyen', (12000.0, 1)),
 ('Doktor', (18000.0, 1)),
 ('Berber', (12000.0, 1))]

In [16]:
# Combine the (salary, count) tuples of each occupation element-wise, producing
# (occupation, (total_salary, record_count)).
occupation_salary_RBK = occupation_salary.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
occupation_salary_RBK.take(10)

[('Memur', (12200.0, 3)),
 ('Pazarlamaci', (16300.0, 3)),
 ('Tuhafiyeci', (4800.0, 1)),
 ('Isci', (3500.0, 1)),
 ('M�zisyen', (29700.0, 3)),
 ('Doktor', (32250.0, 2)),
 ('Berber', (12000.0, 1)),
 ('Tornac�', (4200.0, 1))]

In [17]:
# Divide each occupation's salary total by its record count to obtain the
# average salary: (occupation, average_salary).
occupation_average_salary = occupation_salary_RBK.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)
occupation_average_salary.take(10)

[('Memur', 4066.6666666666665),
 ('Pazarlamaci', 5433.333333333333),
 ('Tuhafiyeci', 4800.0),
 ('Isci', 3500.0),
 ('M�zisyen', 9900.0),
 ('Doktor', 16125.0),
 ('Berber', 12000.0),
 ('Tornac�', 4200.0)]