In [0]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

SpSession = SparkSession \
  .builder \
  .appName("DBA - Spark Operations") \
  .getOrCreate()

SpContext = SpSession.sparkContext

In [0]:
inputPath = "/FileStore/tables/auto_data.csv"
autoData = SpContext.textFile(inputPath)
autoData.cache()

Out[2]: /FileStore/tables/auto_data.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [0]:
autoData.first()

Out[3]: 'MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE'

In [0]:
autoData.count()

Out[4]: 198

In [0]:
autoData.take(5)

Out[5]: ['MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118',
 'chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151',
 'mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348']

In [0]:
for line in autoData.collect():
  print(line)

MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE
subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118
chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151
mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195
toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348
mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,37,41,5389
honda,gas,std,two,hatchback,fwd,four,60,5500,38,42,5399
nissan,gas,std,two,sedan,fwd,four,69,5200,31,37,5499
dodge,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572
plymouth,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572
mazda,gas,std,two,hatchback,fwd,four,68,5000,31,38,6095
mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,31,38,6189
dodge,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229
plymouth,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229
chevrolet,gas,std,two,hatchback,fwd,four,70,5400,38,43,6295
toyota,gas,std,two,hatchback,fwd,four,62,4800,31,38,6338
dodge,gas,std,two,hatchback,fwd,four,68,5500,31,38,6377

In [0]:
outputPath = "auto-data-saved.csv"
autoDataFile = open(outputPath,"w")
autoDataFile.write("\n".join(autoData.collect()))
autoDataFile.close()

In [0]:
print("Transformations")
print("autoData count:", autoData.count())

Transformations
autoData count: 198


In [0]:
tsvData = autoData.map(lambda x : x.replace(",","\t"))
print("tsvData.take")
for i in tsvData.take(5):
  print(i)

tsvData.take
MAKE	FUELTYPE	ASPIRE	DOORS	BODY	DRIVE	CYLINDERS	HP	RPM	MPG-CITY	MPG-HWY	PRICE
subaru	gas	std	two	hatchback	fwd	four	69	4900	31	36	5118
chevrolet	gas	std	two	hatchback	fwd	three	48	5100	47	53	5151
mazda	gas	std	two	hatchback	fwd	four	68	5000	30	31	5195
toyota	gas	std	two	hatchback	fwd	four	62	4800	35	39	5348


In [0]:
toyotaData = autoData.filter(lambda x : "toyota" in x)
print(toyotaData.count())
for i in toyotaData.take(5):
  print(i)

32
toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348
toyota,gas,std,two,hatchback,fwd,four,62,4800,31,38,6338
toyota,gas,std,four,hatchback,fwd,four,62,4800,31,38,6488
toyota,gas,std,four,wagon,fwd,four,62,4800,31,37,6918
toyota,gas,std,four,sedan,fwd,four,70,4800,30,37,6938


In [0]:
words = toyotaData.flatMap(lambda line : line.split(","))
print("words.count: ", words.count())
for i in words.take(20):
  print(i)

words.count:  384
toyota
gas
std
two
hatchback
fwd
four
62
4800
35
39
5348
toyota
gas
std
two
hatchback
fwd
four
62


In [0]:
for numbData in words.distinct().collect():
  print(numbData)

std
two
hatchback
fwd
four
5348
6338
sedan
70
diesel
56
7788
4wd
34
29
8238
hardtop
116
24
4200
8948
9258
112
6600
26
9298
9988
73
33
11248
20
15690
19
15750
161
15998
16558
convertible
toyota
gas
62
4800
35
39
31
38
6488
wagon
37
6918
30
6938
7198
47
7738
4500
27
32
7898
36
rwd
8058
28
8358
8449
8778
92
9538
9639
9989
turbo
10698
10898
11199
11549
six
156
5200
17669


In [0]:
for numbData in toyotaData.distinct().collect():
  print(numbData)

toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348
toyota,gas,std,four,wagon,fwd,four,62,4800,31,37,6918
toyota,gas,std,four,wagon,4wd,four,62,4800,27,32,7898
toyota,diesel,std,four,sedan,fwd,four,56,4500,34,36,7898
toyota,gas,std,two,hardtop,rwd,four,116,4800,24,30,8449
toyota,gas,std,four,sedan,fwd,four,92,4200,29,34,8948
toyota,gas,std,two,sedan,rwd,four,112,6600,26,29,9298
toyota,gas,std,two,hatchback,rwd,four,112,6600,26,29,9538
toyota,gas,std,two,hardtop,rwd,four,116,4800,24,30,9639
toyota,gas,std,four,hatchback,fwd,four,92,4200,27,32,9988
toyota,gas,std,two,hatchback,rwd,four,116,4800,24,30,9989
toyota,diesel,turbo,four,sedan,fwd,four,73,4500,30,33,10698
toyota,gas,std,four,sedan,fwd,four,92,4200,27,32,10898
toyota,gas,std,two,hatchback,rwd,six,161,5200,20,24,16558
toyota,gas,std,two,hatchback,fwd,four,62,4800,31,38,6338
toyota,gas,std,four,hatchback,fwd,four,62,4800,31,38,6488
toyota,gas,std,four,sedan,fwd,four,70,4800,30,37,6938
toyota,gas,std,four,hatchback,fwd,four,70,

In [0]:
words1 = SpContext.parallelize(["hello","war","peace","world"])
words2 = SpContext.parallelize(["war","peace","universe"])

In [0]:
print("Union")
for union in words1.union(words2).distinct().collect():
  print(union)

Union
peace
hello
universe
world
war


In [0]:
print("Interseccion")
for intersects in words1.intersection(words2).collect():
  print(intersects)

Interseccion
peace
war


In [0]:
print("subtracs")
for subtracs in words1.subtract(words2).collect():
  print(subtracs)

subtracs
hello
world


In [0]:
def cleanRDD(autoStr):
  if isinstance(autoStr,int):
    return autoStr
  attList = autoStr.split(",")
  if attList[3] != "DOORS":
    if attList[3] != "two":
      attList[3]="2"
    else :
      attList[3]="4"
  attList[5] = attList[5].upper()
  return ",".join(attList)

cleanedData = autoData.map(cleanRDD)
for i in cleanedData.collect():
  print(i)

MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE
subaru,gas,std,4,hatchback,FWD,four,69,4900,31,36,5118
chevrolet,gas,std,4,hatchback,FWD,three,48,5100,47,53,5151
mazda,gas,std,4,hatchback,FWD,four,68,5000,30,31,5195
toyota,gas,std,4,hatchback,FWD,four,62,4800,35,39,5348
mitsubishi,gas,std,4,hatchback,FWD,four,68,5500,37,41,5389
honda,gas,std,4,hatchback,FWD,four,60,5500,38,42,5399
nissan,gas,std,4,sedan,FWD,four,69,5200,31,37,5499
dodge,gas,std,4,hatchback,FWD,four,68,5500,37,41,5572
plymouth,gas,std,4,hatchback,FWD,four,68,5500,37,41,5572
mazda,gas,std,4,hatchback,FWD,four,68,5000,31,38,6095
mitsubishi,gas,std,4,hatchback,FWD,four,68,5500,31,38,6189
dodge,gas,std,2,hatchback,FWD,four,68,5500,31,38,6229
plymouth,gas,std,2,hatchback,FWD,four,68,5500,31,38,6229
chevrolet,gas,std,4,hatchback,FWD,four,70,5400,38,43,6295
toyota,gas,std,4,hatchback,FWD,four,62,4800,31,38,6338
dodge,gas,std,4,hatchback,FWD,four,68,5500,31,38,6377
honda,gas,std,4,hatchback,FWD,fou