In [25]:
# Initialise findspark before the pyspark imports in the next cell.
# NOTE(review): presumably this is what makes the local Spark installation's
# pyspark package importable — confirm SPARK_HOME is set on this machine.
import findspark
findspark.init()

In [26]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext

### Method 1 for Creating SparkContext

In [42]:
# Method 1: let the SparkSession builder create (or reuse) a session.
# The SparkContext is read from this session in the following cell.
pyspark = (
    SparkSession.builder
    .master('local[4]')
    .appName('creatingRDDs')
    .config('spark.executor.memory', '4g')
    .config('spark.driver.memory', '2g')
    .getOrCreate()
)

In [43]:
# The session exposes its underlying SparkContext as an attribute; keep it as
# `sc` for the RDD API.
sc = pyspark.sparkContext

In [29]:
# Stop this context so the next method can build a context of its own.
sc.stop()

### Method 2 for Creating SparkContext

In [30]:
# Method 2: describe the settings in a SparkConf first, then hand that conf
# to the SparkSession builder.
#
# Spark properties must be set with SparkConf.set(). setExecutorEnv() only
# exports a variable into the executors' environment, so using it for
# 'spark.executor.memory' / 'spark.driver.memory' leaves those properties unset.
conf = (
    SparkConf()
    .setMaster('local[4]')
    .setAppName('creatingRdds')
    .set('spark.executor.memory', '4g')
    # NOTE(review): driver memory is presumably only honoured when the driver
    # JVM has not been launched yet — confirm for this kernel.
    .set('spark.driver.memory', '4g')
)

# Build (or reuse) the session from the prepared configuration.
pyspark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [31]:
# Take the SparkContext that backs the session built from `conf`.
sc = pyspark.sparkContext

In [32]:
# Stop this context before the next method creates a SparkContext directly.
sc.stop()

### Method 3 for Creating SparkContext

In [33]:
# Method 3: build a SparkConf and pass it straight to SparkContext (next cell),
# without going through SparkSession.
#
# Spark properties must be set with SparkConf.set(). setExecutorEnv() only
# exports a variable into the executors' environment, so using it for
# 'spark.executor.memory' / 'spark.driver.memory' leaves those properties unset.
conf = (
    SparkConf()
    .setMaster('local[4]')
    .setAppName('creatingRdds')
    .set('spark.executor.memory', '4g')
    # NOTE(review): driver memory is presumably only honoured when the driver
    # JVM has not been launched yet — confirm for this kernel.
    .set('spark.driver.memory', '4g')
)

In [34]:
# Create the SparkContext directly from the configuration.
# getOrCreate() is used instead of the bare constructor so that re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once"; if a
# context is already active it is returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

### Creating RDDs from Python lists

In [35]:
# Distribute a list of (name, number) tuples; each tuple becomes one RDD element.
rdd1 = sc.parallelize(
    [
        ('Paul', 25),
        ('Jack', 29),
        ('Amanda', 38),
        ('Jane', 33),
    ]
)

In [49]:
# Bring the first two elements back to the driver as a Python list.
rdd1.take(2)

[('Paul', 25), ('Jack', 29)]

In [46]:
# Number of elements in the RDD (one per tuple in the source list).
rdd1.count()

4

In [37]:
# Distribute a list of lists; every inner list is kept whole as a single element.
rddNumbers = sc.parallelize(
    [
        [1, 2, 3],
        [4, 5, 6],
    ]
)

In [48]:
# The first element is the whole inner list, not a single number.
rddNumbers.take(1)

[[1, 2, 3]]

In [47]:
# Counts the inner lists (2), not the individual numbers they contain.
rddNumbers.count()

2

### Creating RDDs from Python dictionaries (via pandas and Spark DataFrames)

In [39]:
# Column-oriented sample data: each key is a column name and each value holds
# that column's entries, in row order.
my_dict = dict(
    Student=['Paul', 'Amanda', 'Jack'],
    Points=[70, 80, 90],
)

In [40]:
import pandas as pd

# Turn the column-oriented dictionary into a pandas DataFrame
# (keys become column names).
df = pd.DataFrame(data=my_dict)

# Preview the first rows of the frame.
df.head()

Unnamed: 0,Student,Points
0,Paul,70
1,Amanda,80
2,Jack,90


In [44]:
# `pyspark` was last bound while demonstrating Method 2, and the SparkContext
# behind that session has since been stopped; the live context `sc` was created
# directly in Method 3. Re-acquire a session attached to the active context so
# this cell also works on a fresh top-to-bottom run.
pyspark = SparkSession.builder.getOrCreate()

# Convert the pandas DataFrame into a Spark DataFrame.
# NOTE: despite its name, rddFromDF is a Spark DataFrame, not an RDD; the RDD
# is extracted from it in the next cell.
rddFromDF = pyspark.createDataFrame(df)
rddFromDF.show()

+-------+------+
|Student|Points|
+-------+------+
|   Paul|    70|
| Amanda|    80|
|   Jack|    90|
+-------+------+



In [45]:
# .rdd exposes the Spark DataFrame's rows as an RDD; each element is a Row
# with one field per column (Student, Points).
rddFromPandas = rddFromDF.rdd
# Peek at the first two Row objects.
rddFromPandas.take(2)

[Row(Student='Paul', Points=70), Row(Student='Amanda', Points=80)]

### Creating RDDs from Text Files

In [50]:
# Load the file as an RDD of text lines (one element per line).
# NOTE(review): the relative path is presumably resolved against the notebook's
# working directory — confirm Example.csv is located there.
rdd_text = sc.textFile('Example.csv')

In [51]:
# Lines come back as raw, unparsed strings: the header row is the first
# element, fields are separated by ';' and UnitPrice uses a decimal comma.
rdd_text.take(10)

['InvoiceNo;StockCode;Description;Quantity;InvoiceDate;UnitPrice;CustomerID;Country',
 '536365;85123A;WHITE HANGING HEART T-LIGHT HOLDER;6;1.12.2010 08:26;2,55;17850;United Kingdom',
 '536365;71053;WHITE METAL LANTERN;6;1.12.2010 08:26;3,39;17850;United Kingdom',
 '536365;84406B;CREAM CUPID HEARTS COAT HANGER;8;1.12.2010 08:26;2,75;17850;United Kingdom',
 '536365;84029G;KNITTED UNION FLAG HOT WATER BOTTLE;6;1.12.2010 08:26;3,39;17850;United Kingdom',
 '536365;84029E;RED WOOLLY HOTTIE WHITE HEART.;6;1.12.2010 08:26;3,39;17850;United Kingdom',
 '536365;22752;SET 7 BABUSHKA NESTING BOXES;2;1.12.2010 08:26;7,65;17850;United Kingdom',
 '536365;21730;GLASS STAR FROSTED T-LIGHT HOLDER;6;1.12.2010 08:26;4,25;17850;United Kingdom',
 '536366;22633;HAND WARMER UNION JACK;6;1.12.2010 08:28;1,85;17850;United Kingdom',
 '536366;22632;HAND WARMER RED POLKA DOT;6;1.12.2010 08:28;1,85;17850;United Kingdom']