In [1]:
import pyspark
from pyspark import SparkContext

## Spark Context

In [2]:
sc = SparkContext()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/26 23:44:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
nums = sc.parallelize([1, 2, 3, 4, 5])
nums.take(1)

                                                                                

[1]

In [4]:
squared = nums.map(lambda x: x**x).collect()
for num in squared:
    print(num)

1
4
27
256
3125


## SQLContext

In [5]:
from pyspark.sql import SQLContext, Row

In [6]:
sqlContext = SQLContext(sc)



In [9]:
lst = [('John',19),('Smith',29),('Adam',35),('Henry',50)]
rdd = sc.parallelize(lst)
ppl = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))
df = sqlContext.createDataFrame(ppl)

In [10]:
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)

+-----+---+
| name|age|
+-----+---+
| John| 19|
|Smith| 29|
| Adam| 35|
|Henry| 50|
+-----+---+



## Machine Learning Example with PySpark


In [41]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark import SparkFiles

ss = (
    SparkSession
    .builder
    .getOrCreate()
)
sc = ss.sparkContext


In [10]:
CSV_URL = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sc.addFile(CSV_URL)

22/09/29 00:48:17 WARN SparkContext: The path https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv has been added already. Overwriting of added paths is not supported in the current version.


In [13]:
csv_file = SparkFiles.get("adult_data.csv")
adult_df = ss.read.csv(csv_file, header=True, inferSchema=True)

print("adult_df count", adult_df.count())
adult_df.printSchema()

adult_df count 48842
root
 |-- x: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [15]:
adult_df.show(5, False)

+---+---+---------+------+------------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|x  |age|workclass|fnlwgt|education   |educational-num|marital-status    |occupation       |relationship|race |gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---+---------+------+------------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|1  |25 |Private  |226802|11th        |7              |Never-married     |Machine-op-inspct|Own-child   |Black|Male  |0           |0           |40            |United-States |<=50K |
|2  |38 |Private  |89814 |HS-grad     |9              |Married-civ-spouse|Farming-fishing  |Husband     |White|Male  |0           |0           |50            |United-States |<=50K |
|3  |28 |Local-gov|336951|Assoc-acdm  |12             |Married-civ-spouse|Protective-serv 

In [17]:
# count by education
(adult_df
    .groupBy("education")
    .count()
    .sort("count", ascending=False)
    .show(100, False)
)

+------------+-----+
|education   |count|
+------------+-----+
|HS-grad     |15784|
|Some-college|10878|
|Bachelors   |8025 |
|Masters     |2657 |
|Assoc-voc   |2061 |
|11th        |1812 |
|Assoc-acdm  |1601 |
|10th        |1389 |
|7th-8th     |955  |
|Prof-school |834  |
|9th         |756  |
|12th        |657  |
|Doctorate   |594  |
|5th-6th     |509  |
|1st-4th     |247  |
|Preschool   |83   |
+------------+-----+



In [24]:
# get a summary statistics
adult_df \
    .describe("age") \
    .show()

adult_df \
    .select("age") \
    .distinct() \
    .orderBy("age") \
    .show()

+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|             48842|
|   mean| 38.64358543876172|
| stddev|13.710509934443525|
|    min|                17|
|    max|                90|
+-------+------------------+

+---+
|age|
+---+
| 17|
| 18|
| 19|
| 20|
| 21|
| 22|
| 23|
| 24|
| 25|
| 26|
| 27|
| 28|
| 29|
| 30|
| 31|
| 32|
| 33|
| 34|
| 35|
| 36|
+---+
only showing top 20 rows



In [28]:
# filter
adult_df.filter(adult_df.age > 20).count()

45219

In [48]:
# group data and compute staistical operations like mean
(
    adult_df
    .groupBy("hours-per-week")
    .agg({
        "age": "mean"
    })
    .withColumnRenamed("avg(age)", "avg_age")
    .withColumn("hours_per_day", col("hours-per-week") / 7)
    .orderBy("avg_age", ascending=False)
    .select("hours-per-week", "hours_per_day", "avg_age")
    .show()
)

+--------------+-------------------+------------------+
|hours-per-week|      hours_per_day|           avg_age|
+--------------+-------------------+------------------+
|            79| 11.285714285714286|              64.0|
|            74| 10.571428571428571|              62.0|
|            82| 11.714285714285714|              61.0|
|             1|0.14285714285714285|57.111111111111114|
|             2| 0.2857142857142857|54.490566037735846|
|             3|0.42857142857142855|52.728813559322035|
|            67|  9.571428571428571|              51.5|
|            59|  8.428571428571429|48.857142857142854|
|            81| 11.571428571428571|48.333333333333336|
|             6| 0.8571428571428571| 48.17391304347826|
|             5| 0.7142857142857143| 47.77894736842105|
|             4| 0.5714285714285714| 46.82142857142857|
|             7|                1.0| 46.37777777777778|
|            86| 12.285714285714286|              46.0|
|            31|  4.428571428571429|45.833333333

In [49]:
# add age square to target DataFrame
adult_df = adult_df.withColumn("age_square", col("age") ** 2)
adult_df.printSchema()

root
 |-- x: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- age_square: double (nullable = true)



In [58]:
# change columns order
COLUMNS = ('x',
 'age',
 'age_square',
 'workclass',
 'fnlwgt',
 'education',
 'educational-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income')

adult_df = adult_df.select(*COLUMNS)
adult_df.first()


Row(x=1, age=25, age_square=625.0, workclass='Private', fnlwgt=226802, education='11th', educational-num=7, marital-status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='Black', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', income='<=50K')

In [60]:
# filter data

(
    adult_df
    .groupBy("native-country")
    .agg({
        "native-country": "count"
    })
    .sort(asc("count(native-country)"))
    .show()
)

# remove some rows
_adult_df_remove = adult_df[adult_df["native-country"] != "Holand-Netherlands"]
_adult_df_remove \
    .select("native-country") \
    .where(col("native-country") == "Holand-Netherlands") \
    .show()

+--------------------+---------------------+
|      native-country|count(native-country)|
+--------------------+---------------------+
|  Holand-Netherlands|                    1|
|             Hungary|                   19|
|            Honduras|                   20|
|            Scotland|                   21|
|          Yugoslavia|                   23|
|Outlying-US(Guam-...|                   23|
|                Laos|                   23|
|     Trinadad&Tobago|                   27|
|            Cambodia|                   28|
|                Hong|                   30|
|            Thailand|                   30|
|             Ireland|                   37|
|              France|                   38|
|             Ecuador|                   45|
|                Peru|                   46|
|              Greece|                   49|
|           Nicaragua|                   49|
|                Iran|                   59|
|              Taiwan|                   65|
|         