In [1]:
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Tell PySpark where Spark lives and which Python interpreters to use.
# update() assigns every key unconditionally, so values already present in
# the process environment are overwritten.
os.environ.update(
    {
        "SPARK_HOME": "/usr/local/spark",
        "PYSPARK_PYTHON": "/home/pigidser/anaconda3/bin/python3",
        "PYSPARK_DRIVER_PYTHON": "python3",
        "PYSPARK_SUBMIT_ARGS": "pyspark-shell",
    }
)

# Create (or reuse) a session with master "local" and app name "spark_test".
spark = (
    SparkSession.builder
    .master("local")
    .appName("spark_test")
    .getOrCreate()
)

In [2]:
# Load the countries CSV into `cdf`.
#   header=true      -> the first line supplies the column names
#   inferSchema=true -> column types are inferred from the data
#   mode=FAILFAST    -> a malformed record aborts the read with an error
cdf = (
    spark.read
    .format("csv")
    .options(mode="FAILFAST", inferSchema="true", header="true")
    .load("countries_of_the_world.csv")
)

In [3]:
# show schema: prints each column name with its type and nullability
# (types come from the inferSchema option used when the CSV was loaded)
cdf.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Population: integer (nullable = true)
 |-- Area (sq. mi.): integer (nullable = true)
 |-- Pop. Density (per sq. mi.): string (nullable = true)
 |-- Coastline (coast/area ratio): string (nullable = true)
 |-- Net migration: string (nullable = true)
 |-- Infant mortality (per 1000 births): string (nullable = true)
 |-- GDP ($ per capita): integer (nullable = true)
 |-- Literacy (%): string (nullable = true)
 |-- Phones (per 1000): string (nullable = true)
 |-- Arable (%): string (nullable = true)
 |-- Crops (%): string (nullable = true)
 |-- Other (%): string (nullable = true)
 |-- Climate: string (nullable = true)
 |-- Birthrate: string (nullable = true)
 |-- Deathrate: string (nullable = true)
 |-- Agriculture: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Service: string (nullable = true)



In [7]:
# list of columns: `columns` gives the column names as a plain Python list of strings
cdf.columns

['Country',
 'Region',
 'Population',
 'Area (sq. mi.)',
 'Pop. Density (per sq. mi.)',
 'Coastline (coast/area ratio)',
 'Net migration',
 'Infant mortality (per 1000 births)',
 'GDP ($ per capita)',
 'Literacy (%)',
 'Phones (per 1000)',
 'Arable (%)',
 'Crops (%)',
 'Other (%)',
 'Climate',
 'Birthrate',
 'Deathrate',
 'Agriculture',
 'Industry',
 'Service']

In [16]:
# summary statistics for the columns of cdf, printed as a text table
# (one "summary" row label per statistic)
cdf.describe().show()

+-------+------------+--------------------+--------------------+------------------+--------------------------+----------------------------+--------------------+----------------------------------+------------------+------------+-----------------+------------------+------------------+------------------+------------------+------------------+---------+-----------+--------+-------+
|summary|     Country|              Region|          Population|    Area (sq. mi.)|Pop. Density (per sq. mi.)|Coastline (coast/area ratio)|       Net migration|Infant mortality (per 1000 births)|GDP ($ per capita)|Literacy (%)|Phones (per 1000)|        Arable (%)|         Crops (%)|         Other (%)|           Climate|         Birthrate|Deathrate|Agriculture|Industry|Service|
+-------+------------+--------------------+--------------------+------------------+--------------------------+----------------------------+--------------------+----------------------------------+------------------+------------+-------------

In [17]:
# select a subset of columns (a projection, not a row filter).
# No action such as show() is called, so the cell displays only the
# resulting DataFrame's repr (column names and types), not its rows.
cdf.select('Country','Region','Population')

DataFrame[Country: string, Region: string, Population: int]

In [22]:
# Derive CountryIn by concatenating Country, the literal 'in ' and Region.
# The literal has no leading space; the displayed result still reads
# "Afghanistan in ...", which suggests the Country values carry their own
# trailing space.
(
    cdf
    .select(
        'Country',
        'Region',
        'Population',
        F.concat(F.col('Country'), F.lit('in '), F.col('Region')).alias('CountryIn'),
    )
    .show(3, truncate=False)
)

+------------+-----------------------------------+----------+----------------------------------------------+
|Country     |Region                             |Population|CountryIn                                     |
+------------+-----------------------------------+----------+----------------------------------------------+
|Afghanistan |ASIA (EX. NEAR EAST)               |31056997  |Afghanistan in ASIA (EX. NEAR EAST)           |
|Albania     |EASTERN EUROPE                     |3581655   |Albania in EASTERN EUROPE                     |
|Algeria     |NORTHERN AFRICA                    |32930091  |Algeria in NORTHERN AFRICA                    |
+------------+-----------------------------------+----------+----------------------------------------------+
only showing top 3 rows



In [25]:
# drop columns: drop() returns a new DataFrame without the named columns;
# the result is not assigned, so cdf itself keeps all of its columns
cdf.drop('Country','Region','Population').columns

['Area (sq. mi.)',
 'Pop. Density (per sq. mi.)',
 'Coastline (coast/area ratio)',
 'Net migration',
 'Infant mortality (per 1000 births)',
 'GDP ($ per capita)',
 'Literacy (%)',
 'Phones (per 1000)',
 'Arable (%)',
 'Crops (%)',
 'Other (%)',
 'Climate',
 'Birthrate',
 'Deathrate',
 'Agriculture',
 'Industry',
 'Service']

In [29]:
# add a new column holding a constant literal value (here the integer 1).
# F is the pyspark.sql.functions alias already imported in the first cell,
# so it is not imported again here.
cdf.select('Country','Region','Population').withColumn("ONE",F.lit(1)).columns

['Country', 'Region', 'Population', 'ONE']

In [31]:
# rename a column: returns a new DataFrame in which 'Area (sq. mi.)' is named 'AREA';
# the renamed column keeps its position in the column list
cdf.withColumnRenamed('Area (sq. mi.)','AREA').columns

['Country',
 'Region',
 'Population',
 'AREA',
 'Pop. Density (per sq. mi.)',
 'Coastline (coast/area ratio)',
 'Net migration',
 'Infant mortality (per 1000 births)',
 'GDP ($ per capita)',
 'Literacy (%)',
 'Phones (per 1000)',
 'Arable (%)',
 'Crops (%)',
 'Other (%)',
 'Climate',
 'Birthrate',
 'Deathrate',
 'Agriculture',
 'Industry',
 'Service']

In [35]:
# Keep only the rows whose Country contains the substring 'ru'.
# (Column.startswith can be used the same way for prefix matches.)
(
    cdf
    .select('Country', 'Region')
    .where(F.col('Country').contains('ru'))
    .show()
)

+--------+--------------------+
| Country|              Region|
+--------+--------------------+
|  Aruba |LATIN AMER. & CAR...|
|Belarus |C.W. OF IND. STATES |
| Brunei |ASIA (EX. NEAR EA...|
|Burundi |SUB-SAHARAN AFRIC...|
| Cyprus |NEAR EAST        ...|
|  Nauru |OCEANIA          ...|
|   Peru |LATIN AMER. & CAR...|
|Uruguay |LATIN AMER. & CAR...|
+--------+--------------------+



In [39]:
# create a simple DataFrame: ids 0..9 plus two constant columns ONE and TWO.
# The DataFrame is bound to `df` first and displayed in a separate statement:
# show() only prints the rows and returns None, so chaining it onto the
# assignment would leave `df` bound to None instead of the DataFrame.
df = spark.range(10).withColumn("ONE", F.lit(1)).withColumn("TWO", F.lit(2))
df.show()

+---+---+---+
| id|ONE|TWO|
+---+---+---+
|  0|  1|  2|
|  1|  1|  2|
|  2|  1|  2|
|  3|  1|  2|
|  4|  1|  2|
|  5|  1|  2|
|  6|  1|  2|
|  7|  1|  2|
|  8|  1|  2|
|  9|  1|  2|
+---+---+---+



In [40]:
# show unique values of Region; truncate=False prints every value in full
# instead of shortening long strings
cdf.select('Region').distinct().show(truncate=False)

+-----------------------------------+
|Region                             |
+-----------------------------------+
|BALTICS                            |
|C.W. OF IND. STATES                |
|ASIA (EX. NEAR EAST)               |
|WESTERN EUROPE                     |
|NORTHERN AMERICA                   |
|NEAR EAST                          |
|EASTERN EUROPE                     |
|OCEANIA                            |
|SUB-SAHARAN AFRICA                 |
|NORTHERN AFRICA                    |
|LATIN AMER. & CARIB                |
+-----------------------------------+



In [41]:
# Character length of each distinct Region value, shown for up to 100 rows.
# The values are not trimmed before measuring, so trailing padding is
# counted (e.g. BALTICS is reported with a length of 35).
(
    cdf
    .select('Region')
    .distinct()
    .withColumn('len', F.length(F.col('Region')))
    .show(100, truncate=False)
)

+-----------------------------------+---+
|Region                             |len|
+-----------------------------------+---+
|BALTICS                            |35 |
|C.W. OF IND. STATES                |20 |
|ASIA (EX. NEAR EAST)               |29 |
|WESTERN EUROPE                     |35 |
|NORTHERN AMERICA                   |35 |
|NEAR EAST                          |35 |
|EASTERN EUROPE                     |35 |
|OCEANIA                            |35 |
|SUB-SAHARAN AFRICA                 |35 |
|NORTHERN AFRICA                    |35 |
|LATIN AMER. & CARIB                |23 |
+-----------------------------------+---+



In [44]:
# monotonically_increasing_id(), alias(), replace():
#   1. add a generated ID column,
#   2. right-trim Region into a new RegCode column,
#   3. swap selected region names for short codes, in RegCode only.
regRepl = {
    'OCEANIA': 'OCN',
    'ASIA (EX. NEAR EAST)': 'ASA',
}
(
    cdf
    .select(
        F.monotonically_increasing_id().alias('ID'),
        'Country',
        F.rtrim(F.col('Region')).alias('RegCode'),
    )
    # to_replace is a dict (old value -> new value), so `value` stays None
    .replace(to_replace=regRepl, value=None, subset='RegCode')
    .show()
)

+---+------------------+-------------------+
| ID|           Country|            RegCode|
+---+------------------+-------------------+
|  0|      Afghanistan |                ASA|
|  1|          Albania |     EASTERN EUROPE|
|  2|          Algeria |    NORTHERN AFRICA|
|  3|   American Samoa |                OCN|
|  4|          Andorra |     WESTERN EUROPE|
|  5|           Angola | SUB-SAHARAN AFRICA|
|  6|         Anguilla |LATIN AMER. & CARIB|
|  7|Antigua & Barbuda |LATIN AMER. & CARIB|
|  8|        Argentina |LATIN AMER. & CARIB|
|  9|          Armenia |C.W. OF IND. STATES|
| 10|            Aruba |LATIN AMER. & CARIB|
| 11|        Australia |                OCN|
| 12|          Austria |     WESTERN EUROPE|
| 13|       Azerbaijan |C.W. OF IND. STATES|
| 14|     Bahamas, The |LATIN AMER. & CARIB|
| 15|          Bahrain |          NEAR EAST|
| 16|       Bangladesh |                ASA|
| 17|         Barbados |LATIN AMER. & CARIB|
| 18|          Belarus |C.W. OF IND. STATES|
| 19|     

In [45]:
# shut down the Spark session created in the first cell
spark.stop()