In [1]:
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Point PySpark at the Spark installation and the Python interpreter to use.
# setdefault() only fills in a value when the variable is not already set, so
# the machine-specific defaults below no longer clobber a configuration that
# was exported before the kernel started (e.g. on another host or in CI).
os.environ.setdefault("SPARK_HOME", "/usr/local/spark")
os.environ.setdefault("PYSPARK_PYTHON", "/home/pigidser/anaconda3/bin/python3")
os.environ.setdefault("PYSPARK_DRIVER_PYTHON", "python3")
os.environ.setdefault("PYSPARK_SUBMIT_ARGS", "pyspark-shell")

# Create (or reuse) a session running against the "local" master.
spark = SparkSession.builder.master("local").appName("spark_test").getOrCreate()

In [2]:
# Load the countries CSV into a DataFrame.
# FAILFAST makes the read raise on a malformed record; the first row is used
# as the header and column types are inferred from the data.
csv_reader = spark.read.format("csv").options(
    mode="FAILFAST",
    inferSchema="true",
    header="true",
    path="countries_of_the_world.csv",
)
cdf = csv_reader.load()

In [3]:
# Expose the DataFrame to Spark SQL under the table name "countries".
cdf.createOrReplaceTempView(name="countries")

In [4]:
# Simple select: count the rows of the "countries" view.
row_count = spark.sql("select count(*) from countries")
row_count.show()

+--------+
|count(1)|
+--------+
|     227|
+--------+



In [5]:
# Table information: column names and data types of the "countries" view.
table_info = spark.sql("describe table countries")
table_info.show()

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|             Country|   string|   null|
|              Region|   string|   null|
|          Population|      int|   null|
|      Area (sq. mi.)|      int|   null|
|Pop. Density (per...|   string|   null|
|Coastline (coast/...|   string|   null|
|       Net migration|   string|   null|
|Infant mortality ...|   string|   null|
|  GDP ($ per capita)|      int|   null|
|        Literacy (%)|   string|   null|
|   Phones (per 1000)|   string|   null|
|          Arable (%)|   string|   null|
|           Crops (%)|   string|   null|
|           Other (%)|   string|   null|
|             Climate|   string|   null|
|           Birthrate|   string|   null|
|           Deathrate|   string|   null|
|         Agriculture|   string|   null|
|            Industry|   string|   null|
|             Service|   string|   null|
+--------------------+---------+-------+



In [6]:
# Whitespace check
# List every distinct region together with its string length, untruncated,
# so that padding spaces in the values become visible. (Only the region
# column is inspected by this query.)
region_lengths = spark.sql("select distinct region,length(region) from countries")
region_lengths.show(n=100, truncate=False)

+-----------------------------------+--------------+
|region                             |length(region)|
+-----------------------------------+--------------+
|EASTERN EUROPE                     |35            |
|OCEANIA                            |35            |
|SUB-SAHARAN AFRICA                 |35            |
|NORTHERN AMERICA                   |35            |
|NEAR EAST                          |35            |
|WESTERN EUROPE                     |35            |
|BALTICS                            |35            |
|ASIA (EX. NEAR EAST)               |29            |
|NORTHERN AFRICA                    |35            |
|C.W. OF IND. STATES                |20            |
|LATIN AMER. & CARIB                |23            |
+-----------------------------------+--------------+



In [8]:
# Mixing the two APIs: SQL for the projection, DataFrame API for the filter.
# The filter is a prefix match on the country name rather than an equality test.
is_russia = F.col('country').startswith('Russia')
(
    spark.sql("select country,region,population from countries")
    .filter(is_russia)
    .show()
)

+-------+--------------------+----------+
|country|              region|population|
+-------+--------------------+----------+
|Russia |C.W. OF IND. STATES | 142893540|
+-------+--------------------+----------+



In [10]:
# Logical plans
# Express the same aggregation (countries per region, largest first) once as
# a SQL query and once with the DataFrame API, then compare their plans to
# confirm they are essentially the same (differences exist but are cosmetic).

sqlQ = spark.sql("select region,count(*) as ncountries from countries group by region order by ncountries desc")
dfQ = cdf.groupBy('Region').count().withColumnRenamed("count","ncountries").sort(F.desc('ncountries'))
dfQ.show()
sqlQ.show()
# explain() with no arguments prints only the physical plan. extended=True
# also prints the parsed, analyzed and optimized logical plans, which is what
# this comparison is about.
dfQ.explain(extended=True)
sqlQ.explain(extended=True)

+--------------------+----------+
|              Region|ncountries|
+--------------------+----------+
|SUB-SAHARAN AFRIC...|        51|
|LATIN AMER. & CAR...|        45|
|ASIA (EX. NEAR EA...|        28|
|WESTERN EUROPE   ...|        28|
|OCEANIA          ...|        21|
|NEAR EAST        ...|        16|
|EASTERN EUROPE   ...|        12|
|C.W. OF IND. STATES |        12|
|NORTHERN AFRICA  ...|         6|
|NORTHERN AMERICA ...|         5|
|BALTICS          ...|         3|
+--------------------+----------+

+--------------------+----------+
|              region|ncountries|
+--------------------+----------+
|SUB-SAHARAN AFRIC...|        51|
|LATIN AMER. & CAR...|        45|
|ASIA (EX. NEAR EA...|        28|
|WESTERN EUROPE   ...|        28|
|OCEANIA          ...|        21|
|NEAR EAST        ...|        16|
|EASTERN EUROPE   ...|        12|
|C.W. OF IND. STATES |        12|
|NORTHERN AFRICA  ...|         6|
|NORTHERN AMERICA ...|         5|
|BALTICS          ...|         3|
+------------

In [11]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()