In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql.types import StructType, StructField 
from pyspark.sql.types import StringType, IntegerType, ArrayType
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql.functions import col
# Build (or reuse) the session through the builder API rather than calling the
# SparkSession constructor directly: getOrCreate() returns the already-running
# session when this cell is executed again in the same kernel.
# Master is "local[1]", same as the original configuration.
spark = SparkSession.builder.master("local[1]").getOrCreate()
# Keep `sc` available under the same name for any code that uses the context.
sc = spark.sparkContext

In [2]:
# Explicit schema for the input file: three string columns followed by two
# integer columns. Every column is declared nullable.
schema = (
    StructType()
    .add('fname', StringType(), True)
    .add('lname', StringType(), True)
    .add('courses', StringType(), True)
    .add('grade', IntegerType(), True)
    .add('year', IntegerType(), True)
)

In [3]:
# Read the pipe-delimited CSV with the explicit schema defined earlier.
# The file is read with header=False (first line treated as data), and that
# option is now stated exactly once instead of being passed both to options()
# and to load().
data = spark.read.csv("Dane/dane1.csv", schema=schema, sep="|", header=False)
data.printSchema()
data.show()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- courses: string (nullable = true)
 |-- grade: integer (nullable = true)
 |-- year: integer (nullable = true)

+----------+----------+-----------+-----+----+
|     fname|     lname|    courses|grade|year|
+----------+----------+-----------+-----+----+
|  Karolina|    Kozieł|     FIZYKA|    1|   7|
|  Weronika|    Kapłon|     FIZYKA|    1|   5|
|   Izabela|    Snażyk|INFORMATYKA|    1|   5|
|       Leo| Brockhuis|     FIZYKA|    2|   2|
|    Alicja|    Kawala|     FIZYKA|    1|   7|
|   Bartosz|    Piętka|INFORMATYKA|    1|   3|
|     Dawid|  Pietruch|     FIZYKA|    1|   5|
|     Piotr|  Kukiełka|     FIZYKA|    2|   2|
| Stanisław|      Król|     FIZYKA|    2|   2|
|Franciszek|Kramarczyk|INFORMATYKA|    1|   5|
|Aleksandra|    Popiel|     FIZMED|    2|   2|
|     Kamil|   Tomczyk|INFORMATYKA|    1|   7|
|    Hubert|     Mazur|INFORMATYKA|    1|   5|
| Tymoteusz|      Kruk|INFORMATYKA|    2|   2|
|    Ro

In [4]:
# Filtering: rows whose fname equals "Weronika", then the negated predicate.
is_weronika = data.fname == "Weronika"
data.where(is_weronika).show(truncate=False)
data.where(~is_weronika).show(truncate=False)

# Same equality filter, expressed through col() instead of attribute access.
data.where(col("fname") == "Weronika").show(truncate=False)

# Projection: one column, then two columns in the requested order.
data.select("lname").show(truncate=False)
data.select("lname", "fname").show(truncate=False)

# Row counts per (grade, year) pair: first as returned, then sorted by the keys.
counts_by_grade_year = data.groupBy("grade", "year").count()
counts_by_grade_year.show(truncate=False)
counts_by_grade_year.orderBy("grade", "year").show(truncate=False)

+--------+----------+-----------+-----+----+
|fname   |lname     |courses    |grade|year|
+--------+----------+-----------+-----+----+
|Weronika|Kapłon    |FIZYKA     |1    |5   |
|Weronika|Stanek    |FIZYKA     |1    |5   |
|Weronika|Wiszyńska |INFORMATYKA|1    |3   |
|Weronika|Mrozińska |FIZYKA     |2    |2   |
|Weronika|Szewczyk  |FIZMED     |1    |3   |
|Weronika|Szpytma   |INFORMATYKA|1    |7   |
|Weronika|Schabowicz|INFORMATYKA|1    |7   |
|Weronika|Pastuszka |FIZMED     |1    |7   |
|Weronika|Miszczak  |INFORMATYKA|1    |3   |
|Weronika|Tracz     |FIZMED     |1    |5   |
|Weronika|Ciurej    |INFORMATYKA|1    |3   |
+--------+----------+-----------+-----+----+

+----------+-----------+-----------+-----+----+
|fname     |lname      |courses    |grade|year|
+----------+-----------+-----------+-----+----+
|Karolina  |Kozieł     |FIZYKA     |1    |7   |
|Izabela   |Snażyk     |INFORMATYKA|1    |5   |
|Leo       |Brockhuis  |FIZYKA     |2    |2   |
|Alicja    |Kawala     |FIZYKA     |

In [5]:
# Register the DataFrame as the global temporary view "lista"; the queries
# below address it as global_temp.lista.
data.createOrReplaceGlobalTempView("lista")

# Full contents of the view.
all_rows = spark.sql("SELECT * from global_temp.lista")
all_rows.show()

# Row counts per (grade, year), sorted by the grouping keys.
counts_sorted = spark.sql("SELECT grade, year, count(*) from global_temp.lista group by grade, year order by grade, year")
counts_sorted.show()

+----------+----------+-----------+-----+----+
|     fname|     lname|    courses|grade|year|
+----------+----------+-----------+-----+----+
|  Karolina|    Kozieł|     FIZYKA|    1|   7|
|  Weronika|    Kapłon|     FIZYKA|    1|   5|
|   Izabela|    Snażyk|INFORMATYKA|    1|   5|
|       Leo| Brockhuis|     FIZYKA|    2|   2|
|    Alicja|    Kawala|     FIZYKA|    1|   7|
|   Bartosz|    Piętka|INFORMATYKA|    1|   3|
|     Dawid|  Pietruch|     FIZYKA|    1|   5|
|     Piotr|  Kukiełka|     FIZYKA|    2|   2|
| Stanisław|      Król|     FIZYKA|    2|   2|
|Franciszek|Kramarczyk|INFORMATYKA|    1|   5|
|Aleksandra|    Popiel|     FIZMED|    2|   2|
|     Kamil|   Tomczyk|INFORMATYKA|    1|   7|
|    Hubert|     Mazur|INFORMATYKA|    1|   5|
| Tymoteusz|      Kruk|INFORMATYKA|    2|   2|
|    Robert|     Gałat|INFORMATYKA|    2|   2|
|    Patryk|     Śledź|INFORMATYKA|    1|   3|
|   Jadwiga|     Bizoń|     FIZMED|    1|   3|
|     Rafał| Tyczyński|     FIZMED|    1|   7|
|    Joanna| 

In [6]:
# Row counts per (grade, year) restricted to grade = 1, largest count first.
grade_one_counts = spark.sql("SELECT grade, year, count(*) as count from global_temp.lista where grade=1 group by grade, year order by count DESC")
grade_one_counts.show()

# Total number of rows in the view.
total_rows = spark.sql("SELECT count(*) as count from global_temp.lista")
total_rows.show()

# Number of fname groups; left as the final bare expression so the notebook
# displays the resulting integer.
fname_groups = spark.sql("SELECT fname from global_temp.lista group by fname")
fname_groups.count()

+-----+----+-----+
|grade|year|count|
+-----+----+-----+
|    1|   3|  203|
|    1|   5|  139|
|    1|   7|  138|
+-----+----+-----+

+-----+
|count|
+-----+
|  576|
+-----+



136