In [None]:
# Uncomment the next line to install PySpark if it is not already available in this environment.
# !pip install pyspark

#Criando a sessão do SparkContext e SparkSession

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
# Obtain the SparkContext: reuse the one already running, or start a new one if none exists.
sc = SparkContext.getOrCreate()

In [4]:
# Build the SparkSession used for DataFrame operations (or reuse an existing one),
# naming the application so it can be identified.
spark = (
    SparkSession.builder
    .appName('PySpark Dataframe from RDD')
    .getOrCreate()
)

#Criando PySpark DataFrame de um RDD existente

In [5]:
# Build an RDD of 5-field tuples split across 4 partitions; the first field of
# each tuple is a single-character label and the remaining four are integers.
rdd = sc.parallelize(
    [
        ('C', 85, 76, 87, 91),
        ('B', 85, 76, 87, 91),
        ('A', 85, 78, 96, 92),
        ('A', 92, 76, 89, 96),
    ],
    numSlices=4,
)

In [10]:
# Confirm that `rdd` is an RDD object before converting it to a DataFrame.
print(type(rdd))

<class 'pyspark.rdd.RDD'>


In [7]:
# Column names: one identifier column followed by value_1 .. value_4.
sub = ['id_person'] + [f'value_{idx}' for idx in range(1, 5)]

In [8]:
# Convert the RDD into a DataFrame, using the names in `sub` as column names.
# Only names are supplied, so Spark infers each column's type from the data.
marks_df = spark.createDataFrame(rdd, schema=sub)

In [9]:
# Confirm that the conversion produced a DataFrame object.
print(type(marks_df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [11]:
# Print each column's name, inferred type and nullability.
marks_df.printSchema()

root
 |-- id_person: string (nullable = true)
 |-- value_1: long (nullable = true)
 |-- value_2: long (nullable = true)
 |-- value_3: long (nullable = true)
 |-- value_4: long (nullable = true)



In [12]:
# Display the DataFrame contents as a text table.
marks_df.show()

+---------+-------+-------+-------+-------+
|id_person|value_1|value_2|value_3|value_4|
+---------+-------+-------+-------+-------+
|        C|     85|     76|     87|     91|
|        B|     85|     76|     87|     91|
|        A|     85|     78|     96|     92|
|        A|     92|     76|     89|     96|
+---------+-------+-------+-------+-------+



# Criando e manipulando dados em df pyspark

In [15]:
import pyspark
from pyspark.sql import SparkSession
# NOTE(review): getOrCreate() returns an already-running session when one exists,
# so if a session was created earlier in this kernel it is presumably reused here
# and the 'pysparkdf' app name may not take effect — confirm if the name matters.
spark = SparkSession.builder.appName('pysparkdf').getOrCreate()

#importando dados

In [17]:
# Load the cereal CSV: comma-separated, first row used as the header, and column
# types inferred from the data.
# NOTE(review): the path is an absolute, environment-specific location; consider
# moving it to a configurable constant.
df = spark.read.csv(
    '/content/drive/MyDrive/FIAP/Fase 03/cereal.csv',
    sep=',',
    inferSchema=True,
    header=True,
)

#Lendo o Schema

In [18]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- mfr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- calories: integer (nullable = true)
 |-- protein: integer (nullable = true)
 |-- fat: integer (nullable = true)
 |-- sodium: integer (nullable = true)
 |-- fiber: double (nullable = true)
 |-- carbo: double (nullable = true)
 |-- sugars: integer (nullable = true)
 |-- potass: integer (nullable = true)
 |-- vitamins: integer (nullable = true)
 |-- shelf: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- cups: double (nullable = true)
 |-- rating: double (nullable = true)



#Select()

In [19]:
# Project only the `name`, `mfr` and `rating` columns and display the result.
df.select('name','mfr', 'rating').show()

+--------------------+---+---------+
|                name|mfr|   rating|
+--------------------+---+---------+
|           100% Bran|  N|68.402973|
|   100% Natural Bran|  Q|33.983679|
|            All-Bran|  K|59.425505|
|All-Bran with Ext...|  K|93.704912|
|      Almond Delight|  R|34.384843|
|Apple Cinnamon Ch...|  G|29.509541|
|         Apple Jacks|  K|33.174094|
|             Basic 4|  G|37.038562|
|           Bran Chex|  R|49.120253|
|         Bran Flakes|  P|53.313813|
|        Cap'n'Crunch|  Q|18.042851|
|            Cheerios|  G|50.764999|
|Cinnamon Toast Cr...|  G|19.823573|
|            Clusters|  G|40.400208|
|         Cocoa Puffs|  G|22.736446|
|           Corn Chex|  R|41.445019|
|         Corn Flakes|  K|45.863324|
|           Corn Pops|  K|35.782791|
|       Count Chocula|  G|22.396513|
|  Cracklin' Oat Bran|  K|40.448772|
+--------------------+---+---------+
only showing top 20 rows



# withColumn()

In [20]:
# Derive a column named 'Calories' from `calories` cast to integer and print the
# resulting schema. `df` itself is not reassigned, so the original frame is
# left unchanged for the following cells.
(
    df
    .withColumn('Calories', df.calories.cast('Integer'))
    .printSchema()
)

root
 |-- name: string (nullable = true)
 |-- mfr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- Calories: integer (nullable = true)
 |-- protein: integer (nullable = true)
 |-- fat: integer (nullable = true)
 |-- sodium: integer (nullable = true)
 |-- fiber: double (nullable = true)
 |-- carbo: double (nullable = true)
 |-- sugars: integer (nullable = true)
 |-- potass: integer (nullable = true)
 |-- vitamins: integer (nullable = true)
 |-- shelf: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- cups: double (nullable = true)
 |-- rating: double (nullable = true)



# groupBy

In [21]:
# Count how many rows share each (name, calories) combination and display
# the per-group counts.
(
    df
    .groupBy('name', 'calories')
    .count()
    .show()
)

+--------------------+--------+-----+
|                name|calories|count|
+--------------------+--------+-----+
|Just Right Fruit ...|     140|    1|
|         Raisin Bran|     120|    1|
|Shredded Wheat sp...|      90|    1|
|           Corn Pops|     110|    1|
|  Honey Nut Cheerios|     110|    1|
|Muesli Raisins; D...|     150|    1|
|      Fruity Pebbles|     110|    1|
|           100% Bran|      70|    1|
|       Fruitful Bran|     120|    1|
|         Puffed Rice|      50|    1|
|      Raisin Squares|      90|    1|
|   Total Raisin Bran|     140|    1|
|      Golden Grahams|     110|    1|
|   Nutri-grain Wheat|      90|    1|
|   100% Natural Bran|     120|    1|
|Apple Cinnamon Ch...|     110|    1|
|Mueslix Crispy Blend|     160|    1|
|Shredded Wheat 'n...|      90|    1|
|              Smacks|     110|    1|
|      Quaker Oatmeal|     100|    1|
+--------------------+--------+-----+
only showing top 20 rows



# orderBy()

In [22]:
# Sort the rows by `protein` (ascending is the default) and display the result.
df.orderBy('protein').show()

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|        Cap'n'Crunch|  Q|   C|     120|      1|  2|   220|  0.0| 12.0|    12|    35|      25|    2|   1.0|0.75|18.042851|
|Cinnamon Toast Cr...|  G|   C|     120|      1|  3|   210|  0.0| 13.0|     9|    45|      25|    2|   1.0|0.75|19.823573|
|         Cocoa Puffs|  G|   C|     110|      1|  1|   180|  0.0| 12.0|    13|    55|      25|    2|   1.0| 1.0|22.736446|
|           Corn Pops|  K|   C|     110|      1|  0|    90|  1.0| 13.0|    12|    20|      25|    2|   1.0| 1.0|35.782791|
|       Count Chocula|  G|   C|     110|      1|  1|   180|  0.0| 12.0|    13|    65|      25|    2|   1.0| 1.0|22.396513|
|      Frosted F

#Case When

In [23]:
from pyspark.sql.functions import when

In [27]:
# Label cereals whose `vitamins` value is at least 25 as 'rich in vitamins'.
# No `otherwise(...)` branch is given, so rows below the threshold yield NULL.
# The threshold is an integer literal to match the integer `vitamins` column,
# rather than relying on Spark implicitly coercing a string to a number.
df.select('name', 'vitamins', when(df.vitamins >= 25, 'rich in vitamins')).show()

+--------------------+--------+----------------------------------------------------+
|                name|vitamins|CASE WHEN (vitamins >= 25) THEN rich in vitamins END|
+--------------------+--------+----------------------------------------------------+
|           100% Bran|      25|                                    rich in vitamins|
|   100% Natural Bran|       0|                                                NULL|
|            All-Bran|      25|                                    rich in vitamins|
|All-Bran with Ext...|      25|                                    rich in vitamins|
|      Almond Delight|      25|                                    rich in vitamins|
|Apple Cinnamon Ch...|      25|                                    rich in vitamins|
|         Apple Jacks|      25|                                    rich in vitamins|
|             Basic 4|      25|                                    rich in vitamins|
|           Bran Chex|      25|                                  

#filter()

In [28]:
# Keep only the cereals with exactly 100 calories.
# The comparison uses an integer literal to match the integer `calories` column,
# rather than relying on Spark implicitly coercing a string to a number.
df.filter(df.calories == 100).show()

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|         Corn Flakes|  K|   C|     100|      2|  0|   290|  1.0| 21.0|     2|    35|      25|    1|   1.0| 1.0|45.863324|
|Cream of Wheat (Q...|  N|   H|     100|      3|  0|    80|  1.0| 21.0|     0|    -1|       0|    2|   1.0| 1.0|64.533816|
|Crispy Wheat & Ra...|  G|   C|     100|      2|  1|   140|  2.0| 11.0|    10|   120|      25|    3|   1.0|0.75|36.176196|
|         Double Chex|  R|   C|     100|      2|  0|   190|  1.0| 18.0|     5|    80|      25|    3|   1.0|0.75|44.330856|
| Frosted Mini-Wheats|  K|   C|     100|      3|  0|     0|  3.0| 14.0|     7|   100|      25|    2|   1.0| 0.8|58.345141|
|        Golden 

#isNull() e isNotNull()

In [30]:
from pyspark.sql.functions import *

In [31]:
# Keep only the rows whose `name` column is not null and display them.
df.filter(df.name.isNotNull()).show()

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|           100% Bran|  N|   C|      70|      4|  1|   130| 10.0|  5.0|     6|   280|      25|    3|   1.0|0.33|68.402973|
|   100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|            All-Bran|  K|   C|      70|      4|  1|   260|  9.0|  7.0|     5|   320|      25|    3|   1.0|0.33|59.425505|
|All-Bran with Ext...|  K|   C|      50|      4|  0|   140| 14.0|  8.0|     0|   330|      25|    3|   1.0| 0.5|93.704912|
|      Almond Delight|  R|   C|     110|      2|  2|   200|  1.0| 14.0|     8|    -1|      25|    3|   1.0|0.75|34.384843|
|Apple Cinnamon 

#