### Importando e inicializando spark

In [15]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, regexp_replace, trim, col
from pyspark.sql import functions as f
from pyspark.sql.types import DoubleType, StringType
from datetime import datetime

In [2]:
spark = SparkSession.builder.master("local[*]").getOrCreate()

23/07/26 18:09:45 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.1.68 instead (on interface wlp0s20f3)
23/07/26 18:09:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/26 18:09:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark

### Criando Dataframe movies

In [16]:
movies = spark.read.csv(
    "/home/mariannabaldezgomes/Dev/Pessoal/dataset-reviw-movies/dataset/movies.csv",
    header=True,
    sep=",",
    encoding="latin1",
    inferSchema=True
)


In [5]:
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

### Renomeando colunas dataframe

In [17]:
rename_movies_columns = {'movieId': 'id', 'title': 'titulo', 'genres':'genero'}

for name, rename in rename_movies_columns.items():
    movies = movies.withColumnRenamed(name, rename)

In [7]:
movies.show()

+---+--------------------+--------------------+
| id|              titulo|              genero|
+---+--------------------+--------------------+
|  1|    Toy Story (1995)|Adventure|Animati...|
|  2|      Jumanji (1995)|Adventure|Childre...|
|  3|Grumpier Old Men ...|      Comedy|Romance|
|  4|Waiting to Exhale...|Comedy|Drama|Romance|
|  5|Father of the Bri...|              Comedy|
|  6|         Heat (1995)|Action|Crime|Thri...|
|  7|      Sabrina (1995)|      Comedy|Romance|
|  8| Tom and Huck (1995)|  Adventure|Children|
|  9| Sudden Death (1995)|              Action|
| 10|    GoldenEye (1995)|Action|Adventure|...|
| 11|American Presiden...|Comedy|Drama|Romance|
| 12|Dracula: Dead and...|       Comedy|Horror|
| 13|        Balto (1995)|Adventure|Animati...|
| 14|        Nixon (1995)|               Drama|
| 15|Cutthroat Island ...|Action|Adventure|...|
| 16|       Casino (1995)|         Crime|Drama|
| 17|Sense and Sensibi...|       Drama|Romance|
| 18|   Four Rooms (1995)|              

### Comandos dataframe viewer

In [8]:
movies.count() # retorna quantidade de linhas do dataframe

9742

In [9]:
movies.limit(5).toPandas() # retorna primeiras 5(ou quanto or definido) linhas do dataframe formatadas em pandas

Unnamed: 0,id,titulo,genero
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
movies.printSchema() # Retorna type das colunas dos dataframes

root
 |-- id: integer (nullable = true)
 |-- titulo: string (nullable = true)
 |-- genero: string (nullable = true)



### Separando ano de lançamento em uma coluna (antes ficava no conteudo juntamente ao nome do filme na coluna titulo)

In [18]:
regex_pattern = r"\((\d{4})\)"

In [19]:
movies = movies.withColumn('ano de lancamento', regexp_extract('titulo', regex_pattern, 1))

In [20]:
movies = movies.withColumn('titulo', regexp_replace('titulo', regex_pattern, ''))
# removendo espaços em branco antes ou depois do título
movies = movies.withColumn('titulo', trim('titulo'))

#### Formata ano de lancamento dos filmes de string para datetime

In [13]:
movies = movies.withColumn(
    'ano de lancamento',
    f.to_date(movies['ano de lancamento'].cast(StringType()), "yyy"))
movies.select('ano de lancamento').show(5)

+-----------------+
|ano de lancamento|
+-----------------+
|       1995-01-01|
|       1995-01-01|
|       1995-01-01|
|       1995-01-01|
|       1995-01-01|
+-----------------+
only showing top 5 rows



### Tranforma tipo de coluna ano de lancamento de string para int

In [23]:
movies = movies.withColumn(
    'ano de lancamento',
    col('ano de lancamento').cast('int'))
movies.select('ano de lancamento').show(5)

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'Column'

### Selecionando informações

In [22]:
movies.select('*').show(5, False) # False como parametro no método show mostra a o conteudo completo, ajustando a largura das colunas

+---+---------------------------+-------------------------------------------+-----------------+
|id |titulo                     |genero                                     |ano de lancamento|
+---+---------------------------+-------------------------------------------+-----------------+
|1  |Toy Story                  |Adventure|Animation|Children|Comedy|Fantasy|1995             |
|2  |Jumanji                    |Adventure|Children|Fantasy                 |1995             |
|3  |Grumpier Old Men           |Comedy|Romance                             |1995             |
|4  |Waiting to Exhale          |Comedy|Drama|Romance                       |1995             |
|5  |Father of the Bride Part II|Comedy                                     |1995             |
+---+---------------------------+-------------------------------------------+-----------------+
only showing top 5 rows



In [16]:
movies.select('titulo', 'genero').show(5) # Visualizando colunas especificas

+--------------------+--------------------+
|              titulo|              genero|
+--------------------+--------------------+
|           Toy Story|Adventure|Animati...|
|             Jumanji|Adventure|Childre...|
|    Grumpier Old Men|      Comedy|Romance|
|   Waiting to Exhale|Comedy|Drama|Romance|
|Father of the Bri...|              Comedy|
+--------------------+--------------------+
only showing top 5 rows



### Identificando valores nulos

In [21]:
movies.select([f.count(f.when(f.isnull(c), 1)).alias(c) for c in movies.columns]).show()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/mariannabaldezgomes/Dev/Pessoal/dataset-reviw-movies/venv/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mariannabaldezgomes/Dev/Pessoal/dataset-reviw-movies/venv/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

### Ordenando os dados

In [14]:
movies.select('titulo', f.year('ano de lancamento')).orderBy('ano de lancamento', acending=False).show(10, False)

+---------------------------------------------------+-----------------------+
|titulo                                             |year(ano de lancamento)|
+---------------------------------------------------+-----------------------+
|The Adventures of Sherlock Holmes and Doctor Watson|null                   |
|Babylon 5                                          |null                   |
|Ready Player One                                   |null                   |
|Hyena Road                                         |null                   |
|Nocturnal Animals                                  |null                   |
|Paterson                                           |null                   |
|Moonlight                                          |null                   |
|The OA                                             |null                   |
|Cosmos                                             |null                   |
|Maria Bamford: Old Baby                            |null       