In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, row_number, max
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.window import Window
from pathlib import Path

In [2]:
conf = SparkConf().setAppName("tsv-file-import-example")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [3]:
def shape(df):
    return (df.count(), len(df.columns))

#### Carregando arquivo exemplo IMDb

In [4]:
%%time
! hadoop fs -put ../datasets/imdb/title.akas.tsv

put: `title.akas.tsv': File exists
CPU times: user 68.9 ms, sys: 35.8 ms, total: 105 ms
Wall time: 2.82 s


In [5]:
! hadoop fs -ls

Found 3 items
drwxr-xr-x   - root supergroup          0 2023-04-04 00:30 .sparkStaging
-rw-r--r--   2 root supergroup  311498305 2023-04-04 00:24 artist
-rw-r--r--   2 root supergroup 1763456390 2023-04-03 23:33 title.akas.tsv


In [6]:
df_title = spark.read.format(
    "csv"
).option(
    "header", "true"
).option(
    "delimiter", "\t"
).load(
    "title.akas.tsv"
)

In [7]:
shape(df_title)

(35514712, 8)

In [8]:
df_title_br = df_title.filter("region = 'BR'")

In [9]:
shape(df_title_br)

(117095, 8)

In [10]:
df_title_br.collect()[0]

Row(titleId='tt0000010', ordering='2', title='A Saída dos Operários da Fábrica Lumière', region='BR', language='\\N', types='\\N', attributes='\\N', isOriginalTitle='0')

#### Carregando arquivo exemplo MusicBrainz

In [27]:
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("gid", StringType(), True),
    StructField("name", StringType(), True),
    StructField("sort_name", StringType(), True),
    StructField("begin_date_year", IntegerType(), True),
    StructField("begin_date_month", IntegerType(), True),
    StructField("begin_date_day", IntegerType(), True),
    StructField("end_date_year", IntegerType(), True),
    StructField("end_date_month", IntegerType(), True),
    StructField("end_date_day", IntegerType(), True),
    StructField("type", StringType(), True),
    StructField("area", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("comment", StringType(), True),
    StructField("edits_pending", StringType(), True),
    StructField("last_updated", StringType(), True),
    StructField("ended", StringType(), True),
])

In [28]:
%%time
! hadoop fs -put ../datasets/music-brainz/mbdump/artist

put: `artist': File exists
CPU times: user 30.1 ms, sys: 43.2 ms, total: 73.3 ms
Wall time: 2.4 s


In [29]:
df_artist = spark.read.format(
    "csv"
).option(
    "header", "false"
).schema(
    schema
).option(
    "delimiter", "\t"
).load(
    "artist"
)

In [30]:
shape(df_artist)

(2142744, 17)

In [31]:
df_artist.select(
    "id", "name", "sort_name", "begin_date_year", "area", "gender"
).filter(
    "begin_date_year >= 2000"
).show(50)

+-------+--------------------+--------------------+---------------+-----+------+
|     id|                name|           sort_name|begin_date_year| area|gender|
+-------+--------------------+--------------------+---------------+-----+------+
|2133822|              Arcada|              Arcada|           2012|  167|    \N|
| 886710|        The Trippers|       Trippers, The|           2000|  221|    \N|
|1007384|          Limbo Kids|          Limbo Kids|           2012|  221|    \N|
|1805018|  Give Him 6 Podcast|  Give Him 6 Podcast|           2017|  303|     4|
|1509774|               JANAJ|               JANAJ|           2014|80913|    \N|
| 889105|             The Kik|            Kik, The|           2011|  150|    \N|
|2447568|              Dargor|              Dargor|           2018|  170|    \N|
|1335578|             Dendera|             Dendera|           2008|  221|    \N|
| 883032|      Broken & Burnt|      Broken & Burnt|           2011|   30|    \N|
|1808035|      Conwaythewhal