# Apache Avro

In [1]:
!pip install avro-python3

Collecting avro-python3
  Downloading avro-python3-1.10.2.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: avro-python3
  Building wheel for avro-python3 (setup.py) ... [?25l[?25hdone
  Created wheel for avro-python3: filename=avro_python3-1.10.2-py3-none-any.whl size=43991 sha256=912006b758394e04c7ecab11f27bb2489e1327980f56d771eadcaab93640cf95
  Stored in directory: /root/.cache/pip/wheels/bc/85/62/6cdd81c56f923946b401cecff38055b94c9b766927f7d8ca82
Successfully built avro-python3
Installing collected packages: avro-python3
Successfully installed avro-python3-1.10.2


In [2]:
import copy
import json
import avro
from avro.io import DatumWriter, DatumReader
from avro.datafile import DataFileWriter, DataFileReader

In [3]:
# Avro record schema for a user with a string `name` and an int `age`.
user_fields = [
    {'name': 'name', 'type': 'string'},
    {'name': 'age', 'type': 'int'},
]
schema = {'name': 'avro.example.User', 'type': 'record', 'fields': user_fields}

# Parse the JSON form of the schema and show its string representation.
schema_parsed = avro.schema.Parse(json.dumps(schema))
str(schema_parsed)

'{"type": "record", "name": "User", "namespace": "avro.example", "fields": [{"type": "string", "name": "name"}, {"type": "int", "name": "age"}]}'

In [4]:
# Serialize two user records into users.avro using the parsed schema.
initial_users = (
    {'name': 'Pierre-Simon Laplace', 'age': 77},
    {'name': 'John von Neumann', 'age': 53},
)
with open('users.avro', 'wb') as out_file:
    writer = DataFileWriter(out_file, DatumWriter(), schema_parsed)
    for record in initial_users:
        writer.append(record)
    writer.close()


In [5]:
# Read users.avro back: copy the header metadata, decode the schema stored
# under 'avro.schema', and collect every record into a list.
with open('users.avro', 'rb') as in_file:
    reader = DataFileReader(in_file, DatumReader())
    metadata = copy.deepcopy(reader.meta)
    schema_from_file = json.loads(metadata['avro.schema'])
    users = list(reader)
    reader.close()

# Show the three views of the schema next to the decoded records.
print(f'Schema that we specified:\n {schema}')
print(f'Schema that we parsed:\n {schema_parsed}')
print(f'Schema from users.avro file:\n {schema_from_file}')
print(f'Users:\n {users}')

Schema that we specified:
 {'name': 'avro.example.User', 'type': 'record', 'fields': [{'name': 'name', 'type': 'string'}, {'name': 'age', 'type': 'int'}]}
Schema that we parsed:
 {"type": "record", "name": "User", "namespace": "avro.example", "fields": [{"type": "string", "name": "name"}, {"type": "int", "name": "age"}]}
Schema from users.avro file:
 {'type': 'record', 'name': 'User', 'namespace': 'avro.example', 'fields': [{'type': 'string', 'name': 'name'}, {'type': 'int', 'name': 'age'}]}
Users:
 [{'name': 'Pierre-Simon Laplace', 'age': 77}, {'name': 'John von Neumann', 'age': 53}]


In [6]:
# Evolved schema: the original User record plus an int `favoriteNumber` field.
schema_new = {
    'name': 'avro.example.User',
    'type': 'record',
    'fields': [
        {'name': 'name', 'type': 'string'},
        {'name': 'age', 'type': 'int'},
        {'name': 'favoriteNumber', 'type': 'int'},
    ],
}

# Records that carry the additional field.
users_with_favorites = (
    {'name': 'Pierre-Simon Laplace', 'age': 77, 'favoriteNumber': 1337},
    {'name': 'John von Neumann', 'age': 53, 'favoriteNumber': 228},
)

with open('users_new.avro', 'wb') as out_file:
    writer = DataFileWriter(out_file, DatumWriter(), avro.schema.Parse(json.dumps(schema_new)))
    for record in users_with_favorites:
        writer.append(record)
    writer.close()

In [7]:
# Read the file written with the evolved schema while requesting records in
# the original two-field shape (schema_parsed is passed as the reader schema).
# NOTE(review): schema_parsed is also passed as the writer schema; the
# schema embedded in the file presumably takes precedence - confirm against
# avro.datafile.DataFileReader.
with open('users_new.avro', 'rb') as f:
    reader = DataFileReader(f, DatumReader(schema_parsed, schema_parsed))
    metadata = copy.deepcopy(reader.meta)
    schema_from_file = json.loads(metadata['avro.schema'])
    users = list(reader)
    reader.close()

print(f'Schema that we specified:\n {schema}')
print(f'Schema that we parsed:\n {schema_parsed}')
# The label names the file this cell actually reads (it previously said
# users.avro).
print(f'Schema from users_new.avro file:\n {schema_from_file}')
print(f'Users:\n {users}')

Schema that we specified:
 {'name': 'avro.example.User', 'type': 'record', 'fields': [{'name': 'name', 'type': 'string'}, {'name': 'age', 'type': 'int'}]}
Schema that we parsed:
 {"type": "record", "name": "User", "namespace": "avro.example", "fields": [{"type": "string", "name": "name"}, {"type": "int", "name": "age"}]}
Schema from users.avro file:
 {'type': 'record', 'name': 'User', 'namespace': 'avro.example', 'fields': [{'type': 'string', 'name': 'name'}, {'type': 'int', 'name': 'age'}, {'type': 'int', 'name': 'favoriteNumber'}]}
Users:
 [{'name': 'Pierre-Simon Laplace', 'age': 77}, {'name': 'John von Neumann', 'age': 53}]


In [8]:
# Incompatible variant of the schema: `age` is declared as a string here,
# whereas the original schema declares it as an int.
schema_new = {
    'name': 'avro.example.User',
    'type': 'record',
    'fields': [
        {'name': 'name', 'type': 'string'},
        {'name': 'age', 'type': 'string'},
        {'name': 'favoriteNumber', 'type': 'int'},
    ],
}

# Same people as before, with `age` given as text to satisfy this schema.
users_with_text_age = (
    {'name': 'Pierre-Simon Laplace', 'age': '77', 'favoriteNumber': 1337},
    {'name': 'John von Neumann', 'age': '53', 'favoriteNumber': 228},
)

with open('users_corrupted.avro', 'wb') as out_file:
    writer = DataFileWriter(out_file, DatumWriter(), avro.schema.Parse(json.dumps(schema_new)))
    for record in users_with_text_age:
        writer.append(record)
    writer.close()

In [9]:
# Reading users_corrupted.avro with the original schema is the failure being
# demonstrated: the file declares `age` as a string while schema_parsed
# declares it as an int. The exception is caught and displayed so the
# notebook still runs top to bottom.
try:
    with open('users_corrupted.avro', 'rb') as f:
        reader = DataFileReader(f, DatumReader(schema_parsed, schema_parsed))
        metadata = copy.deepcopy(reader.meta)
        schema_from_file = json.loads(metadata['avro.schema'])
        users = list(reader)
        reader.close()
except avro.io.SchemaResolutionException as error:
    print(f'SchemaResolutionException: {error}')
else:
    # Reached only if the schemas turn out to be compatible after all.
    print(f'Schema that we specified:\n {schema}')
    print(f'Schema that we parsed:\n {schema_parsed}')
    print(f'Schema from users_corrupted.avro file:\n {schema_from_file}')
    print(f'Users:\n {users}')

SchemaResolutionException: ignored

In [10]:
# Read users_corrupted.avro without supplying any schema to DatumReader, so
# records are decoded using only the schema stored in the file.
with open('users_corrupted.avro', 'rb') as f:
    reader = DataFileReader(f, DatumReader())
    metadata = copy.deepcopy(reader.meta)
    schema_from_file = json.loads(metadata['avro.schema'])
    users = list(reader)
    reader.close()

print(f'Schema that we specified:\n {schema}')
print(f'Schema that we parsed:\n {schema_parsed}')
# The label names the file this cell actually reads (it previously said
# users.avro).
print(f'Schema from users_corrupted.avro file:\n {schema_from_file}')
print(f'Users:\n {users}')

Schema that we specified:
 {'name': 'avro.example.User', 'type': 'record', 'fields': [{'name': 'name', 'type': 'string'}, {'name': 'age', 'type': 'int'}]}
Schema that we parsed:
 {"type": "record", "name": "User", "namespace": "avro.example", "fields": [{"type": "string", "name": "name"}, {"type": "int", "name": "age"}]}
Schema from users.avro file:
 {'type': 'record', 'name': 'User', 'namespace': 'avro.example', 'fields': [{'type': 'string', 'name': 'name'}, {'type': 'string', 'name': 'age'}, {'type': 'int', 'name': 'favoriteNumber'}]}
Users:
 [{'name': 'Pierre-Simon Laplace', 'age': '77', 'favoriteNumber': 1337}, {'name': 'John von Neumann', 'age': '53', 'favoriteNumber': 228}]


---

# Apache Parquet

In [13]:
!pip install pyarrow



In [42]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [43]:
# Arrow schema for weather measurements: city name, millisecond-resolution
# timestamp, and two 32-bit float readings.
weather_fields = [
    pa.field('city', pa.string()),
    pa.field('measurement_time', pa.timestamp('ms')),
    pa.field('temperature', pa.float32()),
    pa.field('atmospheric_pressure', pa.float32()),
]
weather_schema = pa.schema(weather_fields)

In [44]:
from datetime import datetime

In [45]:
# Three hourly measurements taken on 2022-05-01, one per city.
measurement_times = [datetime(2022, 5, 1, hour, 0, 0) for hour in (12, 13, 14)]

df = pd.DataFrame({
    'city': ['New York', 'London', 'Tokyo'],
    'measurement_time': measurement_times,
    'temperature': [20.5, 15.2, 23.1],
    'atmospheric_pressure': [101.5, 99.2, 100.1],
})

# Convert to an Arrow table using weather_schema, then write a single
# Parquet file.
table = pa.Table.from_pandas(df, schema=weather_schema)
pq.write_table(table, 'weather_data.parquet')

In [46]:
# Root directory of the partitioned Parquet dataset.
output_path = 'weather_partitioned_data'

# Write the table as a dataset partitioned by the `city` column.
partition_columns = ['city']
pq.write_to_dataset(table, root_path=output_path, partition_cols=partition_columns)

---

# Compressing a dataset

Dataset: EA Sports FC 24 complete player dataset (Kaggle) — https://www.kaggle.com/datasets/stefanoleone992/ea-sports-fc-24-complete-player-dataset/

In [11]:
!unzip archive.zip

Archive:  archive.zip
   creating: archive/
  inflating: __MACOSX/._archive      
  inflating: archive/female_teams.csv  
  inflating: __MACOSX/archive/._female_teams.csv  
  inflating: archive/male_teams.csv  
  inflating: __MACOSX/archive/._male_teams.csv  
  inflating: archive/female_players.csv  
  inflating: __MACOSX/archive/._female_players.csv  
  inflating: archive/male_coaches.csv  
  inflating: __MACOSX/archive/._male_coaches.csv  
  inflating: archive/female_coaches.csv  
  inflating: __MACOSX/archive/._female_coaches.csv  
  inflating: archive/male_players.csv  
  inflating: __MACOSX/archive/._male_players.csv  


In [47]:
import pandas as pd

# Load the male players table from the extracted archive and display it.
male_players_csv = 'archive/male_players.csv'
big_df = pd.read_csv(male_players_csv)
big_df

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,update_as_of,short_name,long_name,player_positions,overall,potential,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,2,https://sofifa.com/player/2/giovanni-pasquale/...,15.0,2.0,2014-09-18,G. Pasquale,Giovanni Pasquale,LM,68,68,...,69-1,69-1,69-1,71-3,72-4,70-2,70-2,70-2,72-4,13
1,11,https://sofifa.com/player/11/romain-rocchi/150002,15.0,2.0,2014-09-18,R. Rocchi,Romain Rocchi,"CM, CDM",68,68,...,65,65,65,62,60,62,62,62,60,12
2,27,https://sofifa.com/player/27/joe-cole/150002,15.0,2.0,2014-09-18,J. Cole,Joe Cole,"RM, CAM",74,74,...,59,59,59,56,50,48,48,48,50,12
3,28,https://sofifa.com/player/28/manuel-herrera-ya...,15.0,2.0,2014-09-18,Manu Herrera,Manuel Herrera Yagüe,GK,76,76,...,32,32,32,31,31,34,34,34,31,76
4,41,https://sofifa.com/player/41/andres-iniesta-lu...,15.0,2.0,2014-09-18,Iniesta,Andrés Iniesta Luján,"CM, LW",89,89,...,77+3,77+3,77+3,76+3,70+3,63+3,63+3,63+3,70+3,14+3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180016,278132,/player/278132/rakan-al-kaabi/240002,24.0,2.0,2023-09-22,R. Al Kaabi,Rakan Al Kaabi,"CDM, CM",54,66,...,53+2,53+2,53+2,53+2,54+2,54+2,54+2,54+2,54+2,13+2
180017,278138,/player/278138/josh-keyes/240002,24.0,2.0,2023-09-22,J. Keyes,Josh Keyes,CM,51,65,...,48+2,48+2,48+2,49+2,48+2,44+2,44+2,44+2,48+2,14+2
180018,278139,/player/278139/rodrigo-frutos/240002,24.0,2.0,2023-09-22,R. Frutos,Rodrigo Frutos,GK,58,66,...,29+2,29+2,29+2,24+2,23+2,24+2,24+2,24+2,23+2,57+2
180019,278141,/player/278141/christian-bos/240002,24.0,2.0,2023-09-22,C. Bos,Christian Bos,RB,55,70,...,49+2,49+2,49+2,51+2,53+2,52+2,52+2,52+2,53+2,13+2


In [48]:
# Narrow the frame to the ten columns used in the format comparison below.
selected_columns = [
    'player_id', 'player_url', 'fifa_version',
    'fifa_update', 'update_as_of', 'short_name',
    'long_name', 'player_positions', 'overall',
    'potential',
]
big_df = big_df.loc[:, selected_columns]
big_df

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,update_as_of,short_name,long_name,player_positions,overall,potential
0,2,https://sofifa.com/player/2/giovanni-pasquale/...,15.0,2.0,2014-09-18,G. Pasquale,Giovanni Pasquale,LM,68,68
1,11,https://sofifa.com/player/11/romain-rocchi/150002,15.0,2.0,2014-09-18,R. Rocchi,Romain Rocchi,"CM, CDM",68,68
2,27,https://sofifa.com/player/27/joe-cole/150002,15.0,2.0,2014-09-18,J. Cole,Joe Cole,"RM, CAM",74,74
3,28,https://sofifa.com/player/28/manuel-herrera-ya...,15.0,2.0,2014-09-18,Manu Herrera,Manuel Herrera Yagüe,GK,76,76
4,41,https://sofifa.com/player/41/andres-iniesta-lu...,15.0,2.0,2014-09-18,Iniesta,Andrés Iniesta Luján,"CM, LW",89,89
...,...,...,...,...,...,...,...,...,...,...
180016,278132,/player/278132/rakan-al-kaabi/240002,24.0,2.0,2023-09-22,R. Al Kaabi,Rakan Al Kaabi,"CDM, CM",54,66
180017,278138,/player/278138/josh-keyes/240002,24.0,2.0,2023-09-22,J. Keyes,Josh Keyes,CM,51,65
180018,278139,/player/278139/rodrigo-frutos/240002,24.0,2.0,2023-09-22,R. Frutos,Rodrigo Frutos,GK,58,66
180019,278141,/player/278141/christian-bos/240002,24.0,2.0,2023-09-22,C. Bos,Christian Bos,RB,55,70


In [52]:
# Export the frame as CSV without the index column.
big_df.to_csv(path_or_buf='big_df.csv', index=False)

In [53]:
# Byte count of the CSV export.
!wc -c big_df.csv

22145807 big_df.csv


In [50]:
# Export the frame as JSON using pandas' default orientation.
big_df.to_json(path_or_buf='big_df.json')

In [51]:
# Byte count of the JSON export.
!wc -c big_df.json

40418168 big_df.json


In [33]:
# List the column names; the Avro schema defined next has one field per column.
big_df.columns

Index(['player_id', 'player_url', 'fifa_version', 'fifa_update',
       'update_as_of', 'short_name', 'long_name', 'player_positions',
       'overall', 'potential'],
      dtype='object')

In [54]:
# Avro record schema for one player row: (field name, Avro type) pairs in the
# same order as the columns of big_df.
player_field_types = [
    ('player_id', 'int'),
    ('player_url', 'string'),
    ('fifa_version', 'float'),
    ('fifa_update', 'float'),
    ('update_as_of', 'string'),
    ('short_name', 'string'),
    ('long_name', 'string'),
    ('player_positions', 'string'),
    ('overall', 'int'),
    ('potential', 'int'),
]

schema_new = {
    'name': 'avro.example.Player',
    'type': 'record',
    'fields': [
        {'name': field_name, 'type': avro_type}
        for field_name, avro_type in player_field_types
    ],
}

In [55]:
# Write every player row to players.avro using schema_new.
# to_dict(orient='records') produces one {column: value} dict per row in a
# single pass, replacing the per-row Series built by iterrows() and the
# unused row index.
player_schema = avro.schema.Parse(json.dumps(schema_new))
with open('players.avro', 'wb') as f:
    writer = DataFileWriter(f, DatumWriter(), player_schema)
    for record in big_df.to_dict(orient='records'):
        writer.append(record)
    writer.close()



In [56]:
# Byte count of the Avro file.
!wc -c players.avro

20665547 players.avro


In [57]:
# Export the same frame as a Parquet file with pandas' default settings.
big_df.to_parquet(path='players.parquet')

In [58]:
# Byte count of the Parquet file.
!wc -c players.parquet

7169300 players.parquet


In [None]:
# Byte counts copied from the `wc -c` outputs for players.avro and
# players.parquet. The original cell held the two bare numbers, the second
# one indented, which raises IndentationError when executed.
avro_size_bytes = 20665547
parquet_size_bytes = 7169300

# How many times larger the Avro file is than the Parquet file.
avro_size_bytes / parquet_size_bytes