In [120]:
import polars as pl
import numpy as np
from pathlib import Path
from collections import defaultdict
from datetime import datetime
from redis_om import JsonModel, HashModel
from pydantic import validator, BaseModel, root_validator, ByteSize
from io import BytesIO

In [2]:
# Root folder of the input data; the per-type daily snapshots live in data/byType.
data_dir = Path("data")
# Materialise and sort the matches: a bare glob() is a one-shot generator whose
# order depends on the filesystem, so re-running the loading cell would find it
# exhausted and the position of each snapshot in later lists would not be stable.
filenames = sorted((data_dir / "byType").glob("*.csv"))

In [3]:
# Read every CSV found above into its own polars DataFrame (one frame per file).
dt = [pl.read_csv(csv_file) for csv_file in filenames]

In [4]:
# Display the first loaded frame to check the parsed columns and dtypes.
dt[0]

Unnamed: 0_level_0,country,equipment_type,destroyed,abandoned,captured,damaged,type_total,row_id,Date
i64,str,str,i64,i64,i64,i64,i64,i64,str
1,"""Russia""","""All Types""",3304,327,1422,116,5169,1,"""2022-08-12"""
2,"""Russia""","""Tanks""",573,50,281,36,940,2,"""2022-08-12"""
3,"""Russia""","""Armoured Fight...",355,34,139,6,534,3,"""2022-08-12"""
4,"""Russia""","""Infantry Fight...",671,71,261,21,1024,4,"""2022-08-12"""
5,"""Russia""","""Armoured Perso...",71,10,58,3,142,5,"""2022-08-12"""
6,"""Russia""","""Mine-Resistant...",17,4,8,3,32,6,"""2022-08-12"""
7,"""Russia""","""Infantry Mobil...",73,4,35,2,114,7,"""2022-08-12"""
8,"""Russia""","""Command Posts ...",60,7,41,0,108,8,"""2022-08-12"""
9,"""Russia""","""Engineering Ve...",64,41,71,3,179,9,"""2022-08-12"""
10,"""Russia""","""Self-Propelled...",8,5,9,0,22,10,"""2022-08-12"""


In [5]:
# For each country, collect equipment_type -> [(snapshot date, type_total), ...]
# by walking every daily frame in `dt`.
stacked_russia = defaultdict(list)
stacked_ukraine = defaultdict(list)
for day in dt:
    # Only the first row's "Date" value is read to date the whole frame.
    date = datetime.strptime(day["Date"][0], '%Y-%m-%d')
    for country, stacked in (("Russia", stacked_russia), ("Ukraine", stacked_ukraine)):
        totals = (
            day.filter(pl.col("country") == country)
            .select(["equipment_type", "type_total"])
            .to_dict(as_series=False)
        )
        for equipment_type, number in zip(totals["equipment_type"], totals["type_total"]):
            stacked[equipment_type].append((date, number))

In [6]:
# Convert the accumulators to plain dicts, so that looking up an unknown
# equipment type raises KeyError instead of silently inserting an empty list.
stacked_russia, stacked_ukraine = dict(stacked_russia), dict(stacked_ukraine)

In [67]:
# One record per daily frame: the date (dashes removed) plus Russia's type_total
# for each equipment type; the "All Types" aggregate row is skipped.
# NOTE(review): labels that differ only in capitalisation (e.g.
# "MineResistantAmbushProtected" vs "MineresistantAmbushProtected") produce
# distinct keys, and therefore distinct columns downstream.
unwanted_chars = str.maketrans("", "", " -\n,")  # characters dropped from labels
russia = []
for day in dt:
    date = day["Date"][0].replace("-", "")
    ru = (
        day.filter(pl.col("country") == "Russia")
        .select(["equipment_type", "type_total"])
        .to_dict(as_series=False)
    )
    record = {"date": date}
    record.update(
        (eqp.translate(unwanted_chars), number)
        for eqp, number in zip(ru["equipment_type"], ru["type_total"])
        if eqp != "All Types"
    )
    russia.append(record)
russia

[{'date': '20220812',
  'Tanks': 940,
  'ArmouredFightingVehicles': 534,
  'InfantryFightingVehicles': 1024,
  'ArmouredPersonnelCarriers': 142,
  'MineResistantAmbushProtected': 32,
  'InfantryMobilityVehicles': 114,
  'CommandPostsAndCommunicationsStations': 108,
  'EngineeringVehiclesAndEquipment': 179,
  'SelfPropelledAntiTankMissileSystems': 22,
  'HeavyMortars': 14,
  'TowedArtillery': 75,
  'SelfPropelledArtillery': 152,
  'MultipleRocketLaunchers': 93,
  'AntiAircraftGuns': 7,
  'SelfPropelledAntiAircraftGuns': 17,
  'SurfaceToAirMissileSystems': 68,
  'Radars': 13,
  'JammersAndDeceptionSystems': 12,
  'Aircraft': 50,
  'Helicopters': 49,
  'UnmannedAerialVehicles': 105,
  'NavalShips': 11,
  'LogisticsTrains': 3,
  'TrucksVehiclesandJeeps': 1402},
 {'date': '20220225',
  'Tanks': 2,
  'ArmouredFightingVehicles': 7,
  'InfantryFightingVehicles': 4,
  'MineresistantAmbushProtected': 1,
  'InfantryMobilityVehicles': 5,
  'EngineeringVehicles': 3,
  'Mortars': 1,
  'Artillery': 4

In [68]:
# Turn the per-day records into one wide frame ordered by the "date" string.
russia_df = (
    pl.DataFrame(russia)
    .sort(by="date")
)
russia_df

date,Tanks,ArmouredFightingVehicles,InfantryFightingVehicles,ArmouredPersonnelCarriers,MineResistantAmbushProtected,InfantryMobilityVehicles,CommandPostsAndCommunicationsStations,EngineeringVehiclesAndEquipment,SelfPropelledAntiTankMissileSystems,HeavyMortars,TowedArtillery,SelfPropelledArtillery,MultipleRocketLaunchers,AntiAircraftGuns,SelfPropelledAntiAircraftGuns,SurfaceToAirMissileSystems,Radars,JammersAndDeceptionSystems,Aircraft,Helicopters,UnmannedAerialVehicles,NavalShips,LogisticsTrains,TrucksVehiclesandJeeps,MineresistantAmbushProtected,EngineeringVehicles,Mortars,Artillery,ArtillerySupportVehiclesAndEquipment,UnmannedCombatAerialVehicles,ReconnaissanceUnmannedAerialVehicles,CommunicationsVehicles,AntitankGuidedMissiles,CommunicationsStations,AntiTankGuidedMissiles,ManPortableAirDefenceSystems,CommunicationsStation,SelfpropelledAntiAircraftGuns
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""20220224""",1,5,1,,,2,,,,,,,,,,,,,2,2,,,,9,,3,,,,,,,,,,,,
"""20220225""",2,7,4,,,5,,,,,,,4,,,2,,,3,2,,,,19,1,3,1,4,,,,,,,,,,
"""20220226""",7,7,11,,,7,,,,,,,7,,,3,,,3,2,,,,37,2,12,1,4,,,,2,3,,,,,
"""20220227""",9,11,15,6,,11,,,,,,,7,,,4,,,3,3,,,,66,2,12,2,4,,,,2,4,,,,,
"""20220228""",41,31,43,14,3,20,,,,2,4,,8,,,13,,1,3,4,,,,108,,16,,,,,,2,5,,,1,,2
"""20220301""",44,36,48,14,3,22,,,,2,4,6,9,,,14,,1,3,6,,,2,117,,17,,,,,,,6,,,1,3,2
"""20220302""",55,49,62,24,3,24,,,,2,6,10,11,,,17,,1,4,7,1,,2,162,,23,,,,,,,18,,,1,5,3
"""20220303""",65,55,71,28,4,24,,,,2,8,13,13,,,17,,1,4,7,1,,2,177,,24,,,,,,,21,,,1,7,3
"""20220304""",91,65,80,33,4,25,,,,3,9,15,15,,,18,,1,6,8,1,,2,204,,26,,,,,,,26,,,3,8,7
"""20220305""",106,73,105,41,4,29,,,,3,9,15,16,,,20,,1,10,10,2,,2,224,,33,,,,,,,,8,29,3,,7


In [69]:
# Order the columns by their value in the last row of `russia_df` (ascending),
# keeping only the columns that are non-null in that row.
# transpose(include_header=True) yields a "column" column (original column names)
# and a "column_0" column (that row's values).
latest_row = russia_df[-1].transpose(include_header=True).drop_nulls()
count_order = (
    latest_row
    # "date" is excluded from the numeric sort and placed explicitly below, rather
    # than relying on YYYYMMDD cast to an integer happening to sort last.
    .filter(pl.col("column") != "date")
    # with_columns replaces the deprecated singular with_column.
    .with_columns([pl.col("column_0").cast(pl.Int32)])
    .sort(by="column_0")
)
sorted_cols = ["date", *count_order["column"]]

In [70]:
# Keep only the ordered column subset and sort the rows by the "date" string.
processed = (
    russia_df
    .select(sorted_cols)
    .sort(by="date")
)

In [72]:
# Export the processed frame as CSV into the current working directory.
processed.write_csv("russia.csv")

In [73]:
# Scratch check: a compact YYYYMMDD string, as stored in the "date" column,
# parses with the "%Y%m%d" format.
datetime.strptime("20220224","%Y%m%d")

datetime.datetime(2022, 2, 24, 0, 0)

In [75]:
# Write the processed frame to disk in Avro format through a binary file handle.
avro_path = Path("test.avro")
with avro_path.open("wb") as avro_file:
    processed.write_avro(avro_file)

In [None]:
# Same Avro serialisation, but into an in-memory buffer, to inspect the raw bytes.
with BytesIO() as buffer:
    processed.write_avro(buffer)
    print(buffer.getvalue())

In [181]:
from typing import Dict
class MyCSV(JsonModel):
    """JSON model that stores a polars DataFrame as Avro-encoded bytes.

    Build it either with ``df=<polars DataFrame>`` (serialised into ``data``
    by the root validator) or with ``data`` directly, given as raw bytes or as
    the hex string produced by the ``bytes`` JSON encoder configured below.
    """

    # Avro serialisation of the wrapped DataFrame.
    data: bytes

    class Config:
        arbitrary_types_allowed = True
        extra = "ignore"
        # bytes values are written out as a hex string when encoding to JSON.
        json_encoders = {
            bytes: lambda v: v.hex()
        }

    def __repr__(self) -> str:
        """Return the repr of the decoded DataFrame rather than the byte payload."""
        return self.df.__repr__()

    @root_validator(pre=True)
    def root(cls, values: Dict):
        """Normalise the constructor input so that ``data`` always ends up as bytes.

        * ``data`` missing: ``df`` must be present and is serialised to Avro.
        * ``data`` is a ``str``: it is decoded from hex.
        * anything else is passed through untouched.

        Raises:
            ValueError: if neither ``data`` nor ``df`` is supplied.
        """
        if "data" not in values:
            # Raise explicitly instead of using `assert`: assertions are stripped
            # under `python -O`, which would let a missing `df` reach
            # serialize_df as None.
            if "df" not in values:
                raise ValueError("a dataframe should be provided")
            values["data"] = cls.serialize_df(values["df"])
        elif isinstance(values["data"], str):  # Data coming from redis db
            values["data"] = bytes.fromhex(values["data"])
        return values

    @staticmethod
    def serialize_df(v: pl.DataFrame) -> bytes:
        """Encode ``v`` as Avro and return the resulting bytes."""
        with BytesIO() as btio:
            v.write_avro(btio)
            # getvalue() copies the contents without creating a buffer export;
            # a live getbuffer() view can make closing the BytesIO fail.
            return btio.getvalue()

    @staticmethod
    def deserialize_df(bt: bytes) -> pl.DataFrame:
        """Decode Avro bytes produced by ``serialize_df`` back into a DataFrame."""
        # BytesIO(bt) starts positioned at offset 0, so no write/seek is needed.
        with BytesIO(bt) as btio:
            return pl.read_avro(btio)

    @property
    def df(self) -> pl.DataFrame:
        """The stored DataFrame, decoded from ``data`` on every access."""
        return self.deserialize_df(self.data)

In [182]:
# Wrap the processed frame; MyCSV's root validator serialises `df` into `data`.
# The displayed value is the model's __repr__, i.e. the decoded DataFrame.
mycsv = MyCSV(df=processed)
mycsv

shape: (311, 25)
┌──────────┬────────────┬──────────┬────────────┬─────┬────────────┬───────┬────────────┬────────────┐
│ date     ┆ UnmannedCo ┆ NavalShi ┆ AntiAircra ┆ ... ┆ ArmouredFi ┆ Tanks ┆ InfantryFi ┆ TrucksVehi │
│ ---      ┆ mbatAerial ┆ ps       ┆ ftGuns     ┆     ┆ ghtingVehi ┆ ---   ┆ ghtingVehi ┆ clesandJee │
│ str      ┆ Vehicles   ┆ ---      ┆ ---        ┆     ┆ cles       ┆ i64   ┆ cles       ┆ ps         │
│          ┆ ---        ┆ i64      ┆ i64        ┆     ┆ ---        ┆       ┆ ---        ┆ ---        │
│          ┆ i64        ┆          ┆            ┆     ┆ i64        ┆       ┆ i64        ┆ i64        │
╞══════════╪════════════╪══════════╪════════════╪═════╪════════════╪═══════╪════════════╪════════════╡
│ 20220224 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 5          ┆ 1     ┆ 1          ┆ 9          │
│ 20220225 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 7          ┆ 2     ┆ 4          ┆ 19         │
│ 20220226 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 

In [183]:
# Persist the model via the save() inherited from JsonModel.
# NOTE(review): presumably needs a reachable Redis instance — confirm the connection settings.
mycsv.save()

shape: (311, 25)
┌──────────┬────────────┬──────────┬────────────┬─────┬────────────┬───────┬────────────┬────────────┐
│ date     ┆ UnmannedCo ┆ NavalShi ┆ AntiAircra ┆ ... ┆ ArmouredFi ┆ Tanks ┆ InfantryFi ┆ TrucksVehi │
│ ---      ┆ mbatAerial ┆ ps       ┆ ftGuns     ┆     ┆ ghtingVehi ┆ ---   ┆ ghtingVehi ┆ clesandJee │
│ str      ┆ Vehicles   ┆ ---      ┆ ---        ┆     ┆ cles       ┆ i64   ┆ cles       ┆ ps         │
│          ┆ ---        ┆ i64      ┆ i64        ┆     ┆ ---        ┆       ┆ ---        ┆ ---        │
│          ┆ i64        ┆          ┆            ┆     ┆ i64        ┆       ┆ i64        ┆ i64        │
╞══════════╪════════════╪══════════╪════════════╪═════╪════════════╪═══════╪════════════╪════════════╡
│ 20220224 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 5          ┆ 1     ┆ 1          ┆ 9          │
│ 20220225 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 7          ┆ 2     ┆ 4          ┆ 19         │
│ 20220226 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 

In [185]:
from commons.models.dataframe import AvroDF

# Same wrapping using AvroDF imported from the `commons` package
# (presumably the packaged counterpart of MyCSV — verify against its source).
# NOTE(review): `df` here is a model instance, not a DataFrame.
df = AvroDF(df=processed)
df

shape: (311, 25)
┌──────────┬────────────┬──────────┬────────────┬─────┬────────────┬───────┬────────────┬────────────┐
│ date     ┆ UnmannedCo ┆ NavalShi ┆ AntiAircra ┆ ... ┆ ArmouredFi ┆ Tanks ┆ InfantryFi ┆ TrucksVehi │
│ ---      ┆ mbatAerial ┆ ps       ┆ ftGuns     ┆     ┆ ghtingVehi ┆ ---   ┆ ghtingVehi ┆ clesandJee │
│ str      ┆ Vehicles   ┆ ---      ┆ ---        ┆     ┆ cles       ┆ i64   ┆ cles       ┆ ps         │
│          ┆ ---        ┆ i64      ┆ i64        ┆     ┆ ---        ┆       ┆ ---        ┆ ---        │
│          ┆ i64        ┆          ┆            ┆     ┆ i64        ┆       ┆ i64        ┆ i64        │
╞══════════╪════════════╪══════════╪════════════╪═════╪════════════╪═══════╪════════════╪════════════╡
│ 20220224 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 5          ┆ 1     ┆ 1          ┆ 9          │
│ 20220225 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 7          ┆ 2     ┆ 4          ┆ 19         │
│ 20220226 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 

In [186]:
# Persist the AvroDF instance; the displayed value is what save() returns.
df.save()

shape: (311, 25)
┌──────────┬────────────┬──────────┬────────────┬─────┬────────────┬───────┬────────────┬────────────┐
│ date     ┆ UnmannedCo ┆ NavalShi ┆ AntiAircra ┆ ... ┆ ArmouredFi ┆ Tanks ┆ InfantryFi ┆ TrucksVehi │
│ ---      ┆ mbatAerial ┆ ps       ┆ ftGuns     ┆     ┆ ghtingVehi ┆ ---   ┆ ghtingVehi ┆ clesandJee │
│ str      ┆ Vehicles   ┆ ---      ┆ ---        ┆     ┆ cles       ┆ i64   ┆ cles       ┆ ps         │
│          ┆ ---        ┆ i64      ┆ i64        ┆     ┆ ---        ┆       ┆ ---        ┆ ---        │
│          ┆ i64        ┆          ┆            ┆     ┆ i64        ┆       ┆ i64        ┆ i64        │
╞══════════╪════════════╪══════════╪════════════╪═════╪════════════╪═══════╪════════════╪════════════╡
│ 20220224 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 5          ┆ 1     ┆ 1          ┆ 9          │
│ 20220225 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 7          ┆ 2     ┆ 4          ┆ 19         │
│ 20220226 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 

In [187]:
# Read the record back by its primary key to check the save/load round trip.
AvroDF.get(df.pk)

shape: (311, 25)
┌──────────┬────────────┬──────────┬────────────┬─────┬────────────┬───────┬────────────┬────────────┐
│ date     ┆ UnmannedCo ┆ NavalShi ┆ AntiAircra ┆ ... ┆ ArmouredFi ┆ Tanks ┆ InfantryFi ┆ TrucksVehi │
│ ---      ┆ mbatAerial ┆ ps       ┆ ftGuns     ┆     ┆ ghtingVehi ┆ ---   ┆ ghtingVehi ┆ clesandJee │
│ str      ┆ Vehicles   ┆ ---      ┆ ---        ┆     ┆ cles       ┆ i64   ┆ cles       ┆ ps         │
│          ┆ ---        ┆ i64      ┆ i64        ┆     ┆ ---        ┆       ┆ ---        ┆ ---        │
│          ┆ i64        ┆          ┆            ┆     ┆ i64        ┆       ┆ i64        ┆ i64        │
╞══════════╪════════════╪══════════╪════════════╪═════╪════════════╪═══════╪════════════╪════════════╡
│ 20220224 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 5          ┆ 1     ┆ 1          ┆ 9          │
│ 20220225 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 7          ┆ 2     ┆ 4          ┆ 19         │
│ 20220226 ┆ null       ┆ null     ┆ null       ┆ ... ┆ 