In [23]:
from ural import normalize_url
import polars as pl
from datetime import datetime

In [24]:
media_urls = pl.scan_csv("corpus mediacloud_Lola_traditional_media.csv")

In [25]:
event_start_date = datetime(2022,10,14)

In [52]:
q1 = (
    pl.scan_csv("corpus mediacloud_Lola_traditional_media.csv", try_parse_dates=True)
    .with_columns([
        pl.col("url").apply(normalize_url, return_dtype=pl.Utf8).alias("normalized_url"),
        pl.col("language").str.to_uppercase() # Just to reproduce the user guide
    ])
    .filter(pl.col("publish_date") > event_start_date)
)


In [47]:
q1.schema

{'archived_url': Utf8,
 'article_url': Utf8,
 'id': Int64,
 'language': Utf8,
 'media_name': Utf8,
 'media_url': Utf8,
 'publish_date': Datetime(tu='us', tz=None),
 'title': Utf8,
 'url': Utf8}

In [70]:
q1.show_graph(optimized=False) #Apparently graphviz needs to be installed

ImportError: Graphviz dot binary should be on your PATH

In [74]:
print(q1.describe_plan()) # Deprecated

FILTER [(col("publish_date")) > (1665705600000000.strict_cast(Datetime(Microseconds, None)))] FROM WITH_COLUMNS:
 [col("url").map_list().alias("normalized_url"), col("language").str.uppercase()]

    CSV SCAN corpus mediacloud_Lola_traditional_media.csv
    PROJECT */9 COLUMNS


  print(q1.describe_plan()) # Deprecated


In [75]:
print(q1.explain())

 WITH_COLUMNS:
 [col("url").map_list().alias("normalized_url"), col("language").str.uppercase()]

    CSV SCAN corpus mediacloud_Lola_traditional_media.csv
    PROJECT */9 COLUMNS
    SELECTION: [(col("publish_date")) > (2022-10-14 00:00:00)]


In [76]:
print(q1.describe_optimized_plan()) 
#I don't see the difference: https://pola-rs.github.io/polars-book/user-guide/lazy-api/lazy-query-plan.html#optimized-query-plan

 WITH_COLUMNS:
 [col("url").map_list().alias("normalized_url"), col("language").str.uppercase()]

    CSV SCAN corpus mediacloud_Lola_traditional_media.csv
    PROJECT */9 COLUMNS
    SELECTION: [(col("publish_date")) > (2022-10-14 00:00:00)]


In [87]:
first_rows = (
    pl.scan_csv("corpus mediacloud_Lola_traditional_media.csv", try_parse_dates=True)
    .with_columns([
        pl.col("url").apply(normalize_url, return_dtype=pl.Utf8).alias("normalized_url"),
        pl.col("language").str.to_uppercase() # Just to reproduce the user guide
    ])
    .filter(pl.col("publish_date") > event_start_date)
    .fetch(n_rows=170)
)

In [88]:
first_rows

archived_url,article_url,id,language,media_name,media_url,publish_date,title,url,normalized_url
str,str,i64,str,str,str,datetime[μs],str,str,str
"""https://web.ar...","""https://waybac...",20221017210806,"""FR""","""bfmtv.com""","""http://bfmtv.c...",2022-10-15 00:00:00,"""EN DIRECT - Ru...","""https://www.bf...","""bfmtv.com/inte..."
"""https://web.ar...","""https://waybac...",20221017210808,"""FR""","""bfmtv.com""","""http://bfmtv.c...",2022-10-15 00:00:00,"""Météo: un temp...","""https://www.bf...","""bfmtv.com/mete..."
"""https://web.ar...","""https://waybac...",20221017210809,"""FR""","""bfmtv.com""","""http://bfmtv.c...",2022-10-15 00:00:00,"""Qu’est-ce que ...","""https://www.bf...","""bfmtv.com/tech..."
"""https://web.ar...","""https://waybac...",20221017210811,"""FR""","""bfmtv.com""","""http://bfmtv.c...",2022-10-15 00:00:00,"""Turquie: 28 mo...","""https://www.bf...","""bfmtv.com/inte..."
"""https://web.ar...","""https://waybac...",20221017210833,"""FR""","""bfmtv.com""","""http://bfmtv.c...",2022-10-15 00:00:00,"""Météo Nord-Pas...","""https://www.bf...","""bfmtv.com/gran..."
"""https://web.ar...","""https://waybac...",20221017211009,"""FR""","""bfmtv.com""","""http://bfmtv.c...",2022-10-15 00:00:00,"""Carburants: 27...","""https://www.bf...","""bfmtv.com/econ..."


In [89]:
all_dataset = (
    pl.scan_csv("corpus mediacloud_Lola_traditional_media.csv", try_parse_dates=True)
    .with_columns([
        pl.col("url").apply(normalize_url, return_dtype=pl.Utf8).alias("normalized_url"),
        pl.col("language").str.to_uppercase() # Just to reproduce the user guide
    ])
    .filter(pl.col("publish_date") > event_start_date)
    .collect(streaming=True)
)

In [90]:
all_dataset.shape

(6662, 10)