In [2]:
import polars as pl
import matplotlib.pyplot as plt
import zipfile
from datetime import datetime

In [None]:
#List of the time windows under study.
#Each entry is (month name, year, zero-padded month number, first day, last day);
#the scanning functions below read the year, month number and day bounds by position.
listDetailedMonthsAprSept = [
    ("april", "2024", "04", 1, 15),
    ("april", "2024", "04", 16, 30),
    ("may", "2024", "05", 1, 15),
    ("may", "2024", "05", 16, 31),
    ("june", "2024", "06", 1, 15),
    ("june", "2024", "06", 16, 30),
    ("july", "2024", "07", 1, 15),
    ("july", "2024", "07", 16, 31),
    ("august", "2024", "08", 1, 15),
    ("august", "2024", "08", 16, 31),
    ("september", "2024", "09", 1, 15),
    ("september", "2024", "09", 16, 22),
    ("september", "2024", "09", 23, 30),
    ("october", "2024", "10", 1, 6),
]

# Figure 10 - Article 16 notices

In [None]:
#This function parses a string column of a df as datetimes
def convertStrToDatetime(df, datecolumnstr):
    """Return `df` with the column `datecolumnstr` parsed by `.str.to_datetime()`.

    Parameters:
        df: frame holding the column to parse.
        datecolumnstr: name of the string column containing the dates.

    Returns:
        The result of `df.with_columns(...)` applied to the parsed column.
    """
    parsed_dates = df[datecolumnstr].str.to_datetime()
    return df.with_columns(parsed_dates)

#This function scans zipped CSV files and collects only the rows that match col=wantedvalue (given in the parameters)
def filterTimePeriod(platform, listMonths, col, wantedvalue, dataroot="/Volumes/Paresseux"):
    """Collect, over several time windows, the rows where `col` equals `wantedvalue`.

    One archive is read per day, located at
    `{dataroot}/{platform}/sor-{platform.lower()}-{year}-{month}-{day}-full.zip`.
    Each daily archive is a zip of zips: every inner zip is opened and every file
    it contains is scanned as a CSV.

    Parameters:
        platform: platform name, used as sub-directory and (lower-cased) in the file name.
        listMonths: iterable of windows; items 1 to 4 of each window are read as
            year, month, first day and last day (both days included).
        col: name of the column to filter on.
        wantedvalue: value that `col` must equal for a row to be kept.
        dataroot: directory holding the per-platform archive folders.

    Returns:
        A single pl.DataFrame with the matching rows of all windows, in scan order,
        or an empty pl.DataFrame when `listMonths` is empty.
    """
    #scans the csvs logged on a given day and returns the filtered rows of that day
    def filterDay(year, month, day) -> pl.DataFrame:
        day = str(day).zfill(2)     #converts day(int) as day(string) with two numbers (01 instead of 1)
        filename = f"{dataroot}/{platform}/sor-{platform.lower()}-{year}-{month}-{day}-full.zip"
        dfs = []
        #context managers close every archive and member once it has been read;
        #.collect() runs inside the innermost block, so the data is materialised before closing
        with zipfile.ZipFile(filename) as mother:
            for childname in mother.namelist():
                with mother.open(childname) as childfile, zipfile.ZipFile(childfile) as child:
                    for csvfilename in child.namelist():
                        with child.open(csvfilename) as csvfile:
                            dfs.append(pl.scan_csv(csvfile).filter(pl.col(col)==wantedvalue).collect())
        return pl.concat(dfs)

    #automates filterDay over the days firstDayMonth..lastDayMonth (both included)
    def filterMonth(year, month, firstDayMonth, lastDayMonth) -> pl.DataFrame:
        return pl.concat([filterDay(year, month, day)
                          for day in range(firstDayMonth, lastDayMonth+1)])

    monthly_frames = []
    for month in listMonths:
        print(f"{month} running")
        monthly_frames.append(filterMonth(month[1], month[2], month[3], month[4]))

    #no window requested: keep returning an empty frame, as before
    if not monthly_frames:
        return pl.DataFrame()
    return pl.concat(monthly_frames)

In [None]:
#choose either Facebook or Instagram as data
#(a single scan is run; the previous version scanned twice and its "INSTAGRAM" call actually loaded Facebook)
platform_16 = "Instagram"   #set to "Facebook" to study Facebook instead

df_16_raw=filterTimePeriod(platform_16, listDetailedMonthsAprSept, "source_type", "SOURCE_ARTICLE_16")

df_16=convertStrToDatetime(df_16_raw, "application_date")

#half-open window [1 April, 1 October): application_date is a datetime here, so an upper bound of
#"<= 30 September 00:00" would drop any row of 30 September that carries a time of day
df16_righttime=df_16.filter((pl.col("application_date")>=datetime(2024, 4, 1)).and_(pl.col("application_date")<datetime(2024, 10, 1)))

In [None]:
pl.Config(tbl_rows=14) #to be able to visualise the 14 rows

#Only the last expression of a cell is displayed automatically,
#so the intermediate results are printed explicitly.

#column 3: "incompatible content" as ground and "removed" as sanction
df_col3= df16_righttime.filter((pl.col("decision_ground")=="DECISION_GROUND_INCOMPATIBLE_CONTENT").and_(pl.col("decision_visibility")=="[\"DECISION_VISIBILITY_CONTENT_REMOVED\"]"))
print(df_col3["category"].len()) #total
print(df_col3["category"].value_counts().sort(by="category")) #total per category

#column 4: "illegal content" as ground
df_col4=df16_righttime.filter(pl.col("decision_ground")=="DECISION_GROUND_ILLEGAL_CONTENT")
print(df_col4["category"].len()) #total
df_col4["category"].value_counts().sort(by="category") #total per category (last expression: displayed by the notebook)

# Figure 11 - Types of sanction

In [15]:
#this function replaces the datetimes of a column by their date (datetime->date)
def dropHours(df, datetimecolumn):
    """Return `df` with `datetimecolumn` passed through `.dt.date()`.

    Parameters:
        df: frame holding the column.
        datetimecolumn: name of the datetime column to truncate.
    """
    date_only = pl.col(datetimecolumn).dt.date()
    return df.with_columns(date_only)

#this function scans the csvs over several given months
#and returns a dictionary of dfs, one per selected variable (except the timeindex)
def scanSeveralMonthsSummary(platform, listMonths, listVariablesToSelect, timeindex, drophours: bool, dataroot="/Volumes/Paresseux"):
    """Count, day by day, the values taken by each selected variable.

    One archive is read per day, located at
    `{dataroot}/{platform}/sor-{platform.lower()}-{year}-{month}-{day}-full.zip`.
    Each daily archive is a zip of zips: every inner zip is opened and every file
    it contains is scanned as a CSV, keeping only `listVariablesToSelect`.

    Parameters:
        platform: platform name, used as sub-directory and (lower-cased) in the file name.
        listMonths: iterable of windows; items 1 to 4 of each window are read as
            year, month, first day and last day (both days included).
        listVariablesToSelect: column names to read; must contain `timeindex`
            (a ValueError is raised otherwise).
        timeindex: name of the date column, parsed with convertStrToDatetime.
        drophours: when true, `timeindex` is additionally reduced to a date with dropHours.
        dataroot: directory holding the per-platform archive folders.

    Returns:
        A dict mapping every selected variable other than `timeindex` to a pl.DataFrame
        made of the per-day `group_by([timeindex, variable]).len()` tables, stacked in
        scan order. The frames are empty when `listMonths` is empty.
    """
    #reads the csvs logged on a given day and returns the selected columns with a parsed timeindex
    def scanDaySummary(year, month, day) -> pl.DataFrame:
        day = str(day).zfill(2)     #converts day(int) as day(string) with two numbers (01 instead of 1)
        filename = f"{dataroot}/{platform}/sor-{platform.lower()}-{year}-{month}-{day}-full.zip"
        dfs = []
        #context managers close every archive and member once it has been read;
        #.collect() runs inside the innermost block, so the data is materialised before closing
        with zipfile.ZipFile(filename) as mother:
            for childname in mother.namelist():
                with mother.open(childname) as childfile, zipfile.ZipFile(childfile) as child:
                    for csvfilename in child.namelist():
                        with child.open(csvfilename) as csvfile:
                            dfs.append(pl.scan_csv(csvfile).select(listVariablesToSelect).collect())
        df_day = convertStrToDatetime(pl.concat(dfs), timeindex)
        return dropHours(df_day, timeindex) if drophours else df_day

    #variables to count: everything selected except the time index
    listvariables = list(listVariablesToSelect)
    listvariables.remove(timeindex)

    #per-variable list of daily count tables, concatenated once at the end
    daily_counts = {variable: [] for variable in listvariables}
    for month in listMonths:
        print(f"{month} running")
        for day in range(month[3], month[4]+1):
            df_day = scanDaySummary(month[1], month[2], day)
            for variable in listvariables:
                daily_counts[variable].append(
                    df_day.group_by([timeindex, variable]).len().sort(by=timeindex)
                )

    return {
        variable: pl.concat(frames) if frames else pl.DataFrame()
        for variable, frames in daily_counts.items()
    }

#this function takes a dictionary of dfs, and returns a cleaned table counting the modalities of selectedVariable per day in timeindex,
#with the additional columns of a daily sum and the associated percentage
def formatDict(dict_df, selectedVariable, timeindex):
    """Build the per-day table of counts and shares for one variable.

    Parameters:
        dict_df: dict of frames keyed by variable name; `dict_df[selectedVariable]`
            must hold the columns `timeindex`, `selectedVariable` and "len".
        selectedVariable: key of the frame to format, also the name of its value column.
        timeindex: name of the date column.

    Returns:
        A pl.DataFrame sorted by `timeindex` with one row per (timeindex, value) pair and
        the columns `timeindex`, `selectedVariable`, "len" (count of the pair),
        "daily_sum" (sum of "len" over the same `timeindex`) and "%_sum" (len / daily_sum).
    """
    #merge the partial counts of identical (timeindex, value) pairs
    df=dict_df[selectedVariable].group_by([timeindex, selectedVariable]).sum().sort(by=timeindex)
    #aggregate only "len": summing every column also summed the categorical column,
    #which left an all-null "<selectedVariable>_right" column in the joined result
    df_sum=df.group_by(timeindex).agg(pl.col("len").sum().alias("daily_sum"))
    df_final=df.join(df_sum, on=timeindex, how="left") #adds the daily_sum to each row
    df_final=df_final.with_columns((pl.col("len")/pl.col("daily_sum")).alias("%_sum"))  #create the %_sum column
    return df_final

In [None]:
#choose either Facebook or Instagram as data
#(a single scan is run; the previous version scanned Facebook in full and then overwrote the result with Instagram)
platform_decisions = "Instagram"   #set to "Facebook" to study Facebook instead

dict_df_decisions= scanSeveralMonthsSummary(platform_decisions, listDetailedMonthsAprSept,
                                            ["application_date", "decision_visibility", "decision_account"],
                                            "application_date", drophours=True)

df_vis=formatDict(dict_df_decisions, "decision_visibility", "application_date")
df_acc=formatDict(dict_df_decisions, "decision_account", "application_date")

In [None]:
#the volume can be read in the "len" column

#Only the last expression of a cell is displayed automatically,
#so the first two tables are printed explicitly.

#study window shared by the three tables (both bounds included)
in_study_window=(pl.col("application_date")>=datetime(2024, 4, 1)).and_(pl.col("application_date")<=datetime(2024, 9, 30))

#demoted content
print(df_vis
 .filter(in_study_window)
 .filter(pl.col("decision_visibility")=="[\"DECISION_VISIBILITY_CONTENT_DEMOTED\"]").sum()
)

#removed content
print(df_vis
 .filter(in_study_window)
 .filter(pl.col("decision_visibility")=="[\"DECISION_VISIBILITY_CONTENT_REMOVED\"]").sum()
)

#terminated accounts (last expression: displayed by the notebook)
(df_acc
 .filter(in_study_window)
 .filter(pl.col("decision_account")=="DECISION_ACCOUNT_TERMINATED").sum()
)