## 1. Download commercial contents targeting Romania before the 2024 election

In [None]:
# Scrape TikTok commercial contents for country RO with the minet CLI and
# redirect the CSV output to a file.
# NOTE(review): --max-date is 20241207 while the file name says 20241206 —
# confirm whether the upper bound is exclusive.
! minet tiktok scrape-commercials --country RO --min-date 20241106 --max-date 20241207 > romania-20241106-20241206.csv

The command above uses the [**minet**](https://github.com/medialab/minet) Command Line Interface (CLI). It is roughly equivalent to the Python code below, except that the CLI also retries when a connection error occurs.

In [None]:
from minet.tiktok import TiktokAPIScraper
from minet.tiktok.types import TiktokCommercialContent
from tqdm import tqdm
import csv

# Iterate over the commercial contents returned by minet's TikTok scraper for
# country RO between the two dates, and write each one as a CSV row.
client = TiktokAPIScraper()
generator = client.search_commercial_contents(
    country="RO", min_date="20241106", max_date="20241207"
)

# newline="" is what the csv module expects from the file object (otherwise
# blank lines can appear between rows on Windows). The explicit UTF-8 encoding
# keeps non-ASCII text from depending on the platform default encoding.
with open("romania-20241106-20241206-2.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=TiktokCommercialContent.fieldnames(), escapechar='"')
    writer.writeheader()
    # `total` only sizes the progress bar; presumably the number of results
    # observed for this query when the notebook was written — TODO confirm.
    for row in tqdm(generator, total=11896):
        writer.writerow(row.as_csv_dict_row())

## 2. Fetch cover images associated with each TikTok commercial content

In [None]:
# Download the URLs found in the video_cover_image_urls column of the scraped
# CSV and write a fetch report CSV. Step 3 reads the `path` column of that
# report and looks for the files under the "downloaded" folder.
! minet fetch -i romania-20241106-20241206.csv video_cover_image_urls  --folder-strategy prefix-4 --filename-column id --total 11896 --resume --domain-parallelism 4 --throttle 0 -o romania-20241106-20241206-fetch-report.csv

## 3. Run Optical Character Recognition (OCR) on images

In [None]:
import easyocr
import casanova
import os
from tqdm import tqdm

In [None]:
# EasyOCR reader configured for the "ro" language code.
reader = easyocr.Reader(["ro"])

# Both CSV files are opened as UTF-8 with newline="": the OCR text contains
# Romanian diacritics, which must not depend on the platform default encoding,
# and newline="" is what csv-based readers/writers expect from file objects.
with open(
    "romania-20241106-20241206-fetch-report.csv", "r", newline="", encoding="utf-8"
) as fetch_report, open(
    "romania-20241106-20241206-ocr.csv", "w", newline="", encoding="utf-8"
) as output:
    # The enricher copies every input row to the output and appends an "ocr" column.
    enricher = casanova.enricher(fetch_report, output, add=["ocr"])

    path_position = enricher.headers.path
    for row in tqdm(enricher, total=11896):
        image_path = row[path_position]
        if image_path:
            result = reader.readtext(os.path.join("downloaded", image_path))
            # Join element 1 of every detection (presumably the recognised
            # text — TODO confirm against the easyocr output format).
            full_text = " ".join(text[1] for text in result)
        else:
            # No file path in the fetch report for this row: leave the OCR cell empty.
            full_text = ""
        enricher.writerow(row, [full_text])

## 4. Plot number of images where some text was detected

In [None]:
import pandas as pd
import altair as alt

In [None]:
# Load the OCR-enriched fetch report written in step 3.
ocr_data = pd.read_csv("romania-20241106-20241206-ocr.csv")

# Earliest creation date in the file (shown as the cell output).
ocr_data.create_date.min()

In [None]:
# Reduce the creation timestamp to a YYYY-MM-DD string, used for filtering and grouping.
ocr_data["date"] = ocr_data["create_date"].pipe(pd.to_datetime).dt.strftime("%Y-%m-%d")

In [None]:
# Keep only rows dated after 2024-11-05; comparing strings is safe here because
# `date` is formatted as YYYY-MM-DD.
# .copy() makes the filtered frame independent of the original, so adding
# columns to it in later cells does not raise pandas' SettingWithCopyWarning.
ocr_data = ocr_data.loc[ocr_data.date > "2024-11-05"].copy()

In [None]:
# Earliest creation date remaining in ocr_data (shown as the cell output).
ocr_data.create_date.min()

In [None]:
# True when the `ocr` cell is not missing, i.e. some text was read from the image.
ocr_data["text_detected"] = ~ocr_data["ocr"].isna()

In [None]:
# Number of rows (non-null ids) per day, split by whether text was detected.
aggregation_lower = (
    ocr_data
    .groupby(["date", "text_detected"])["id"]
    .count()
    .reset_index(name="count")
)

In [None]:
# Stacked daily bars: number of images, coloured by whether OCR found text.
text_detection_chart = (
    alt.Chart(aggregation_lower)
    .mark_bar()
    .encode(
        alt.X("monthdate(date):O", title=""),
        alt.Y("sum(count)", title="Number of TikTok images"),
        alt.Color("text_detected:O", legend=alt.Legend(title="Text in image")),
    )
    .properties(height=150, width=500)
)

In [None]:
# Display the text-detection chart as the cell output.
text_detection_chart

## 5. Look for names of candidates for the 2024 presidential election

In [None]:
# Candidates whose last names are searched for in the OCR text.
candidates = [
    "Călin Georgescu",
    "Elena Lasconi",
    "Marcel Ciolacu",
    "George Simion",
    "Nicolae Ciucă",
]

In [None]:
def normalize_name(name):
    """Return the lowercased last name of a candidate.

    The name is split on any run of whitespace and the final token is
    returned, so "Elena Lasconi" gives "lasconi". A single-word name is
    returned lowercased. An empty or whitespace-only string gives "".
    """
    tokens = name.lower().split()
    # Taking the final token keeps multi-part names on their last name.
    # split() with no argument ignores repeated spaces, so a non-blank name
    # never yields "" (an empty substring would match every text).
    return tokens[-1] if tokens else ""

In [None]:
# Quick check of the helper on the second candidate of the list.
normalize_name(candidates[1])

Search for the last name of each candidate. If several candidates are mentioned in the same image, the post is not counted for any of them.

In [None]:
# Only rows where OCR returned some text can mention a candidate.
ocr_found = ocr_data.loc[ocr_data.ocr.notna()].copy()
ocr_found["candidate"] = ""

# Lowercase the OCR text once, then test each last name once. regex=False
# matches the name as a literal substring, so it is never read as a pattern.
ocr_lower = ocr_found.ocr.str.lower()
mentions = pd.DataFrame(
    {
        candidate: ocr_lower.str.contains(normalize_name(candidate), regex=False)
        for candidate in candidates
    },
    index=ocr_found.index,
)

# A row is attributed only when exactly one candidate's last name appears in it.
exclusive = mentions.sum(axis=1) == 1

for candidate in candidates:
    candidate_found = ocr_found.loc[exclusive & mentions[candidate]]
    # Label through the `id` column, so every row sharing a matched id gets the candidate.
    ocr_found.loc[ocr_found.id.isin(candidate_found.id.unique()), "candidate"] = candidate

In [None]:
# Export only the rows that were attributed to a candidate.
(
    ocr_found[ocr_found["candidate"].ne("")]
    .to_csv("romania-20241106-20241206-candidates.csv", index=False)
)

In [None]:
# Number of rows (non-null ids) per day and per attributed candidate
# ("" groups the rows attributed to nobody).
aggregation_upper = (
    ocr_found
    .groupby(["date", "candidate"])["id"]
    .count()
    .reset_index(name="count")
)

In [None]:
# Add one zero-count row per (date, candidate) pair so that every candidate
# appears in the aggregated data even when no image mentions them. Zero rows
# do not change the bar heights.
missing_rows = pd.DataFrame(
    [
        (date, candidate, 0)
        for date in aggregation_upper["date"].unique()
        for candidate in candidates
    ],
    columns=["date", "candidate", "count"],
)
aggregation_upper = pd.concat([missing_rows, aggregation_upper])

## 6. Plot mentions of candidates in TikTok images

In [None]:
# Colours passed as the range of the candidate colour scale.
palette = ["indigo", "coral", "palegreen", "gold", "cadetblue"]

In [None]:

# One small bar chart per candidate (facet rows ordered like `candidates`),
# ignoring rows that were attributed to nobody.
candidates_chart = (
    alt.Chart(aggregation_upper)
    .transform_filter(alt.datum.candidate != "")
    .mark_bar()
    .encode(
        alt.X("monthdate(date):O", title=""),
        alt.Y("count:Q", title="", scale=alt.Scale(domain=[0, 10])),
        alt.Color("candidate:N", legend=alt.Legend()).scale(range=palette),
        alt.Row("candidate:N", title="Number of TikTok images mentioning one candidate").sort(candidates),
    )
    .properties(height=60, width=500)
)

In [None]:
# Display the per-candidate chart as the cell output.
candidates_chart

## 7. Plot both charts

In [81]:
# Stack the two charts vertically. The colour scale is resolved independently
# so each chart keeps its own colour mapping and legend (candidates in the
# first, text detected or not in the second).
alt.vconcat(candidates_chart, text_detection_chart).resolve_scale(color="independent")