In [1]:
import os
import pandas as pd
import re
from collections import defaultdict

# Collect the names of the pickle files to analyze (one per video, judging by
# the report printed at the end of the script). Nothing is loaded here; each
# file is read later, when it is classified.
processed_dir = 'processed'
# os.listdir() returns entries in arbitrary order, so sort the names to make
# the order of the printed report reproducible across runs and platforms.
pklfiles = sorted(f for f in os.listdir(processed_dir) if f.endswith('.pkl'))

# Channel identification rules: channel name -> regex alternation.
# identify_channel() applies these with re.search to text that has had all
# spaces removed and been lowercased, so every alternative matches as a plain
# substring anywhere in the string; none of the patterns use word boundaries.
channel_keywords = {
    "CNN": r"cnn|imagenscnn",  # Substring match; 'imagenscnn' is already covered by 'cnn'
    "RTP3": r"=3",  # NOTE(review): presumably how the RTP3 on-screen mark shows up in the extracted text -- confirm
    "SIC Noticias": r"sic|debates",     # Substring match: 'sic' also hits inside longer strings (e.g. 'musica')
    "TVI": r"frenteafrente|tvi|decisao24|decisao|@vi|deusio24"  # '@vi' / 'deusio24' look like misread variants of 'tvi' / 'decisao24' -- TODO confirm
}

def identify_channel(data, thresholds, keywords=None):
    """Guess which TV channel a video comes from, based on its per-frame text.

    Every string of every frame is stripped of spaces, lowercased and searched
    with each channel's regex. Each hit adds one to that channel's count, and
    as soon as a count strictly exceeds the channel's threshold that channel
    is returned. If no threshold is exceeded, the channel with the most hits
    wins (ties go to the channel that got its first hit earliest). When
    nothing matches at all, "SIC Noticias" is returned as the default.

    Args:
        data: DataFrame with a 'text' column. Each cell is expected to hold an
            iterable of strings; a bare string is treated as a single entry,
            and non-iterable cells or non-string entries (e.g. None/NaN) are
            skipped.
        thresholds: Mapping of channel name to the hit count that must be
            exceeded for an early return. Channels missing from the mapping
            never trigger an early return.
        keywords: Optional mapping of channel name to regex pattern. Defaults
            to the module-level ``channel_keywords``.

    Returns:
        The name of the identified channel.
    """
    if keywords is None:
        keywords = channel_keywords

    # Compile each pattern once per call instead of once per scanned string.
    patterns = [(channel, re.compile(pattern)) for channel, pattern in keywords.items()]
    no_limit = float("inf")
    channel_counts = defaultdict(int)

    # Iterate the column directly; an empty frame has nothing to scan (and may
    # not even carry a 'text' column).
    frames = data["text"] if len(data) else ()
    for texts in frames:
        if isinstance(texts, str):
            # Treat a lone string as one entry rather than iterating its characters.
            texts = (texts,)
        elif not hasattr(texts, "__iter__"):
            continue  # Missing cell such as None or NaN

        for text in texts:
            if not isinstance(text, str):
                continue
            text_cleaned = text.replace(" ", "").lower()

            for channel, pattern in patterns:
                if pattern.search(text_cleaned):
                    channel_counts[channel] += 1
                    if channel_counts[channel] > thresholds.get(channel, no_limit):
                        return channel

    # No channel exceeded its threshold: fall back to the most frequent one.
    # Entries only exist for channels with at least one hit.
    if channel_counts:
        return max(channel_counts, key=channel_counts.get)
    return "SIC Noticias"

# Early-exit thresholds per channel: identify_channel() returns a channel as
# soon as its hit count strictly exceeds the value given here (e.g. on the
# 51st hit for a threshold of 50).
# NOTE(review): the values are described as empirical/heuristic; how they were
# derived is not recorded in this file.
thresholds = {
    "CNN": 200,
    "RTP3": 500,
    "SIC Noticias": 50,
    "TVI": 50
}

# Classify each pickle file and print one "Video <file>: <channel>" line.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only run this on .pkl files that come from a trusted source.
for file in pklfiles:
    data_path = os.path.join(processed_dir, file)
    data = pd.read_pickle(data_path)    
    channel = identify_channel(data, thresholds)
    print(f"Video {file}: {channel}")


Video ad-be.pkl: SIC Noticias
Video ad-cdu.pkl: RTP3
Video ad-chega.pkl: RTP3
Video ad-il.pkl: SIC Noticias
Video ad-livre.pkl: TVI
Video ad-pan.pkl: SIC Noticias
Video ad-ps.pkl: SIC Noticias
Video be-ps.pkl: RTP3
Video cdu-be.pkl: SIC Noticias
Video cdu-ps.pkl: SIC Noticias
Video chega-be.pkl: RTP3
Video chega-cdu.pkl: TVI
Video chega-il.pkl: SIC Noticias
Video chega-ps.pkl: TVI
Video il-be.pkl: CNN
Video il-cdu.pkl: RTP3
Video il-ps.pkl: SIC Noticias
Video livre-be.pkl: SIC Noticias
Video livre-cdu.pkl: CNN
Video livre-chega.pkl: SIC Noticias
Video livre-il.pkl: TVI
Video livre-ps.pkl: RTP3
Video pan-be.pkl: TVI
Video pan-cdu.pkl: RTP3
Video pan-chega.pkl: RTP3
Video pan-il.pkl: SIC Noticias
Video pan-livre.pkl: RTP3
Video pan-ps.pkl: TVI
