In [None]:
import requests
import pandas as pd
import deepl
from datetime import datetime, timedelta
import numpy as np
from google.colab import files
from tqdm.notebook import tqdm
import time
tqdm.pandas()

In [None]:
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi

In [None]:
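# API key was removed before sharing: assign your YouTube Data API key to `api_key` in this cell (e.g. read it from an environment variable) before running the cells below.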

In [None]:
# Build the YouTube Data API v3 client; requires `api_key` to be defined beforehand.
youtube = build("youtube", "v3", developerKey=api_key)

In [None]:
# Channel names (here a YouTube "@handle") to resolve into channel IDs
CHANNEL_NAMES = ["@RPPNoticias"]


# Defining function that fetches channel IDs
def get_channel_ids(api_key, channel_names, timeout=30):
    """Resolve channel names to YouTube channel IDs via the Data API search endpoint.

    One search request (``type=channel``) is sent per name and the ID of the
    first hit is used.

    Args:
        api_key: YouTube Data API key, sent as the ``key`` query parameter.
        channel_names: Iterable of channel names/handles to search for.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises a timeout error. Without a timeout a stalled connection
            would block the notebook indefinitely.

    Returns:
        dict mapping each name to the channel ID of the first search hit, or
        to the string ``"Not Found"`` when the response has no items.
        NOTE(review): an API error payload also has no ``items`` and is
        therefore reported as ``"Not Found"`` as well.
    """
    base_url = "https://www.googleapis.com/youtube/v3/search"
    results = {}

    for name in channel_names:
        params = {
            "part": "snippet",
            "q": name,
            "type": "channel",
            "key": api_key
        }

        response = requests.get(base_url, params=params, timeout=timeout).json()

        # Missing or empty "items" both mean there is no usable hit.
        items = response.get("items") or []
        results[name] = items[0]["id"]["channelId"] if items else "Not Found"

    return results



# Fetch Channel IDs
# The result maps every entry of CHANNEL_NAMES to its channel ID (or "Not Found").
channel_ids = get_channel_ids(api_key, CHANNEL_NAMES)

# Print one "<name> - Channel ID: <id>" line per entry
for name, channel_id in channel_ids.items():
    print(f"{name} - Channel ID: {channel_id}")


@RPPNoticias - Channel ID: UC5j8-2FT0ZMMBkmK72R4aeA


# Collecting videos for RPP Noticias


In [None]:
# Build a DataFrame with one (start, end) timestamp pair per calendar month

# Collection window: periods are generated while their start lies before end_date
start_date = datetime(2019, 2, 1)
end_date = datetime(2025, 3, 1)

# Timestamps are rendered with a fixed -05:00 offset suffix
stamp_format = "%Y-%m-%dT%H:%M:%S-05:00"

month_rows = []
period_start = start_date
while period_start < end_date:
    # Day 28 exists in every month and 28 + 4 days always falls in the next
    # month, so snapping that date back to day 1 gives the next month's start.
    following_month_start = (period_start.replace(day=28) + timedelta(days=4)).replace(day=1)

    # The period ends one second before the following month begins
    period_end = following_month_start - timedelta(seconds=1)

    month_rows.append([
        period_start.strftime(stamp_format),
        period_end.strftime(stamp_format),
    ])

    period_start = following_month_start


date_ranges = pd.DataFrame(month_rows, columns=["start", "end"])

date_ranges

Unnamed: 0,start,end
0,2019-02-01T00:00:00-05:00,2019-02-28T23:59:59-05:00
1,2019-03-01T00:00:00-05:00,2019-03-31T23:59:59-05:00
2,2019-04-01T00:00:00-05:00,2019-04-30T23:59:59-05:00
3,2019-05-01T00:00:00-05:00,2019-05-31T23:59:59-05:00
4,2019-06-01T00:00:00-05:00,2019-06-30T23:59:59-05:00
...,...,...
68,2024-10-01T00:00:00-05:00,2024-10-31T23:59:59-05:00
69,2024-11-01T00:00:00-05:00,2024-11-30T23:59:59-05:00
70,2024-12-01T00:00:00-05:00,2024-12-31T23:59:59-05:00
71,2025-01-01T00:00:00-05:00,2025-01-31T23:59:59-05:00


In [None]:
# new function
# Function for getting video info for one channel id, one keyword, one time frame
# Returns the list of videos found and their number, and prints that number
#

def get_youtube_videos(api_key, channel_id, keyword, max_results=50, order="date",
                       published_after=None, published_before=None):
    """Search one channel for videos matching ``keyword`` within an optional time frame.

    The request is made through the module-level ``youtube`` client, which must
    already exist. Only a single ``search().list`` request is issued, so at
    most one page of results is returned.
    NOTE(review): the API is documented to cap ``maxResults`` at 50 - confirm
    before passing larger values.

    Args:
        api_key: Not used by this function (the ``youtube`` client already
            carries the key); kept so existing calls keep working.
        channel_id: ID of the channel to search in.
        keyword: Search query string (``q`` parameter).
        max_results: Value sent as ``maxResults``.
        order: Value sent as ``order``.
        published_after: Optional lower bound, sent as ``publishedAfter``
            when truthy.
        published_before: Optional upper bound, sent as ``publishedBefore``
            when truthy.

    Returns:
        Tuple ``(videos, count)`` where ``videos`` is a list of dicts with the
        keys ``video_id``, ``title``, ``url``, ``channel``, ``keyword`` and
        ``video_date``, and ``count`` is ``len(videos)``.
    """
    # Prepare API request parameters
    params = {
        "part": "snippet",
        "channelId": channel_id,
        "q": keyword,
        "type": "video",
        "maxResults": max_results,
        "order": order,
    }

    if published_after:
        params["publishedAfter"] = published_after

    if published_before:
        params["publishedBefore"] = published_before

    # Execute API request
    response = youtube.search().list(**params).execute()

    # Extract video details
    videos = []
    for item in response.get("items", []):
        video_id = item["id"]["videoId"]
        snippet = item["snippet"]
        videos.append({
            "video_id": video_id,
            "title": snippet["title"],
            "url": f"https://www.youtube.com/watch?v={video_id}",
            "channel": channel_id,
            "keyword": keyword,
            "video_date": snippet["publishedAt"]})

    # The bounds default to None; concatenating None to a str raised TypeError,
    # so substitute a placeholder for a missing bound before printing.
    after_label = "(unbounded)" if published_after is None else published_after
    before_label = "(unbounded)" if published_before is None else published_before
    print(f"Number of videos found between {after_label} and {before_label}: {len(videos)}")

    return videos, len(videos)

In [None]:
# applying new function
# Loop over every monthly time frame, fetch its videos, and collect both the
# per-period video lists and the per-period video counts.

channel_id = "UC5j8-2FT0ZMMBkmK72R4aeA" # RPP Noticias
max_results = 50
order = "date"
keyword = "robo|robó|roban|robaron|delincuentes|criminales|asalto|asaltó|asaltan|asaltaron|matan|mató|mataron|ladrón|asesinato|asesinado|asesinan|asesinaron|extorsionadores|extorsionador|extorsionan|extorsionaron|extorsionó|secuestro|secuestradores|secuestrada|secuestrado|secuestró|secuestraron|balacera|balean|balearon|balazos|sicariato|mafias|mafia|sicarios"

# NOTE(review): `df` is not read by this cell; kept in case other cells rely on it.
df = pd.DataFrame(columns=['video_id','title','url','channel','keyword','video_date'])
videos_list = []        # one list of video dicts per period
videos_count_list = []  # number of videos per period, aligned with date_ranges rows

for published_after, published_before in zip(date_ranges['start'], date_ranges['end']):
    # Unpack the (videos, count) tuple directly: binding it to a variable named
    # `all` would shadow the builtin all() for the rest of the kernel session.
    videos, video_count = get_youtube_videos(api_key, channel_id, keyword, max_results,
                                             order, published_after, published_before)
    videos_list.append(videos)
    videos_count_list.append(video_count)


Number of videos found between 2019-02-01T00:00:00-05:00 and 2019-02-28T23:59:59-05:00: 22
Number of videos found between 2019-03-01T00:00:00-05:00 and 2019-03-31T23:59:59-05:00: 15
Number of videos found between 2019-04-01T00:00:00-05:00 and 2019-04-30T23:59:59-05:00: 10
Number of videos found between 2019-05-01T00:00:00-05:00 and 2019-05-31T23:59:59-05:00: 32
Number of videos found between 2019-06-01T00:00:00-05:00 and 2019-06-30T23:59:59-05:00: 22
Number of videos found between 2019-07-01T00:00:00-05:00 and 2019-07-31T23:59:59-05:00: 16
Number of videos found between 2019-08-01T00:00:00-05:00 and 2019-08-31T23:59:59-05:00: 19
Number of videos found between 2019-09-01T00:00:00-05:00 and 2019-09-30T23:59:59-05:00: 17
Number of videos found between 2019-10-01T00:00:00-05:00 and 2019-10-31T23:59:59-05:00: 19
Number of videos found between 2019-11-01T00:00:00-05:00 and 2019-11-30T23:59:59-05:00: 14
Number of videos found between 2019-12-01T00:00:00-05:00 and 2019-12-31T23:59:59-05:00: 32

In [None]:
# Flatten the per-month lists into a single list of video records. A plain
# comprehension also works when videos_list is empty, where np.concatenate
# raises ValueError.
flattened_list = [video for period_videos in videos_list for video in period_videos]

# Naming the columns explicitly keeps the frame well-formed (and 'keyword'
# droppable) even when no videos were found at all.
df_R_1 = pd.DataFrame(
    flattened_list,
    columns=['video_id', 'title', 'url', 'channel', 'keyword', 'video_date'],
).drop(columns=['keyword'])

# Record how many videos each monthly period returned
date_ranges['videos_count'] = videos_count_list

In [None]:
# Display the videos collected from the monthly periods
df_R_1

In [None]:
# Splitting saturated months into weekly periods.
# Only months whose search returned exactly 50 videos are split (50 is the
# max_results value used for the monthly requests, so those months presumably
# hit the per-request cap and may be incomplete).
split_periods = []  # NOTE(review): not read by this cell; kept in case other cells rely on it.

df_filtered = date_ranges[date_ranges["videos_count"] == 50].copy()


# Convert to datetime format
df_filtered["start"] = pd.to_datetime(df_filtered["start"])
df_filtered["end"] = pd.to_datetime(df_filtered["end"])

timestamp_format = "%Y-%m-%dT%H:%M:%S-05:00"
one_second = timedelta(seconds=1)

# Create a new list to store weekly periods
weekly_periods = []

# Loop through each full-month period and split into weeks.
# Dedicated loop names are used so the global start_date / end_date /
# current_date from the date-range cell are not overwritten.
for month_start, month_end in zip(df_filtered["start"], df_filtered["end"]):
    week_start = month_start

    while week_start < month_end:
        # Inclusive end: last second of the 7th day, clipped to the month's own
        # end so the final (short) week still ends at 23:59:59, not 23:59:58.
        week_end = min(week_start + timedelta(days=7) - one_second, month_end)

        weekly_periods.append({
            "start": week_start.strftime(timestamp_format),
            "end": week_end.strftime(timestamp_format),
        })

        week_start = week_end + one_second  # Move to the next week

# Explicit columns keep the frame usable even when no month needed splitting
week_ranges = pd.DataFrame(weekly_periods, columns=["start", "end"])

# Display the result
print(week_ranges)

                        start                        end
0   2020-01-01T00:00:00-05:00  2020-01-07T23:59:59-05:00
1   2020-01-08T00:00:00-05:00  2020-01-14T23:59:59-05:00
2   2020-01-15T00:00:00-05:00  2020-01-21T23:59:59-05:00
3   2020-01-22T00:00:00-05:00  2020-01-28T23:59:59-05:00
4   2020-01-29T00:00:00-05:00  2020-01-31T23:59:58-05:00
5   2023-10-01T00:00:00-05:00  2023-10-07T23:59:59-05:00
6   2023-10-08T00:00:00-05:00  2023-10-14T23:59:59-05:00
7   2023-10-15T00:00:00-05:00  2023-10-21T23:59:59-05:00
8   2023-10-22T00:00:00-05:00  2023-10-28T23:59:59-05:00
9   2023-10-29T00:00:00-05:00  2023-10-31T23:59:58-05:00
10  2024-01-01T00:00:00-05:00  2024-01-07T23:59:59-05:00
11  2024-01-08T00:00:00-05:00  2024-01-14T23:59:59-05:00
12  2024-01-15T00:00:00-05:00  2024-01-21T23:59:59-05:00
13  2024-01-22T00:00:00-05:00  2024-01-28T23:59:59-05:00
14  2024-01-29T00:00:00-05:00  2024-01-31T23:59:58-05:00


In [None]:
# Going through week_ranges
# Loop over every weekly time frame, fetch its videos, and collect both the
# per-period video lists and the per-period video counts.

channel_id = "UC5j8-2FT0ZMMBkmK72R4aeA" # RPP Noticias
max_results = 50
order = "date"
keyword = "robo|robó|roban|robaron|delincuentes|criminales|asalto|asaltó|asaltan|asaltaron|matan|mató|mataron|ladrón|asesinato|asesinado|asesinan|asesinaron|extorsionadores|extorsionador|extorsionan|extorsionaron|extorsionó|secuestro|secuestradores|secuestrada|secuestrado|secuestró|secuestraron|balacera|balean|balearon|balazos|sicariato|mafias|mafia|sicarios"

# NOTE(review): `df` is not read by this cell; kept in case other cells rely on it.
df = pd.DataFrame(columns=['video_id','title','url','channel','keyword','video_date'])
videos_list = []        # one list of video dicts per period
videos_count_list = []  # number of videos per period, aligned with week_ranges rows

for published_after, published_before in zip(week_ranges['start'], week_ranges['end']):
    # Unpack the (videos, count) tuple directly: binding it to a variable named
    # `all` would shadow the builtin all() for the rest of the kernel session.
    videos, video_count = get_youtube_videos(api_key, channel_id, keyword, max_results,
                                             order, published_after, published_before)
    videos_list.append(videos)
    videos_count_list.append(video_count)

Number of videos found between 2020-01-01T00:00:00-05:00 and 2020-01-07T23:59:59-05:00: 11
Number of videos found between 2020-01-08T00:00:00-05:00 and 2020-01-14T23:59:59-05:00: 16
Number of videos found between 2020-01-15T00:00:00-05:00 and 2020-01-21T23:59:59-05:00: 7
Number of videos found between 2020-01-22T00:00:00-05:00 and 2020-01-28T23:59:59-05:00: 18
Number of videos found between 2020-01-29T00:00:00-05:00 and 2020-01-31T23:59:58-05:00: 3
Number of videos found between 2023-10-01T00:00:00-05:00 and 2023-10-07T23:59:59-05:00: 7
Number of videos found between 2023-10-08T00:00:00-05:00 and 2023-10-14T23:59:59-05:00: 21
Number of videos found between 2023-10-15T00:00:00-05:00 and 2023-10-21T23:59:59-05:00: 11
Number of videos found between 2023-10-22T00:00:00-05:00 and 2023-10-28T23:59:59-05:00: 7
Number of videos found between 2023-10-29T00:00:00-05:00 and 2023-10-31T23:59:58-05:00: 5
Number of videos found between 2024-01-01T00:00:00-05:00 and 2024-01-07T23:59:59-05:00: 5
Numbe

In [None]:
# Flatten the per-week lists into a single list of video records. A plain
# comprehension also works when videos_list is empty, where np.concatenate
# raises ValueError.
flattened_list = [video for period_videos in videos_list for video in period_videos]

# Naming the columns explicitly keeps the frame well-formed (and 'keyword'
# droppable) even when no videos were found at all.
df_R_2 = pd.DataFrame(
    flattened_list,
    columns=['video_id', 'title', 'url', 'channel', 'keyword', 'video_date'],
).drop(columns=['keyword'])

# Record how many videos each weekly period returned
week_ranges['videos_count'] = videos_count_list

In [None]:
# Stack the weekly results (df_R_2) on top of the monthly results (df_R_1) with a fresh 0..n-1 index
df_merged = pd.concat([df_R_2, df_R_1], ignore_index=True, sort=False)

In [None]:
# Keep only the first row for each video_id, then display the de-duplicated frame
df_merged = df_merged.drop_duplicates(subset="video_id", keep="first")
df_merged

In [None]:
# Save the video list; with the default settings the DataFrame index is written as the first, unnamed CSV column
df_merged.to_csv("videos_titles_RPPNoticias.csv")

# Transcripts

In [None]:
# transcript function that also stores the error messages

def get_transcript(video_id, language='es'):
    """Return a video's transcript as one space-joined string.

    Any exception raised while fetching or joining the transcript is caught
    and returned as the string ``"Error: <message>"`` instead of being raised.

    Args:
        video_id: ID of the YouTube video.
        language: Language code requested from the transcript API.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        texts = (segment['text'] for segment in segments)
        return " ".join(texts)
    except Exception as exc:
        return f"Error: {exc}"

In [None]:
# Fetch a transcript (or an "Error: ..." message) for every video in df_merged
# and store it in the 'transcript' column.

# total is taken from the frame itself so the progress bar matches the number
# of rows actually processed (it was previously hard-coded to 1500).
for index, row in tqdm(df_merged.iterrows(), total=len(df_merged), desc="Retrying transcripts"):
    time.sleep(2)  # pause between requests - presumably to avoid being rate limited
    df_merged.at[index, 'transcript'] = get_transcript(row['video_id'])


In [None]:
# Display the frame including the new 'transcript' column
df_merged

In [None]:
# Save videos plus transcripts; with the default settings the DataFrame index is written as the first, unnamed CSV column
df_merged.to_csv("transcripts_RPPNoticias.csv")

In [None]:
# Reload the saved transcripts. The file was written with its index as the
# first column, so read that column back as the index instead of letting it
# appear as a spurious "Unnamed: 0" data column.
df_merged = pd.read_csv("transcripts_RPPNoticias.csv", index_col=0)

In [None]:
# Count the rows whose transcript cell starts with "Error" (missing values count as non-errors)
is_error = df_merged['transcript'].str.startswith("Error", na=False)
error_count = int(is_error.sum())
print(f"Total transcripts with errors: {error_count}")

Total transcripts with errors: 100


In [None]:
# Break the failures down by message type: (pattern searched in the transcript
# cell, case-insensitively; label used in the printed summary line)
error_patterns = [
    ('age-restricted', "age-restricted"),
    ('Subtitles are disabled', "subtitles are disabled"),
    ('video is unplayable', "video is unplayable"),
    ('requested language codes:', "requested language codes: ['es']"),
]

for pattern, label in error_patterns:
    matches = df_merged['transcript'].str.contains(pattern, case=False, na=False)
    error_count = int(matches.sum())
    print(f"Total transcripts with {label} errors: {error_count}")

Total transcripts with age-restricted errors: 0
Total transcripts with subtitles are disabled errors: 97
Total transcripts with video is unplayable errors: 0
Total transcripts with requested language codes: ['es'] errors: 3
