# Digital Methods - Data Collection
_____

**Exam numbers:**

**Date:** 09.05.2024

_____

## Table of Contents

1. [Libraries](#libraries)
2. [Youtube Search API](#youtube-search-api)
3. [Filter Videos by Keyword](#filter-videos-by-keyword)
4. [Youtube API](#youtube-api)

_____

## Libraries

All libraries needed to run the code are listed here. If any of them is not installed, use `!pip install <package>` to install it.

In [None]:
# import packages
import googleapiclient.discovery
import pandas as pd
from googleapiclient.discovery import build
import os  

#import script
from api_key import api_key_1, api_key_2, api_key_3, api_key_4

In [None]:
# Settings for the YouTube Data API client.
# Replace the placeholder key below with your own API key before running.
api_service_name, api_version = "youtube", "v3"
DEVELOPER_KEY = "YOUR_API_KEY"

In [None]:
# Full list of YouTube channel ids covered by the study
# (the search cell further down assigns its own, shorter channel_ids list).
channel_ids = [
    'UC3e7Z56naX4KTrs9l7nSyWw',
    'UCKgJEs_v0JB-6jWb8lIy9Xw',
    'UC4T4vA6MTWS2QE2C6o8Sevw',
    'UCL_f53ZEJxp8TtlOkHwMV9Q',
    'UCprclkVrNPls7PR-nHhf1Ow',
    'UCfZS_wFmJCXqPr4MYtAIN6w',
    'UCJIfeSCssxSC_Dhc5s7woww',
    'UCx6h-dWzJ5NpAlja1YsApdg',
    'UCtdbWsnfA08KhSUO4amVLaQ',
    'UCzQUP1qoWDoEbmsQxvdjxgQ',
    'UCZWlSUNDvCCS1hBiXV0zKcA',
    'UC0uVZd8N7FfIZnPu0y7o95A',
    'UCx8NKvG7RPO3FDL_hvl9Aaw',
    'UCoJhK5kMc4LjBKdiYrDtzlA',
    'UCPsCJ1j0G45FnRGqJhCHLiA',
]

In [None]:
# Build the client object through which the cells below send their
# YouTube Data API requests.
youtube = googleapiclient.discovery.build(
    serviceName=api_service_name,
    version=api_version,
    developerKey=DEVELOPER_KEY,
)

## Youtube Search API

You can find the documentation for the Youtube Data API here: https://developers.google.com/youtube/v3

In [None]:
# Channels to search in this run
channel_ids = ['UCL_f53ZEJxp8TtlOkHwMV9Q']

# Search terms; one search request is sent per (channel, term) pair.
# NOTE(review): 'antartica', 'galciers' and 'artic' look misspelled. They are
# left untouched because they determine which videos are collected — confirm
# before correcting them.
queries = ['climate', 'climate change', 'climate crisis', 'climate emergency',
           'greenhouse', 'greenhouse gas', 'green energy', 'environment',
           'extreme weather', 'heat', 'cooling', 'hurricane', 'floods', 'ice age',
           'wildfire', 'drought', 'temperature', 'warming', 'greta', 'greta thunberg',
           'fossil fuel', 'energy', 'renewable', 'fracking', 'co2', 'carbon dioxide',
           'pollution', 'net emissions', 'net zero', 'alarmism', 'biodiversity',
           'extinction', 'antartica', 'greenland', 'galciers', 'artic', '97 percent', 'cop',
           'natural cycles', 'sea level', 'climate lockdown']

# Column order of the resulting DataFrame. Passing it explicitly keeps the
# columns in place even when no video was found at all.
video_columns = ["title", "video_id", "published_at", "channel_name",
                 "description", "comment_count", "like_count", "category"]

# One dict per collected video, plus the ids already stored so that a video
# returned by several queries is looked up (and stored) only once.
video_data = []
seen_video_ids = set()

for channel_id in channel_ids:
    for query in queries:
        # Only the first result page (at most 50 hits) is requested per query.
        search_response = youtube.search().list(
            q=query,
            channelId=channel_id,
            type='video',
            part='id,snippet',
            maxResults=50,
            publishedAfter='2023-01-01T00:00:00Z',  # lower bound: 1 January 2023
        ).execute()

        # Search snippets of the hits that have not been collected yet, keyed
        # by video id. Dicts keep insertion order, so the order of the search
        # results is preserved.
        new_hits = {}
        for search_result in search_response.get('items', []):
            if search_result['id']['kind'] != 'youtube#video':
                continue
            video_id = search_result['id']['videoId']
            if video_id not in seen_video_ids:
                new_hits[video_id] = search_result['snippet']

        if not new_hits:
            continue

        # One batched details request per result page (ids are passed as a
        # comma-separated list) instead of one request per video.
        video_response = youtube.videos().list(
            part="snippet,statistics",
            id=",".join(new_hits),
        ).execute()
        details_by_id = {
            video['id']: video for video in video_response.get('items', [])
        }

        for video_id, search_snippet in new_hits.items():
            details = details_by_id.get(video_id)
            if details is None:
                # The details request returned nothing for this id; skip the
                # hit instead of failing on a missing list element.
                continue

            statistics = details['statistics']
            video_data.append({
                "title": search_snippet['title'],
                "video_id": video_id,
                "published_at": search_snippet['publishedAt'],
                "channel_name": search_snippet['channelTitle'],
                "description": details['snippet']['description'],
                "comment_count": statistics.get('commentCount', 0),
                "like_count": statistics.get('likeCount', 0),
                "category": details['snippet'].get('categoryId', 'Not available'),
            })
            seen_video_ids.add(video_id)

# Create a DataFrame from the list of video information
df = pd.DataFrame(video_data, columns=video_columns)


In [None]:
# A video can be returned by several search terms; keep only the first row
# for each video id.
df.drop_duplicates(subset=['video_id'], keep='first', inplace=True)

In [None]:
# Save the search results as a CSV file.
# NOTE(review): the path is an empty placeholder — set it to the target file.
df.to_csv(path_or_buf='')

## Filter videos by keyword

This section filters the videos a second time, using a list of reference keywords. The first query relies on YouTube's own search algorithm and is therefore only a first step towards organising the videos by the topic of interest.

In [None]:
# Load the saved search results. The file was written together with its row
# index, so read the first column back as the index (as the later load cell
# does) instead of letting it turn into an extra "Unnamed: 0" column.
# NOTE(review): the path is an empty placeholder — set it to the saved file.
df = pd.read_csv('', index_col=0)

In [None]:
# Reference phrases for the second, keyword-based filtering pass.
# All entries are lowercase; titles and descriptions are lowercased before
# they are compared against this list.
# Matching is done by substring, so the bare entry "climate" already matches
# every text that contains one of the longer "climate ..." phrases.
# NOTE(review): the list contains repeated entries (e.g. "climate science",
# "global boiling") and a few that look misspelled (e.g. "wildire",
# "doomdsday propaganda", "climate alamists") — confirm whether intended.
reference_words = [
    "climate", "climate bill", "climate schemes", "climate emergency hoax", "climate policies", "climate legislation",
    "climate protesters", "climate misconception", "climate change debate", "climate catastrophists",
    "climate wars", "climate realist", "climate change panic", "climate hypocrite", "climate hysteria", "climate fundamentalism",
    "climate change religion", "climate questions", "climate activists", "climate change myths", "climate truths",
    "climate change discussion", "climate agenda", "climate madness", "climate change victims", "climate alamists",
    "climate cult", "climate bills", "climate theology", "climate insanity", "climate science", "climate change narratives",
    "climate campaign", "climate globalists", "climate warrior", "climate 'experts'", "climate change agenda",
    "climate ideology", "climate catastrophe", "climate hysterics", "climate impact", "green hypocrisy", "green policies",
    "environmental vandals", "net zero nonsense", "net zero", "green propaganda", "greenflation", "green apocalypse",
    "doomdsday propaganda", "environmental movement", "green boondoggles", "radical climatiers", "green hysteria",
    "heatwave hysteria", "climate crisis", "climate reparations",
    "global boiling", "greta's doomsday", "eco-fascism", "greta thumberg clownshow", "globalist lie climate lockdowns",
    "cop26", "green economy", "the end of snow", "sea level check", "tipping point", "denialism", "hurricanes", "eco-colonialism", "glaciers", "alarmism", "climate alarmists",
    "climate debate", "extinction rebellion", "climate science", "energy catastrophe", "climate fear", "climate cult", 
    "environmentalist", "carbon tax", "clean energy regulations",
    "wild fires", "emissions reduction plan", "climate fantasy", "electric cars", "low carbon diet", "heat waves",
    "clean environment", "save the planet", "climate emergency", "global boiling",
    "climate activists", "climate executive order", "religion of climate change", "wildire", "climate disaster",
    "climate change confessionals", "save the environment", "global warming", "'inflation reduction act'",
    "solar panel", "liquid natural gas ban", "environmental justice", "climate doom-mongers",
    "climate apocalypse", "climate lockdown", "global boiling", "net zero con", "green movement", "green crackdown",
    "green war", "environmentalism", "green dogma", "'climate crisis'", "greta thunberg",
    "climate science", "climate con", "save the planet", "climate collusion", "climate apocalypse",
    "climate change apocalypse", "natural climate change", "climate realist", "climate hysteria", "greenhouse gas",
    "climate change panic", "climate change alarmism", "co2", "climate credibility", "climate collapse", "climate tyranny",
    "climate claims", "climate amnesia", "climate change controversy", "climate reality", "climate change effects",
    "climate change protestors", "climate change agenda", "climate cult", "rising sea levels",
    "climate change anxiety", "climate summit", "green police", "unprecedented weather conditions", "climate reparations",
    "climate narratives", "energy markets", "unreliable renewables", "renewable energy", "real environmentalism",
    "climate change alarmism", "hurricanes worse", "climate change activists",
    "religion of green", "climate activist", "global warming", "fossil fuels", "climate move", "climate terrorists",
    "climate lie", "great reset", "climate hypocrisy", "eco terrorism", "cloud seeding", "climate control", "green energy"
]


# Function to check if any reference word appears in the text
def contains_reference(text, words=None):
    """Return True if any reference phrase occurs as a substring of *text*.

    Args:
        text: Text to scan. Matching is case-sensitive, so callers are
            expected to lowercase the text first. Anything that is not a
            string (for example the NaN pandas produces for an empty CSV
            cell) never matches.
        words: Optional iterable of phrases to look for. Defaults to the
            module-level ``reference_words`` list.

    Returns:
        bool: True when at least one phrase is contained in *text*.
    """
    # Missing values arrive as float NaN / None; `in` would raise on them.
    if not isinstance(text, str):
        return False
    if words is None:
        words = reference_words
    return any(word in text for word in words)

# Keep only the videos whose title or description mentions a reference phrase.
# Missing or non-string values (e.g. an empty description that comes back from
# the CSV as NaN) are treated as empty text instead of crashing on `.lower()`.
title_lower = df['title'].fillna('').astype(str).str.lower()
description_lower = df['description'].fillna('').astype(str).str.lower()

# Build the row mask in one pass rather than dropping rows from the frame
# while iterating over it.
keep_mask = pd.Series(
    [
        contains_reference(title) or contains_reference(description)
        for title, description in zip(title_lower, description_lower)
    ],
    index=df.index,
    dtype=bool,
)

# Select the matching rows and renumber them from 0.
df = df.loc[keep_mask].reset_index(drop=True)


In [None]:
# Make sure the comment counts are integers so they can be summed
df = df.astype({'comment_count': int})


In [None]:
# Total number of comments across all remaining videos
# (vectorised column sum instead of a Python-level loop over the values)
df['comment_count'].sum()

In [None]:
# Fetch the video categories defined for the US region
categories_response = youtube.videoCategories().list(
    part="snippet",
    regionCode="US",
).execute()

# Map each category id to its readable title
category_map = {
    category['id']: category['snippet']['title']
    for category in categories_response['items']
}

# Replace the category ids in the DataFrame by their names.
# The ids are compared as strings: after the CSV round-trip pandas may have
# parsed them as integers, which would never equal the keys taken from the
# API response and would turn every row into "Unknown".
df['category'] = df['category'].astype(str).map(category_map).fillna("Unknown")

In [None]:
# Save the filtered videos as a CSV file.
# NOTE(review): the path is an empty placeholder — set it to the target file.
df.to_csv(path_or_buf='')

## Youtube API

This section collects the YouTube comments of every video related to the topic of climate change. To do so, we take the `video_id` of each row in the DataFrame created above and request the comments for that video.

In [None]:
# Load the saved table of climate-change-related videos; the first CSV column
# holds the row index written when the file was saved.
# NOTE(review): the path is an empty placeholder — set it to the saved file.
df = pd.read_csv(filepath_or_buffer='', index_col=0)


In [None]:
# Collect the top-level comments of every video listed in df.

# Page size per request; the API documents 1-100 as the accepted range.
COMMENTS_PER_PAGE = 100
# Upper bound of comments stored per video. 730 is the number this cell
# previously passed as maxResults; it is kept here as the per-video cap.
MAX_COMMENTS_PER_VIDEO = 730

comment_columns = ["video_id", "published_at", "like_count", "text", "author"]

# One dict per collected comment
all_comments_data = []

# NOTE(review): a video with comments disabled presumably makes the request
# raise an HTTP error, which would abort the whole loop — consider catching
# googleapiclient.errors.HttpError per video if that occurs in your data.
for video_id in df['video_id']:
    collected = 0
    request = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        textFormat="plainText",
        maxResults=COMMENTS_PER_PAGE,
    )

    # Follow the result pages until the cap is reached or no page is left
    # (list_next returns None after the last page).
    while request is not None and collected < MAX_COMMENTS_PER_VIDEO:
        comments_response = request.execute()
        remaining = MAX_COMMENTS_PER_VIDEO - collected

        for comment_thread in comments_response.get('items', [])[:remaining]:
            comment_snippet = comment_thread['snippet']['topLevelComment']['snippet']
            all_comments_data.append({
                "video_id": video_id,
                "published_at": comment_snippet['publishedAt'],
                "like_count": comment_snippet['likeCount'],
                "text": comment_snippet['textDisplay'],
                "author": comment_snippet['authorDisplayName'],
            })
            collected += 1

        request = youtube.commentThreads().list_next(request, comments_response)

# Create a DataFrame from the list of comments; the explicit column list keeps
# the columns in place even when no comment was collected.
comments_df = pd.DataFrame(all_comments_data, columns=comment_columns)


In [None]:
# Save the collected comments as a CSV file.
# NOTE(review): the path is an empty placeholder — set it to the target file.
comments_df.to_csv(path_or_buf='')