# Digital Methods - Data Collection
_____

## Table of Contents

1. [Libraries](#libraries)
2. [Youtube Search API](#youtube-search-api)
3. [Filter Videos by Keyword](#filter-videos-by-keyword)
4. [Youtube API](#youtube-api)

_____

## Libraries

All libraries needed to run the code are listed here. Install them using the `requirements.txt` file.

The documentation can be found in the [README.md](README.md) file.

In [52]:
# import packages
import googleapiclient.discovery
import pandas as pd
from googleapiclient.discovery import build
import os  

# import the API key from a local script (create `api_key.py` defining `api_key_1`)
from api_key import api_key_1

In [53]:
# Settings used to build the YouTube Data API client

# ----- enter your values here -----
DEVELOPER_KEY = api_key_1  # key imported from the local api_key module
api_service_name, api_version = "youtube", "v3"

In [247]:
# Output locations -- fill these in before running the cells below
"""INSERT YOUR PATH HERE TO SAVE YOUR FILES"""
comment_file = ""             # CSV that receives the collected comments
directory_path_new_file = ""  # CSV that receives the keyword-filtered videos
directory_path = ""           # CSV that receives the raw search results

In [None]:
# Channels examined in this study (one YouTube channel ID per entry)
channel_ids = [
    "UC3e7Z56naX4KTrs9l7nSyWw",
    "UCKgJEs_v0JB-6jWb8lIy9Xw",
    "UC4T4vA6MTWS2QE2C6o8Sevw",
    "UCL_f53ZEJxp8TtlOkHwMV9Q",
    "UCprclkVrNPls7PR-nHhf1Ow",
    "UCfZS_wFmJCXqPr4MYtAIN6w",
    "UCJIfeSCssxSC_Dhc5s7woww",
    "UCx6h-dWzJ5NpAlja1YsApdg",
    "UCtdbWsnfA08KhSUO4amVLaQ",
    "UCzQUP1qoWDoEbmsQxvdjxgQ",
    "UCZWlSUNDvCCS1hBiXV0zKcA",
    "UC0uVZd8N7FfIZnPu0y7o95A",
    "UCoJhK5kMc4LjBKdiYrDtzlA",
    "UCPsCJ1j0G45FnRGqJhCHLiA",
]

In [36]:
# Build the client object used for every YouTube Data API request below
youtube = googleapiclient.discovery.build(
    serviceName=api_service_name,
    version=api_version,
    developerKey=DEVELOPER_KEY,
)

## Youtube Search API

You can find the documentation for the YouTube Data API here: https://developers.google.com/youtube/v3

In [37]:
# List of channel IDs you want to search on
channel_ids = ['']

# List of query terms
# NOTE: 'antarctica', 'glaciers' and 'arctic' were previously misspelled
# ('antartica', 'galciers', 'artic'), so the intended terms were never searched.
queries = ['climate', 'climate change', 'climate crisis', 'climate emergency',
           'greenhouse', 'greenhouse gas', 'green energy', 'environment',
           'extreme weather', 'heat', 'cooling', 'hurricane', 'floods', 'ice age',
           'wildfire', 'drought', 'temperature', 'warming', 'greta', 'greta thunberg',
           'fossil fuel', 'energy', 'renewable', 'fracking', 'co2', 'carbon dioxide',
           'pollution', 'net emissions', 'net zero', 'alarmism', 'biodiversity',
           'extinction', 'antarctica', 'greenland', 'glaciers', 'arctic', '97 percent', 'cop',
           'natural cycles', 'sea level', 'climate lockdown']

# Columns of the resulting DataFrame; passing them explicitly keeps the columns
# present even when no video was found, so later cells can still select them.
video_columns = ["title", "video_id", "published_at", "channel_name",
                 "description", "comment_count", "like_count", "category"]

# One dict per collected video
video_data = []

# IDs already collected: a video matched by several queries is looked up only once
seen_video_ids = set()

# Run every query against every channel
for channel_id in channel_ids:
    for query in queries:
        # Only the first result page (at most 50 videos) is requested per
        # channel/query pair; no page token is followed here.
        search_response = youtube.search().list(
            q=query,
            channelId=channel_id,
            type='video',
            part='id,snippet',
            maxResults=50,
            publishedAfter='2023-01-01T00:00:00Z'  # Ensure videos are published after January 1, 2023
        ).execute()

        for search_result in search_response.get('items', []):
            if search_result['id']['kind'] != 'youtube#video':
                continue

            video_id = search_result['id']['videoId']
            if video_id in seen_video_ids:
                # Already collected through an earlier query; skip the extra API call
                continue
            seen_video_ids.add(video_id)

            # The search result lacks the full description and the statistics,
            # so they are requested separately for each video.
            video_response = youtube.videos().list(
                part="snippet,statistics",
                id=video_id
            ).execute()

            video_items = video_response.get('items', [])
            if not video_items:
                # No details returned for this ID: skip it instead of
                # failing with an IndexError.
                continue

            details_snippet = video_items[0]['snippet']
            statistics = video_items[0].get('statistics', {})

            video_description = details_snippet['description']
            video_title = search_result['snippet']['title']
            published_at = search_result['snippet']['publishedAt']
            channel_name = search_result['snippet']['channelTitle']
            comment_count = statistics.get('commentCount', 0)
            like_count = statistics.get('likeCount', 0)
            category = details_snippet.get('categoryId', 'Not available')

            video_data.append({
                "title": video_title,
                "video_id": video_id,
                "published_at": published_at,
                "channel_name": channel_name,
                "description": video_description,
                "comment_count": comment_count,
                "like_count": like_count,
                "category": category
            })

# Create a DataFrame from the list of video information
df = pd.DataFrame(video_data, columns=video_columns)

In [38]:
# Keep only the first row for each video ID
df.drop_duplicates(subset=['video_id'], keep='first', inplace=True)

In [57]:
# save the search results as a CSV file
df.to_csv(path_or_buf=directory_path)

## Filter videos by keyword

This section filters the videos a second time, using a list of keywords. The first query relies on YouTube's search algorithm, so it is only a first step towards organising the videos by the topic of interest.

In [231]:
# Reload the saved search results; the first CSV column holds the index
df = pd.read_csv(filepath_or_buffer=directory_path, index_col=0)

In [233]:
# Cast the columns used below: comment counts to int, text columns to str
for column_name, target_type in (('comment_count', int), ('title', str), ('description', str)):
    df[column_name] = df[column_name].astype(target_type)

In [234]:
# Define reference list of words to check for
reference_words = [
    "climate", "green hypocrisy", "green policies", "green policy", "environmental", "net zero",
    "green propaganda", "greenflation", "green apocalypse", "green boondoggles", "green hysteria",
    "heatwave hysteria", "global boiling", "greta's doomsday", "gretas doomsday", "eco-fascism",
    "eco fascism", "greta thumberg", "cop26", "green economy", "the end of snow", "sea level check",
    "hurricanes", "eco-colonialism", "eco colonialism", "glaciers", "alarmism", "energy catastrophe",
    "carbon tax", "clean energy regulations", "wild fires", "wildfire", "emissions reduction plan",
    "low carbon diet", "heat waves", "clean environment", "save the planet", "global boiling",
    "save the environment", "global warming", "solar panel", "liquid natural gas ban",
    "environmental justice", "green movement", "green crackdown", "green dogma", "greta thunberg",
    "greenhouse gas", "CO2", "rising sea levels", "green police", "unprecedented weather conditions",
    "energy markets", "unreliable renewables", "renewable energy", "religion of green", "fossil fuel",
    "eco terrorism", "eco-terrorism", "cloud seeding", "green energy", "vandalizing famous paintings"
]


def contains_reference(text: str) -> bool:
    """Return True if any entry of ``reference_words`` occurs in ``text``.

    The comparison is a case-insensitive substring match. Both the text and
    the reference words are lower-cased before comparing, so an entry written
    in upper case (such as "CO2") still matches text that the caller has
    already converted to lower case.

    Args:
        text: The string to search, e.g. a video title or description.

    Returns:
        True if at least one reference word is found, otherwise False.
    """
    haystack = text.lower()
    return any(word.lower() in haystack for word in reference_words)

# Collect the index labels of all rows that mention none of the reference words
rows_to_remove = []
for row_label, video in df.iterrows():
    # Compare in lower case
    searched_texts = (video['title'].lower(), video['description'].lower())

    # A row is kept when the title OR the description contains a reference word.
    # NOTE: for podcast channels this condition was changed to AND.
    if not any(contains_reference(searched_text) for searched_text in searched_texts):
        rows_to_remove.append(row_label)

# Remove the non-matching rows in one step
df.drop(rows_to_remove, inplace=True)

# Renumber the remaining rows, then remove repeated video IDs
df.reset_index(drop=True, inplace=True)
df.drop_duplicates(subset='video_id', inplace=True)

In [235]:
# total number of comments across the remaining videos
comment_counts = df['comment_count']
sum(comment_counts)

1770

In [239]:
# Fetch the video categories available for the US region
categories_response = youtube.videoCategories().list(
    part="snippet",
    regionCode="US"
).execute()

# Map each category ID (a string in the API response) to its category name
category_map = {
    category_item['id']: category_item['snippet']['title']
    for category_item in categories_response.get('items', [])
}

# Replace the category IDs in the DataFrame with the category names.
# The IDs are converted to strings first: the column was read back from a CSV
# file and may hold integers, which would never match the string keys of
# category_map. IDs without a known name become "Unknown".
df['category'] = df['category'].astype(str).map(category_map).fillna("Unknown")

In [240]:
# save the filtered videos as a CSV file
df.to_csv(path_or_buf=directory_path_new_file)

## Youtube API

This section collects the YouTube comments of every video related to the topic of climate change. The `video_id` values from the DataFrame created above are used to request the comments for each video.

In [241]:
# Reload the filtered climate-change videos; the first CSV column holds the index
df = pd.read_csv(filepath_or_buffer=directory_path_new_file, index_col=0)


In [243]:
from googleapiclient.errors import HttpError

# API error reasons that concern a single video only. Such a video is skipped
# so that one failing video does not abort the whole collection.
skippable_reasons = ("commentsDisabled", "videoNotFound")

# Columns of the resulting DataFrame; passing them explicitly keeps the columns
# present even when no comment was collected.
comment_columns = ["video_id", "published_at", "like_count", "text", "author"]

# One dict per collected top-level comment
all_comments_data = []

# Collect the comments of every video in the DataFrame
for video_id in df['video_id']:
    # No page token for the first request of a video
    next_page_token = None

    while True:
        try:
            # Request one page of comment threads for the video
            comments_response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                textFormat="plainText",
                maxResults=100,  # You can only request up to 100 comments per page
                pageToken=next_page_token
            ).execute()
        except HttpError as error:
            raw_details = error.content
            if isinstance(raw_details, bytes):
                error_details = raw_details.decode("utf-8", errors="replace")
            else:
                error_details = str(raw_details)

            if any(reason in error_details for reason in skippable_reasons):
                print(f"Skipping video {video_id}: comments are not available")
                break
            # Any other error (e.g. quota or credentials) is not specific to
            # this video, so it is passed on unchanged.
            raise

        # Store the top-level comment of every thread on this page
        for comment_thread in comments_response.get('items', []):
            comment_snippet = comment_thread['snippet']['topLevelComment']['snippet']
            published_at = comment_snippet['publishedAt']
            like_count = comment_snippet['likeCount']
            comment_text = comment_snippet['textDisplay']
            author = comment_snippet['authorDisplayName']

            comment_data = {
                "video_id": video_id,
                "published_at": published_at,
                "like_count": like_count,
                "text": comment_text,
                "author": author
            }

            all_comments_data.append(comment_data)

        # Continue with the next page, if the response announces one
        next_page_token = comments_response.get('nextPageToken')

        # No further page: this video is done
        if not next_page_token:
            break

# Create a DataFrame from the list of comments
comments_df = pd.DataFrame(all_comments_data, columns=comment_columns)

In [245]:
# save the collected comments as a CSV file
comments_df.to_csv(path_or_buf=comment_file)