# Current Topics and Projects
## 1. Data retrieving (Reichelt, Tagesschau, Jung&Naiv)

In [1]:
'''
This is the first file to run which requests the data, be aware of the limits of the key.

'''

import requests
import json
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
#from youtube_transcript_api import YouTubeTranscriptApi
from datetime import datetime, timezone, timedelta
import spacy
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures


In [54]:
# private api_key
# Peer: 
api_key_peer1 = '' 
api_key_peer2 = '' 
api_key_peer3 = ''

# Raphaela:
api_key = '' 

# Define the base API URL
base_url = "https://www.googleapis.com/youtube/v3/channels"

# Set the API endpoint
search_endpoint = "https://www.googleapis.com/youtube/v3/search"

# Set the API endpoint
video_endpoint = 'https://www.googleapis.com/youtube/v3/videos'

# Set the API endpoint
caption_endpoint = "https://www.googleapis.com/youtube/v3/captions"

# Set the API endpoint
comment_endpoint = "https://www.googleapis.com/youtube/v3/commentThreads"

# Set the API endpoint
replies_endpoint = "https://www.googleapis.com/youtube/v3/comments"

In [55]:
# set date, in order to get not the newest videos without comments
# now = datetime.now(timezone.utc)
# get timestamp of 5 days earlier
# five_days_ago = now - timedelta(days=10)
# change format
# datetime_string = five_days_ago.strftime("%Y-%m-%dT%H:%M:%SZ")
# print(datetime_string)

# take 31. of May as stable limit
datetime_string_end = '2023-01-1T15:07:51Z'
datetime_string_sart = '2022-01-1T15:07:51Z'

In [56]:
# Set the parameters for the API request
def video_request(key, channel_id, endpoint):
    '''
    Function to get information about videos
    '''
    params = {
        "key": key,
        "channelId": channel_id,
        "part": "snippet",
        "order": "date",
        "type": "video",
        "maxResults": 50,
        "publishedBefore": datetime_string_end,
        "publishedAfter" : datetime_string_sart
        
    }

    # Send the API request
    response = requests.get(endpoint, params=params)

    # Parse the response and store video information in a dictionary
    video_dict = {}

    if response.status_code == 200:
        data = response.json()
        videos = data["items"]

        while "nextPageToken" in data:
            next_page_token = data["nextPageToken"]
            params["pageToken"] = next_page_token
            response = requests.get(endpoint, params=params)
            if response.status_code == 200:
                data = response.json()
                videos += data["items"]
                
            else:
                print("Error occurred while fetching comments.")
                print(response.status_code)
                break
                    

        # Process the video information
        for video in videos:
            video_id = video["id"]["videoId"]
            video_title = video["snippet"]["title"]
            video_published_at = video["snippet"]["publishedAt"]
            
            # Store the video information in the dictionary
            video_dict[video_id] = {
                "title": video_title,
                "published_at": video_published_at
            }

    else:
        print("Error occurred while fetching the videos.")
        print(response.status_code)

    video_ids = []

    for video_id, video_info in video_dict.items():
        video_ids.append(video_id)

    return video_dict, video_ids



In [57]:
def video_transscrip(vid): 
    '''
    This function utilizes the YouTubeTranscriptApi as the Youtube Data Api from google doesnt provide transcripts
    '''
    try:
        # Retrieve the transcript
        transcript = YouTubeTranscriptApi.get_transcripts([vid], languages=['de'])

        # Combine transcript text into a single string
        transcript_text = ' '.join(entry['text'] for entry in transcript[0][vid])

    except: 
        print(f'Video {vid} has no transcript')
        return 0 
      
            
    return transcript_text
    

In [58]:
def video_stats(api_key, video_ids, endpoint):
    '''
     Function to retrieve the video statistics for certain video_ids via the API
    '''
    # Initialize an empty dictionary to store the results
    vid_stats = {}

    params = {
        'part': 'snippet,statistics,contentDetails',
        'key': api_key
    }

    # Process each batch of video IDs
    for i in video_ids:
        params['id'] = i
        response = requests.get(endpoint, params=params)
        if response.status_code == 200:
                data = response.json()
                vid_stats[i] = data
                
        else:
            print("Error occurred while fetching videos.")
            print(response.status_code)
            
    return vid_stats



In [59]:
def comment_request(api_key, video_ids, endpoint, replies_endpoint):
    '''
    Function to retrieve the comments for certain video_ids via the API
    '''

    # Dictionary to store comment data
    comments_dict = {}

    # Iterate over the video IDs
    for video_id in video_ids:
        # Set the parameters for the API request
        params = {
            "key": api_key,
            "videoId": video_id,
            "part": "id,snippet,replies",
            "maxResults": 100  # Set the desired number of comments per video
        }

        # Send the API request
        response = requests.get(endpoint, params=params)
        
        # Parse the response and retrieve comment information
        if response.status_code == 200:
            data = response.json()
            comment_thread = data["items"]
            while "nextPageToken" in data:
                next_page_token = data["nextPageToken"]
                params["pageToken"] = next_page_token
                response = requests.get(endpoint, params=params)
                if response.status_code == 200:
                    data = response.json()
                    comment_thread += data["items"]
                    
                else:
                    print("Error occurred while fetching comments.")
                    print(response.status_code)
                    break
        else:
            print("Error occurred while fetching comments for video ID.")
            print(response.status_code)
        
        for i in comment_thread:
            if i['snippet']['totalReplyCount'] > 0:
                if i['snippet']['totalReplyCount'] > len(i['replies']["comments"]):           
                    replies_params = {
                        "key": api_key,
                        "part": "snippet",
                        "parentId": i["id"]}
                    replies_response = requests.get(replies_endpoint, params=replies_params)
                    data = replies_response.json()
                    i["replies"]["comments"] = data
            
        comments_dict[video_id] =  comment_thread

    return comments_dict


In [10]:
'''
The following code requests all the videos and transcripts of videos in 2022. Additionaly we calculate the cosine simularity 
of each combinations and pick the 50 best unique combinations 
'''

channel_ids = ["UCcoQ3WG2J_Xjwwyt-sJqh-w", 'UCMIgOXM2JEQ2Pv2d0_PVfcg', 'UCv1WDP5EiipMQ__C4Cg6aow']

merged_trans = []
merged_id = []
merged_channel = []

#get all videos 2022
for i in channel_ids:
    video_dict, video_ids = video_request(api_key_peer3, i, search_endpoint)
    print(len(video_ids))
    progress_bar = tqdm(video_ids, total=len(video_ids))
    for x in progress_bar:
        trans = video_transscrip(x)
        merged_trans.append(trans)
        merged_id.append(x)
        merged_channel.append(i)
        
# Create a DataFrame from the lists
data = {'video_id': merged_id, 'channel_id': merged_channel, 'transcript': merged_trans}

#drop na and cache data
df = pd.DataFrame(data)
df = df.dropna(how='all')
df = df[df['transcript'] != '0']

df.to_csv("data/raw/vids.csv", index=False)


228


  8%|▊         | 19/228 [00:09<01:45,  1.98it/s]

Video YOVcvB0Qyk4 has no transcript


 82%|████████▏ | 187/228 [01:40<00:20,  2.03it/s]

Video P3TNooHlCF4 has no transcript


 96%|█████████▌| 219/228 [01:57<00:04,  2.08it/s]

Video 5mLwMCrjZR8 has no transcript


100%|██████████| 228/228 [02:02<00:00,  1.86it/s]


495


 12%|█▏        | 58/495 [00:34<04:04,  1.79it/s]

Video -RjfBT2tEIk has no transcript


 26%|██▋       | 130/495 [01:20<03:32,  1.72it/s]

Video 2N5bNh-uKHs has no transcript


 29%|██▊       | 142/495 [01:27<03:12,  1.84it/s]

Video _2yMkTe5ypM has no transcript


 36%|███▌      | 177/495 [01:48<03:06,  1.71it/s]

Video GLUdfBl0oGo has no transcript


 53%|█████▎    | 264/495 [02:42<02:11,  1.76it/s]

Video p09tj1QqCFs has no transcript


 64%|██████▎   | 315/495 [03:13<01:47,  1.68it/s]

Video wN_XFvqu8uI has no transcript


 77%|███████▋  | 383/495 [03:58<01:02,  1.79it/s]

Video 68bQqiJte5U has no transcript


 91%|█████████ | 450/495 [04:42<00:28,  1.58it/s]

Video PqGyFtwNXt0 has no transcript


 98%|█████████▊| 483/495 [05:02<00:06,  1.81it/s]

Video HB0UWu5isng has no transcript


 98%|█████████▊| 485/495 [05:03<00:05,  1.74it/s]

Video PrIkPu3wAk8 has no transcript


100%|██████████| 495/495 [05:10<00:00,  1.60it/s]


278


  0%|          | 1/278 [00:00<02:06,  2.19it/s]

Video hNp_XJUZEfU has no transcript


 38%|███▊      | 106/278 [01:11<01:42,  1.69it/s]

Video mL2TWzrvlYE has no transcript


100%|██████████| 278/278 [03:02<00:00,  1.52it/s]


In [12]:
'''
This piece of code calculates the cosine simuarity odf all videos from channel A to B and from channel A to C 
and then picks the closes combination of ABC videos by merging via A. This is computational expensiv but still better then check for every
ABC combination
'''
# Load the SpaCy model (make sure to download and install the model beforehand)
nlp = spacy.load("de_core_news_md")

# Get unique channelIds
unique_channelIds = df["channel_id"].unique()

# Dictionary to store the best matches for each channel
matches1 = {}
matches2 = {}

# Get video IDs and transcripts for each channelId in the combination
video_data = [df[df['channel_id'] == channelId]['video_id'].tolist() for channelId in unique_channelIds]

# Generate combinations of video data within each channelId
video_combinationsAB = list(product(video_data[0], video_data[1]))
video_combinationsAC = list(product(video_data[0], video_data[2]))

# Define a function to calculate similarity
def calculate_similarity(transcripts):
    return nlp(transcripts[0]).similarity(nlp(transcripts[1]))

with ThreadPoolExecutor() as executor:
    futures1 = {combination: executor.submit(calculate_similarity, [df[df['video_id'] == video]["transcript"].values[0] for video in combination]) for combination in video_combinationsAB}
    futures2 = {combination: executor.submit(calculate_similarity, [df[df['video_id'] == video]["transcript"].values[0] for video in combination]) for combination in video_combinationsAC}
    
    for combination, future in tqdm(futures1.items()):
        matches1[combination] = future.result()

    for combination, future in tqdm(futures2.items()):
        matches2[combination] = future.result()

100%|██████████| 112860/112860 [4:47:06<00:00,  6.55it/s]  
100%|██████████| 63384/63384 [12:05:38<00:00,  1.46it/s]  


In [28]:
'''
This piece of code sorts the matches and picks the best and unique video ids
'''

# Step 1: Sort dictionaries based on similarity
sorted_matches1 = {k: v for k, v in sorted(matches1.items(), key=lambda item: item[1], reverse=True)}
sorted_matches2 = {k: v for k, v in sorted(matches2.items(), key=lambda item: item[1], reverse=True)}
sorted_matches2 = {key: value for key, value in sorted_matches2.items() if value != 1.0}
sorted_matches1 = {key: value for key, value in sorted_matches1.items() if value != 1.0}

# Convert tuples to strings for keys
sort1_str = {str(key): value for key, value in sorted_matches1.items()}
sort2_str = {str(key): value for key, value in sorted_matches2.items()}

# Save the filtered dictionary to a JSON file
with open("data/raw/sort2", 'w') as outfile:
    json.dump(sort2_str, outfile, indent=4)
    
# Save the filtered dictionary to a JSON file
with open("data/raw/sort1", 'w') as outfile:
    json.dump(sort1_str, outfile, indent=4)

def filter_best_matches(matches):
    best_matches = {}  # Initialize an empty dictionary to store the best matches
    
    for (a, b), value in matches.items():
        # Check if both a and b are completely new
        if a not in best_matches.keys() and b not in [v[0] for v in best_matches.values()]:
            # Add the new entry to best_matches
            best_matches[a] = (b, value)
    
    return best_matches


filtered_matches1 = filter_best_matches(sorted_matches1)
filtered_matches2 = filter_best_matches(sorted_matches2)


vid_id_jung = []
vid_id_dw = []
vid_id_reichelt = []

for key1, (b1, value1) in filtered_matches1.items():
    for key2, (b2, value2) in filtered_matches2.items():
        if (
            key1 == key2
            and len(vid_id_reichelt) <= 50
            and key1 not in vid_id_reichelt
            and b1 not in vid_id_dw
            and b2 not in vid_id_jung
        ):
            vid_id_reichelt.append(key1)
            vid_id_dw.append(b1)
            vid_id_jung.append(b2)
            
# Specify the file paths where you want to save the lists
output_file_jung = 'data/ids/vid_id_jung.txt'
output_file_dw = 'data/ids/vid_id_dw.txt'
output_file_reichelt = 'data/ids/vid_id_reichelt.txt'

# Write the contents of the lists to the respective files
with open(output_file_jung, 'w') as f:
    for item in vid_id_jung:
        f.write("%s\n" % item)

with open(output_file_dw, 'w') as f:
    for item in vid_id_dw:
        f.write("%s\n" % item)

with open(output_file_reichelt, 'w') as f:
    for item in vid_id_reichelt:
        f.write("%s\n" % item)

print("Lists saved to files:", output_file_jung, output_file_dw, output_file_reichelt)

Lists saved to files: Data/ids/vid_id_jung.txt Data/ids/vid_id_dw.txt Data/ids/vid_id_reichelt.txt


In [29]:
'''
1. Julian Reichelt
'''

# Set the channel ID of AchtungReichelt
channel_name = "Achtung, Reichelt!"

channel_id = "UCcoQ3WG2J_Xjwwyt-sJqh-w"

# Prepare the parameters for the API request
params = {
    "part": "statistics",
    "id": channel_id,
    "part": "snippet",
    "key": api_key
}

response = requests.get(base_url, params=params)
# Check if the request was successful
if response.status_code == 200:
    # Extract the statistics for each channel
    data = response.json()
    print("Step1 done")
    


# Initialize a list to store the lines from the text file
vid_id_reichelt = []

# Open the text file and read its contents
with open("data/ids/vid_id_reichelt.txt", 'r', encoding='utf-8') as txtfile:
    vid_id_reichelt  = [line.strip() for line in txtfile.readlines()]    

print("Step2 done")
comments_dict_reichelt = comment_request(api_key_peer1, vid_id_reichelt, comment_endpoint, replies_endpoint)
print("Step3 done")
video_statistics = video_stats(api_key_peer1, vid_id_reichelt, video_endpoint)
print("Step4 done")

Step1 done
Step2 done
Step3 done
Step4 done


In [30]:
# Save the comments_dict to a file
filename = "data/raw/comments_dict_reichelt.json"

with open(filename, "w") as file:
    json.dump(comments_dict_reichelt, file)


# Save vid_stats as JSON file
with open('data/raw/video_stat_reichelt.json', 'w') as json_file:
    json.dump(video_statistics, json_file)
            

In [60]:
'''
2. dw
'''
channel_id = 'UCMIgOXM2JEQ2Pv2d0_PVfcg'

# Prepare the parameters for the API request
params = {
    "part": "statistics",
    "id": channel_id,
    "part": "snippet",
    "key": api_key
}

response = requests.get(base_url, params=params)
# Check if the request was successful
if response.status_code == 200:
    # Extract the statistics for each channel
    data = response.json()
    print("Step1 done")
    
    
# Initialize a list to store the lines from the text file
vid_id_dw = []

# Open the text file and read its contents
with open("data/ids/vid_id_dw.txt", 'r', encoding='utf-8') as txtfile:
    vid_id_dw  = [line.strip() for line in txtfile.readlines()]

    
print("Step2 done")
comments_dict_dw = comment_request(api_key, vid_id_dw, comment_endpoint,replies_endpoint)
print("Step3 done")
video_statistics_dw = video_stats(api_key, vid_id_dw, video_endpoint)
print("Step4 done")

Step1 done
Step2 done
Step3 done
Step4 done


In [61]:
# Save the comments_dict to a file
filename = "data/raw/comments_dict_dw.json"

with open(filename, "w") as file:
    json.dump(comments_dict_dw, file)


# Save vid_stats as JSON file
with open('data/raw/video_stat_dw.json', 'w') as json_file:
    json.dump(video_statistics_dw, json_file)
    
print("saving done")

saving done


In [33]:
'''
3. Jung&Naiv
'''
channel_id = 'UCv1WDP5EiipMQ__C4Cg6aow'

# Prepare the parameters for the API request
params = {
    "part": "statistics",
    "id": channel_id,
    "part": "snippet",
    "key": api_key
}

response = requests.get(base_url, params=params)
# Check if the request was successful
if response.status_code == 200:
    # Extract the statistics for each channel
    data = response.json()
    print("Step1 done")

    
# Initialize a list to store the lines from the text file
vid_id_jung = []

# Open the text file and read its contents
with open("data/ids/vid_id_jung.txt", 'r', encoding='utf-8') as txtfile:
    vid_id_jung  = [line.strip() for line in txtfile.readlines()]
    
print("Step2 done")
comments_dict_jung = comment_request(api_key_peer3,vid_id_jung, comment_endpoint,replies_endpoint)
print("Step3 done")
video_statistics_jung = video_stats(api_key_peer3, vid_id_jung, video_endpoint)
print("Step4 done")


Step1 done
Step2 done
Step3 done
Step4 done


In [34]:
# Save the comments_dict to a file
filename = "data/raw/comments_dict_jung.json"

with open(filename, "w") as file:
    json.dump(comments_dict_jung, file)



# Save vid_stats as JSON file
with open('Data/raw/video_stat_jung.json', 'w') as json_file:
    json.dump(video_statistics_jung, json_file)
    
print("data saved")

data saved
