# YouTube Sentiment Analysis

In the following sentiment analysis, we look at a YouTube data set with videos that were trending in the period from 13.09.2017 - 22.10.2017. We compare the trends from the UK and US and analyze the behavior of viewers from those regions.

The data set can be accessed via the following link:
https://www.kaggle.com/datasets/datasnaek/youtube

For each region, our data set consists of two CSV files and one JSON file. One of the two CSV files contains all comments for each video that was trending, while the other CSV file lists all the videos.

The json file contains information on all possible categories to which a video could belong. 

### Let's start by importing all the necessary libraries

In [None]:
# these are the necessary imports
# !pip install openpyxl 
import numpy as np
import pandas as pd
import os
import json
from textblob import TextBlob
import re
import matplotlib.pyplot as plt
import seaborn as sns




# UK

### Loading the json File



In [None]:
def json_to_dataframe(file):
    """Load a category JSON file into a two-column Pandas DataFrame.

    Every entry of the top-level ``items`` list contributes one row: its
    ``id`` goes into the ``category_id`` column and its ``snippet.title``
    into the ``title`` column. The file is read as UTF-8.
    """
    with open(file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # One record per category entry found in the JSON payload
    rows = [
        {'category_id': entry['id'], 'title': entry['snippet']['title']}
        for entry in payload['items']
    ]
    return pd.DataFrame(rows)

# Both category files are looked up in the current working directory
cwd = os.getcwd()
filename_gb, filename_us = "GB_category_id.json", "US_category_id.json"

# Convert each region's JSON file into a DataFrame
gb_categories, us_categories = (
    json_to_dataframe(os.path.join(cwd, name))
    for name in (filename_gb, filename_us)
)


In [None]:
# Display the category table parsed from the UK JSON file
gb_categories

The JSON category file contains all possible categories and their corresponding IDs.

This DataFrame becomes relevant in the course of the sentiment analysis, as the category of the video is stored as an ID in the csv file.

For illustrative purposes, we want to replace the ID with the corresponding category name.

### Loading both csv Files

In [None]:
# Specify the paths to the CSV files (looked up in the current working directory)
csv_file_videos_gb = os.path.join(cwd, "GBvideos.csv")
csv_file_comments_gb = os.path.join(cwd, "GBcomments.csv")

csv_file_videos_us = os.path.join(cwd, "USvideos.csv")
csv_file_comments_us = os.path.join(cwd, "UScomments.csv")

# Load the files. Malformed rows are skipped with a warning instead of
# aborting the load. `on_bad_lines="warn"` replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
gb_videos = pd.read_csv(csv_file_videos_gb, delimiter=",", on_bad_lines="warn")
gb_comments = pd.read_csv(csv_file_comments_gb, delimiter=",", on_bad_lines="warn")

us_videos = pd.read_csv(csv_file_videos_us, delimiter=",", on_bad_lines="warn")
us_comments = pd.read_csv(csv_file_comments_us, delimiter=",", on_bad_lines="warn")

# Keep only the first 11 columns of the UK videos
# (presumably this drops stray trailing columns - TODO confirm against the raw file)
gb_videos = gb_videos.iloc[:, :11]
print(gb_videos.columns)

# Display the first 5 rows of the DataFrame
gb_videos.head()

## Delete emojis from the comments

### To perform the sentiment analysis properly, we need to remove special characters such as emojis from the text


In [None]:
# Compiled once, because remove_emojis is applied to every single comment.
# NOTE: the span U+24C2-U+1F251 together with U+10000-U+10FFFF is very broad.
# It removes every code point from U+24C2 upwards, which includes CJK, Hangul
# and other non-Latin scripts, not only emojis.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing, geometric shapes, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters and everything in between
    "\U0001f926-\U0001f937"  # gesture emojis
    "\U00010000-\U0010ffff"  # all supplementary planes
    "\u2640-\u2642"          # gender symbols
    "\u2600-\u2B55"          # miscellaneous symbols
    "\u200d"                 # zero width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with all emoji-like characters removed.

    Non-string input (for example NaN in a DataFrame column) is returned
    unchanged, so the function can be applied to a whole column directly.
    """
    if not isinstance(text, str):
        # Nothing to clean: hand the value back untouched
        return text
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# Strip emojis from the "comment_text" column of both regional comment tables
for comments_df in (gb_comments, us_comments):
    comments_df["comment_text"] = comments_df["comment_text"].apply(remove_emojis)


## Mapping
### Mapping category_id to category_name so that we can adjust our data set as previously described

In [None]:
# Lookup tables category_id -> category title, one per region
category_mapping_gb = gb_categories.set_index('category_id')['title'].to_dict()
category_mapping_us = us_categories.set_index('category_id')['title'].to_dict()



In [None]:
# Add a readable category_name column to both video tables.
# The ids are cast to str first so they can be looked up in the mapping dictionaries.
for videos_df, id_to_name in ((gb_videos, category_mapping_gb), (us_videos, category_mapping_us)):
    videos_df['category_id'] = videos_df['category_id'].astype(str)
    videos_df['category_name'] = videos_df['category_id'].map(id_to_name)


# Analyzing Data

Now that we have completely loaded and processed all the data, it is time to analyze it. To do this, we start with a few plots that give us an insight into the performance of the individual channels and videos.

We then look at the use of tags and their frequency within our data set. Finally, we perform the sentiment analysis.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from wordcloud import WordCloud, STOPWORDS


In [None]:
# Horizontal bar chart of the channels that appear most often
def plot_top_channels(videos_df, country, color_palette, channel_color_dict):
    """Plot the ten channels with the most rows in ``videos_df``.

    ``country`` only appears in the chart title. Each bar is coloured through
    ``channel_color_dict`` (channel title -> colour). ``color_palette`` is
    accepted for call compatibility but is not read inside the function.
    """
    counts = videos_df["channel_title"].value_counts().head(10)

    # Table that seaborn plots from
    top_channels_df = pd.DataFrame({'channel': counts.index, 'appearances': counts.values})
    bar_colors = [channel_color_dict[name] for name in top_channels_df['channel']]

    plt.figure(figsize=(16, 9))
    sns.barplot(data=top_channels_df, x='appearances', y='channel', palette=bar_colors)

    plt.title(f"Top 10 Channels in YouTube Trends of {country}", fontsize=20)
    plt.xlabel("Appearances", fontsize=16)
    plt.ylabel("", fontsize=16)  # no y-axis label
    plt.gca().invert_yaxis()

    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

    # Write the count next to each bar and the channel name inside it
    labelled_rows = zip(top_channels_df['channel'], top_channels_df['appearances'])
    for position, (name, count) in enumerate(labelled_rows):
        plt.text(count + 0.2, position, str(int(count)), fontsize=16, fontweight="bold", color="grey")
        plt.text(count / 2, position, name, fontsize=16, fontweight="bold", color="white", ha='center')

    plt.show()

# The ten most frequent channels of each region
top_uk_channels = gb_videos["channel_title"].value_counts().head(10).index
top_us_channels = us_videos["channel_title"].value_counts().head(10).index

# Channels occurring in either top list, each exactly once
unique_channels = pd.Series([*top_uk_channels, *top_us_channels]).unique()

# One fixed colour per channel, so a channel looks the same in both charts
color_palette = sns.color_palette("husl", len(unique_channels))
channel_color_dict = dict(zip(unique_channels, color_palette))

# Draw the UK chart first, then the US chart
for region_videos, region_label in ((gb_videos, "UK"), (us_videos, "US")):
    plot_top_channels(region_videos, region_label, color_palette, channel_color_dict)


In [None]:
# Define a function to plot the top videos
def plot_top_videos(videos_df, country, color_palette, video_color_dict):
    """Plot the ten videos with the most rows in ``videos_df``.

    Parameters:
        videos_df: DataFrame with at least the columns ``video_id`` and ``title``.
        country: Region label, only used in the chart title.
        color_palette: Accepted for call compatibility; not read by this function.
        video_color_dict: Mapping video_id -> bar colour. It must contain every
            video that ends up in the top ten (KeyError otherwise).
    """
    top_videos = videos_df["video_id"].value_counts().head(10)

    # Create a DataFrame for plotting
    top_videos_df = pd.DataFrame({'video_id': top_videos.index, 'appearances': top_videos.values})

    # Exactly one title per video_id (the first one seen). De-duplicating on
    # the (video_id, title) pair would keep several rows for a video whose
    # title changed; the merge would then produce extra rows and the
    # annotations below would no longer line up with the bars.
    titles = videos_df[['video_id', 'title']].drop_duplicates(subset='video_id')
    top_videos_df = top_videos_df.merge(titles, on='video_id', how='left')

    # Plot using seaborn
    plt.figure(figsize=(16, 9))
    sns.barplot(x='appearances', y='video_id', data=top_videos_df, palette=[video_color_dict[video] for video in top_videos_df['video_id']])

    plt.xlabel("Appearances", fontsize=16)
    plt.ylabel("", fontsize=16)  # Remove the y-axis label
    plt.title(f"Top 10 Videos in YouTube Trends of {country}", fontsize=20)
    plt.gca().invert_yaxis()

    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

    # Add annotation to bars: the count next to the bar, the title inside it
    annotated_rows = zip(top_videos_df['appearances'], top_videos_df['title'])
    for index, (value, title) in enumerate(annotated_rows):
        plt.text(value + 0.2, index, str(int(value)), fontsize=16, fontweight="bold", color="grey")
        plt.text(value / 2, index, title, fontsize=16, fontweight="bold", color="white", ha='center', wrap=True)

    plt.show()

# The ten most frequent videos of each region
top_uk_videos = gb_videos["video_id"].value_counts().head(10).index
top_us_videos = us_videos["video_id"].value_counts().head(10).index

# Videos occurring in either top list, each exactly once
unique_videos = pd.Series([*top_uk_videos, *top_us_videos]).unique()

# One fixed colour per video, so a video looks the same in both charts
color_palette = sns.color_palette("husl", len(unique_videos))
video_color_dict = dict(zip(unique_videos, color_palette))

# Draw the UK chart first, then the US chart
for region_videos, region_label in ((gb_videos, "UK"), (us_videos, "US")):
    plot_top_videos(region_videos, region_label, color_palette, video_color_dict)

In [None]:
# Word cloud of the tags used most often by UK trending videos
tags = gb_videos['tags'].map(lambda tag_string: tag_string.lower().split('|')).values
k = ' '.join(gb_videos['tags'])
# Tags are separated by "|" in the data; the word cloud expects blank-separated words
wordcloud = WordCloud(width=1000, height=500).generate(k.lower().replace('|', ' '))

plt.figure(figsize=(15, 5))
plt.imshow(wordcloud)
plt.axis('off')

In [None]:
# Looking which Tags are represented the most in US
tags = us_videos['tags'].map(lambda k: k.lower().split('|')).values
# The text must come from the US videos; joining gb_videos['tags'] here
# would simply draw the UK word cloud a second time.
k = ' '.join(us_videos['tags'])
# Tags are separated by "|"; turn them into blank-separated words for the word cloud
wordcloud = WordCloud(width=1000, height=500).generate(' '.join(k.lower().split('|')))

plt.figure(figsize=(15, 5))
plt.imshow(wordcloud)
plt.axis('off')

### Based on the plot, the three most frequently appearing categories in YouTube trends are Entertainment, Music, and People & Blogs.

If we compare both the performing channels and the videos from the USA with those from the UK in relation to the appearances in the trends, it is noticeable that categories such as entertainment and comedy are trending well. 

This also makes sense, as YouTube is a platform for entertainment and is therefore also used for this purpose. What is interesting, however, is that we can see exact overlaps in some places. For example, the video "Eminem Rips Donald Trump in BET Hip Hop Awards Freestyle Cypher" is trending multiple times in both regions. There are also similarities in the channels. "The Late Late Show with James Corden" and "jacksfilms" each have videos in the trends on around 38 days on average, in both the USA and the UK.

This suggests that viewers from the USA and the UK have the same interests when consuming YouTube videos. The USA could even be seen as a trendsetter that other countries follow.

## Preparing for Sentiment Analysis

In [None]:
# Distinct category names that occur in each region
categories_gb = gb_videos["category_name"].unique()
categories_us = us_videos["category_name"].unique()

# One separate, copied sub-DataFrame per category, keyed by the category name
category_dataframes_gb = {
    name: gb_videos[gb_videos["category_name"] == name].copy()
    for name in categories_gb
}

category_dataframes_us = {
    name: us_videos[us_videos["category_name"] == name].copy()
    for name in categories_us
}

## Creating a DataFrame for each Category



In [None]:
# Bind one named DataFrame per UK category.
# Each lookup raises KeyError if that category is absent from category_dataframes_gb.
science_and_technology_gb = pd.DataFrame(category_dataframes_gb["Science & Technology"])
entertainment_gb = pd.DataFrame(category_dataframes_gb["Entertainment"])
film_and_animation_gb = pd.DataFrame(category_dataframes_gb["Film & Animation"])
howto_and_style_gb = pd.DataFrame(category_dataframes_gb["Howto & Style"])
sports_gb = pd.DataFrame(category_dataframes_gb["Sports"])
people_and_blogs_gb = pd.DataFrame(category_dataframes_gb["People & Blogs"])
music_gb = pd.DataFrame(category_dataframes_gb["Music"])
comedy_gb = pd.DataFrame(category_dataframes_gb["Comedy"])
education_gb = pd.DataFrame(category_dataframes_gb["Education"])
news_and_politics_gb = pd.DataFrame(category_dataframes_gb["News & Politics"])
gaming_gb = pd.DataFrame(category_dataframes_gb["Gaming"])
autos_and_vehicles_gb = pd.DataFrame(category_dataframes_gb["Autos & Vehicles"])
pets_and_animals_gb = pd.DataFrame(category_dataframes_gb["Pets & Animals"])
travel_and_events_gb = pd.DataFrame(category_dataframes_gb["Travel & Events"])

In [None]:
# Bind one named DataFrame per US category.
# Each lookup raises KeyError if that category is absent from category_dataframes_us.
science_and_technology_us = pd.DataFrame(category_dataframes_us["Science & Technology"])
entertainment_us = pd.DataFrame(category_dataframes_us["Entertainment"])
film_and_animation_us = pd.DataFrame(category_dataframes_us["Film & Animation"])
howto_and_style_us = pd.DataFrame(category_dataframes_us["Howto & Style"])
sports_us = pd.DataFrame(category_dataframes_us["Sports"])
people_and_blogs_us = pd.DataFrame(category_dataframes_us["People & Blogs"])
music_us = pd.DataFrame(category_dataframes_us["Music"])
comedy_us = pd.DataFrame(category_dataframes_us["Comedy"])
education_us = pd.DataFrame(category_dataframes_us["Education"])
news_and_politics_us = pd.DataFrame(category_dataframes_us["News & Politics"])
gaming_us = pd.DataFrame(category_dataframes_us["Gaming"])
autos_and_vehicles_us = pd.DataFrame(category_dataframes_us["Autos & Vehicles"])
pets_and_animals_us = pd.DataFrame(category_dataframes_us["Pets & Animals"])
travel_and_events_us = pd.DataFrame(category_dataframes_us["Travel & Events"])

In [None]:
# UK frames and their display names, in matching order
category_dataframes = [entertainment_gb, music_gb, people_and_blogs_gb, howto_and_style_gb]
category_names = ['Entertainment', 'Music', 'People & Blogs', 'Howto & Style']

# One bar per category showing the summed "views" column
plt.figure(figsize=(10, 6))
for label, frame in zip(category_names, category_dataframes):
    plt.bar(label, frame['views'].sum(), color='skyblue')

# Title and axis labels
plt.title('Total Views by Category')
plt.xlabel('Category')
plt.ylabel('Total Views')

# Tidy the layout and render the figure
plt.tight_layout()
plt.show()

In [None]:
# US frames and their display names, in matching order
category_dataframes = [entertainment_us, music_us, people_and_blogs_us, howto_and_style_us]
category_names = ['Entertainment', 'Music', 'People & Blogs', 'Howto & Style']

# One bar per category showing the summed "views" column
plt.figure(figsize=(10, 6))
for label, frame in zip(category_names, category_dataframes):
    plt.bar(label, frame['views'].sum(), color='skyblue')

# Title and axis labels
plt.title('Total Views by Category')
plt.xlabel('Category')
plt.ylabel('Total Views')

# Tidy the layout and render the figure
plt.tight_layout()
plt.show()

## SentimentIntensityAnalyzer
In our sentiment analysis, we want to analyze the mood of the comments in the various categories. To do this, we create a separate dataframe for each category. We then look at the "comment_text" column to carry out the sentiment analysis accordingly and plot the results.

The SentimentIntensityAnalyzer is a component of the VADER (Valence Aware Dictionary and sEntiment Reasoner) sentiment analysis tool. VADER is specifically attuned to sentiments expressed in social media. It works well on text from various domains, including social media posts, news articles, and even longer narrative texts.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

In [None]:
def add_compound(category, comments, csv_name):
    """Attach VADER sentiment scores to every comment of the given videos.

    Steps:
        1. Left-join ``category`` with the ``video_id`` / ``comment_text``
           columns of ``comments`` on ``video_id``.
        2. Score each comment text with ``SentimentIntensityAnalyzer``.
        3. Combine the scores with the joined rows through an ``ID`` key.
        4. Write the result to ``csv_name`` and return it.

    Parameters:
        category: DataFrame of videos; must contain a ``video_id`` column.
        comments: DataFrame with the columns ``video_id`` and ``comment_text``.
        csv_name: Path of the CSV file the result is written to.

    Returns:
        DataFrame whose first columns are ``ID``, ``neg``, ``neu``, ``pos``
        and ``compound``, followed by the columns of the joined table.

    Note:
        Because of the left join, videos without any comment keep one row with
        a missing ``comment_text``. That value is scored via ``str()``, i.e.
        as the literal text "nan".
    """
    sia = SentimentIntensityAnalyzer()

    category_comments = pd.merge(category, comments[["video_id", "comment_text"]], on="video_id", how="left")
    # Expose the row position as an explicit "ID" key column
    category_comments = category_comments.reset_index().rename(columns={'index': 'ID'})

    # Score the comment column in a single pass (no per-row iterrows overhead).
    # str() keeps non-string cells scoreable.
    scores = [
        sia.polarity_scores(str(text))
        for text in tqdm(category_comments["comment_text"], total=len(category_comments))
    ]
    # Naming the columns explicitly keeps them present even when there are no rows
    vaders = pd.DataFrame(scores, columns=["neg", "neu", "pos", "compound"])
    vaders.insert(0, "ID", category_comments["ID"].values)

    # Join on the explicit key instead of relying on whichever columns happen to match
    category_comments = vaders.merge(category_comments, on="ID", how="left")
    # Print the columns to check that the "compound" column exists
    print("Columns after merging:", category_comments.columns)
    category_comments.to_csv(csv_name, index=False)

    return category_comments

In [None]:
# Sentiment scores for the comments of every UK video
gb_videos_comments = add_compound(
    category=gb_videos,
    comments=gb_comments,
    csv_name="gb_videos_comments.csv",
)

In [None]:
# Getting the Compound Values for every video from US
# `us_videos_commments` (three m's) is a misspelling; it is kept as an alias of
# the correctly spelled name so that existing references keep working.
us_videos_comments = us_videos_commments = add_compound(us_videos, us_comments, "us_video_comments.csv")

In [None]:
# Score the comments of each UK category separately.
# Every call also writes the result to the CSV file named in its last argument.
science_and_technology_comments_gb = add_compound(science_and_technology_gb, gb_comments,"science_and_technology_comments_gb.csv" )
entertainment_comments_gb = add_compound(entertainment_gb, gb_comments,"entertainment_comments_gb.csv")
film_and_animation_comments_gb = add_compound(film_and_animation_gb, gb_comments, "film_and_animation_comments_gb.csv")
howto_and_style_comments_gb = add_compound(howto_and_style_gb, gb_comments, "howto_and_style_comments_gb.csv")
sports_comments_gb = add_compound(sports_gb, gb_comments, "sports_comments_gb.csv")
people_and_blogs_comments_gb = add_compound(people_and_blogs_gb, gb_comments, "people_and_blogs_comments_gb.csv")
music_comments_gb = add_compound(music_gb, gb_comments, "music_comments_gb.csv")
comedy_comments_gb = add_compound(comedy_gb, gb_comments, "comedy_comments_gb.csv")
education_comments_gb = add_compound(education_gb, gb_comments, "education_comments_gb.csv")
news_and_politics_comments_gb = add_compound(news_and_politics_gb, gb_comments, "news_and_politics_comments_gb.csv")
gaming_comments_gb = add_compound(gaming_gb, gb_comments, "gaming_comments_gb.csv")
autos_and_vehicles_comments_gb = add_compound(autos_and_vehicles_gb, gb_comments, "autos_and_vehicles_comments_gb.csv")
pets_and_animals_comments_gb = add_compound(pets_and_animals_gb, gb_comments, "pets_and_animals_comments_gb.csv")
travel_and_events_comments_gb = add_compound(travel_and_events_gb, gb_comments, "travel_and_events_comments_gb.csv") 

# Takes approx. 11 minutes (three relaxed rounds of Clash Royale)

In [None]:
# Score the comments of each US category separately.
# Every call also writes the result to the CSV file named in its last argument.
science_and_technology_comments_us = add_compound(science_and_technology_us, us_comments,"science_and_technology_comments_us.csv" )
entertainment_comments_us = add_compound(entertainment_us, us_comments,"entertainment_comments_us.csv")
film_and_animation_comments_us = add_compound(film_and_animation_us, us_comments, "film_and_animation_comments_us.csv")
howto_and_style_comments_us = add_compound(howto_and_style_us, us_comments, "howto_and_style_comments_us.csv")
sports_comments_us = add_compound(sports_us, us_comments, "sports_comments_us.csv")
people_and_blogs_comments_us = add_compound(people_and_blogs_us, us_comments, "people_and_blogs_comments_us.csv")
music_comments_us = add_compound(music_us, us_comments, "music_comments_us.csv")
comedy_comments_us = add_compound(comedy_us, us_comments, "comedy_comments_us.csv")
education_comments_us = add_compound(education_us, us_comments, "education_comments_us.csv")
news_and_politics_comments_us = add_compound(news_and_politics_us, us_comments, "news_and_politics_comments_us.csv")
gaming_comments_us = add_compound(gaming_us, us_comments, "gaming_comments_us.csv")
autos_and_vehicles_comments_us = add_compound(autos_and_vehicles_us, us_comments, "autos_and_vehicles_comments_us.csv")
pets_and_animals_comments_us = add_compound(pets_and_animals_us, us_comments, "pets_and_animals_comments_us.csv")
travel_and_events_comments_us = add_compound(travel_and_events_us, us_comments, "travel_and_events_comments_us.csv") 

# Takes approx. 11 minutes (three relaxed rounds of Clash Royale)

In [None]:
# Score column that is compared across categories
metric = "compound"

# Plot title -> Series of UK scores, in the order the histograms are drawn
dict_of_compounds_gb = {
    label: scored_comments[metric]
    for label, scored_comments in (
        ("Entertainment", entertainment_comments_gb),
        ("Science and Technology", science_and_technology_comments_gb),
        ("Film and Animation", film_and_animation_comments_gb),
        ("Howto and Style", howto_and_style_comments_gb),
        ("Sports", sports_comments_gb),
        ("People and Blogs", people_and_blogs_comments_gb),
        ("Music", music_comments_gb),
        ("Comedy", comedy_comments_gb),
        ("Education", education_comments_gb),
        ("News and Politics", news_and_politics_comments_gb),
        ("Gaming", gaming_comments_gb),
        ("Autos and vehicles", autos_and_vehicles_comments_gb),
        ("Pets and Animals", pets_and_animals_comments_gb),
        ("Travel and Events", travel_and_events_comments_gb),
    )
}
len(dict_of_compounds_gb)

In [None]:
# Score column that is compared across categories
metric = "compound"

# Plot title -> Series of US scores, in the order the histograms are drawn
dict_of_compounds_us = {
    label: scored_comments[metric]
    for label, scored_comments in (
        ("Entertainment", entertainment_comments_us),
        ("Science and Technology", science_and_technology_comments_us),
        ("Film and Animation", film_and_animation_comments_us),
        ("Howto and Style", howto_and_style_comments_us),
        ("Sports", sports_comments_us),
        ("People and Blogs", people_and_blogs_comments_us),
        ("Music", music_comments_us),
        ("Comedy", comedy_comments_us),
        ("Education", education_comments_us),
        ("News and Politics", news_and_politics_comments_us),
        ("Gaming", gaming_comments_us),
        ("Autos and vehicles", autos_and_vehicles_comments_us),
        ("Pets and Animals", pets_and_animals_comments_us),
        ("Travel and Events", travel_and_events_comments_us),
    )
}
len(dict_of_compounds_us)

In [None]:
# Small two-category selection of UK scores
metric = "compound"
dict_of_compounds = {
    label: scored_comments[metric]
    for label, scored_comments in (
        ("sports", sports_comments_gb),
        ("entertainment", entertainment_comments_gb),
    )
}

# VADER Results UK

In [None]:
# Bin edges covering the compound score range from -1 to 1 in steps of 0.25
bins = [-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75, 1]

num_rows = 2  # rows in the grid of plots
num_cols = 7  # columns in the grid of plots

fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20, 5))

# One histogram per UK category, filling the grid row by row
for position, (category, scores) in enumerate(dict_of_compounds_gb.items()):
    grid_row, grid_col = divmod(position, num_cols)
    ax = axes[grid_row, grid_col]
    ax.set_title(category)
    ax.hist(scores, bins=bins, color="skyblue", edgecolor="black")

    # Mark the median score with a dashed red line and show it in the legend
    median_value = scores.median()
    ax.axvline(median_value, color='red', linestyle='--', linewidth=2, label=f'Median: {median_value:.2f}')
    ax.legend()

plt.tight_layout()  # keep the subplots from overlapping
plt.show()

# VADER Results US

In [None]:
# Bin edges covering the compound score range from -1 to 1 in steps of 0.25
bins = [-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75, 1]

num_rows = 2  # rows in the grid of plots
num_cols = 7  # columns in the grid of plots

fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20, 5))

# One histogram per US category, filling the grid row by row
for position, (category, scores) in enumerate(dict_of_compounds_us.items()):
    grid_row, grid_col = divmod(position, num_cols)
    ax = axes[grid_row, grid_col]
    ax.set_title(category)
    ax.hist(scores, bins=bins, color="skyblue", edgecolor="black")

    # Mark the median score with a dashed red line and show it in the legend
    median_value = scores.median()
    ax.axvline(median_value, color='red', linestyle='--', linewidth=2, label=f'Median: {median_value:.2f}')
    ax.legend()

plt.tight_layout()  # keep the subplots from overlapping
plt.show()