# YouTube Video Data Analysis (US & GB)

This notebook performs data cleaning, summarization, and exploratory analysis on YouTube trending videos in the US and Great Britain. It helps a digital marketing agency analyze whether polarizing videos (i.e., those with roughly equal numbers of likes and dislikes, defined here as a like/dislike ratio between 0.8 and 1.2) get more engagement.

## Goals:
1. Load and clean the datasets.
2. Create and analyze the like/dislike ratio.
3. Compare average likes between countries for 2018.
4. Determine if polarizing videos are more engaging.
5. Visualize the results and draw insights for business decisions.


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set the seaborn plotting style to "whitegrid" for the figures drawn later in this notebook
sns.set(style="whitegrid")


In [None]:
# Read the US and GB trending-video CSV files into one DataFrame per country
us_df, gb_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/USvideos.csv", "data/GBvideos.csv")
)


In [None]:
# Display basic information about the datasets.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly: wrapping it in print() would emit a stray "None" line
# after each summary.
print("US Dataset Info:")
us_df.info()
print("\nGB Dataset Info:")
gb_df.info()


In [None]:
# Clean each dataset in one chained step:
#   1. drop fully duplicated rows so repeated entries are not counted twice;
#   2. parse 'trending_date' using the year.day.month format ('%y.%d.%m');
#      values that cannot be parsed are coerced to NaT instead of raising.
us_df = us_df.drop_duplicates().assign(
    trending_date=lambda frame: pd.to_datetime(
        frame["trending_date"], format='%y.%d.%m', errors='coerce'
    )
)
gb_df = gb_df.drop_duplicates().assign(
    trending_date=lambda frame: pd.to_datetime(
        frame["trending_date"], format='%y.%d.%m', errors='coerce'
    )
)


In [None]:
# Create a like/dislike ratio while avoiding division by zero.
# Vectorized form: rows with zero dislikes are masked to NaN in the denominator
# (Series.where keeps values where the condition holds and fills NaN elsewhere),
# so the division yields NaN for them rather than inf or an error. This avoids
# a Python-level call per row (DataFrame.apply with axis=1) and also behaves
# sensibly on an empty frame.
us_df["like_dislike_ratio"] = us_df["likes"] / us_df["dislikes"].where(us_df["dislikes"] != 0)
gb_df["like_dislike_ratio"] = gb_df["likes"] / gb_df["dislikes"].where(gb_df["dislikes"] != 0)


In [None]:
# Overall like/dislike ratio per country: mean likes divided by mean dislikes.
# Note this is a ratio of averages, not the average of the per-video ratios.
us_mean_likes = us_df["likes"].mean()
us_mean_dislikes = us_df["dislikes"].mean()
us_avg_ratio = us_mean_likes / us_mean_dislikes

gb_mean_likes = gb_df["likes"].mean()
gb_mean_dislikes = gb_df["dislikes"].mean()
gb_avg_ratio = gb_mean_likes / gb_mean_dislikes

print(f"US Avg Like-Dislike Ratio (mean likes / mean dislikes): {us_avg_ratio:.2f}")  # Example: 5.07
print(f"GB Avg Like-Dislike Ratio (mean likes / mean dislikes): {gb_avg_ratio:.2f}")  # Example: 4.90


In [None]:
# Keep only rows whose trending date falls in 2018, then average their likes.
# Rows with a missing (NaT) trending date never match and are excluded.
us_in_2018 = us_df["trending_date"].dt.year.eq(2018)
gb_in_2018 = gb_df["trending_date"].dt.year.eq(2018)

us_2018 = us_df.loc[us_in_2018]
gb_2018 = gb_df.loc[gb_in_2018]

us_2018_avg_likes = us_2018["likes"].mean()
gb_2018_avg_likes = gb_2018["likes"].mean()

print(f"2018 US Avg Likes: {us_2018_avg_likes:.2f}")  # Example: ~68227
print(f"2018 GB Avg Likes: {gb_2018_avg_likes:.2f}")  # Example: ~43219


In [None]:
# Define videos as 'polarizing' if their like/dislike ratio is between 0.8 and 1.2
# (the default band; pass `low` / `high` to try other thresholds)
def is_polarizing(row, low=0.8, high=1.2):
    """Tell whether a video's like/dislike ratio lies inside the polarizing band.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'like_dislike_ratio', e.g. a DataFrame
        row passed by ``DataFrame.apply(..., axis=1)``.
    low, high : float, optional
        Inclusive lower and upper bounds of the band. They default to 0.8 and
        1.2, so existing single-argument calls behave as before.

    Returns
    -------
    bool
        False when the ratio is missing (NaN/None); otherwise the result of
        ``low <= ratio <= high``.
    """
    ratio = row['like_dislike_ratio']
    # A missing ratio (e.g. a video with zero dislikes) is never polarizing.
    if pd.isna(ratio):
        return False
    return low <= ratio <= high

# Flag every row of both country datasets with the polarizing classification
for country_df in (us_df, gb_df):
    country_df['is_polarizing'] = country_df.apply(is_polarizing, axis=1)


In [None]:
# Compare engagement (views and comment counts) between polarizing and non-polarizing videos.
# 'is_polarizing' is a boolean column, so it is used directly as the row mask
# (and negated with ~) instead of being compared against True / False.
us_polarizing = us_df[us_df['is_polarizing']]
us_nonpolarizing = us_df[~us_df['is_polarizing']]

gb_polarizing = gb_df[gb_df['is_polarizing']]
gb_nonpolarizing = gb_df[~gb_df['is_polarizing']]

print("US - Avg Views (Polarizing vs Non-Polarizing):")
print(us_polarizing["views"].mean(), us_nonpolarizing["views"].mean())

print("US - Avg Comments (Polarizing vs Non-Polarizing):")
print(us_polarizing["comment_count"].mean(), us_nonpolarizing["comment_count"].mean())

# Report the same comparison for GB: the GB subsets were previously computed
# but never shown, leaving the analysis one-sided.
print("GB - Avg Views (Polarizing vs Non-Polarizing):")
print(gb_polarizing["views"].mean(), gb_nonpolarizing["views"].mean())

print("GB - Avg Comments (Polarizing vs Non-Polarizing):")
print(gb_polarizing["comment_count"].mean(), gb_nonpolarizing["comment_count"].mean())


In [None]:
# Box plot of US view counts, split by whether the video is flagged as polarizing
fig, ax = plt.subplots()
sns.boxplot(data=us_df, x='is_polarizing', y='views', ax=ax)
ax.set_title("Views by Polarization (US)")
ax.set_xlabel("Polarizing Video")
ax.set_ylabel("Views")
plt.show()
