In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style globally so it is in effect for every plot drawn afterwards.
sns.set(style="whitegrid")


In [None]:
# Read the US and GB video CSV files (in that order) into one DataFrame each.
us_df, gb_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/USvideos.csv", "data/GBvideos.csv")
)


In [None]:
# Quick structural overview of the US frame: dtypes/non-null counts, summary
# statistics of the numeric columns, and the number of missing values per column.
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
us_df.info()
print(us_df.describe())
print(us_df.isnull().sum())


In [None]:
# Drop rows that are exact duplicates across all columns in each country's frame.
us_df, gb_df = us_df.drop_duplicates(), gb_df.drop_duplicates()

# Convert `trending_date` to datetimes using a two-digit-year.day.month layout.
# With errors="coerce", values that do not match the layout become NaT instead of raising.
for frame in (us_df, gb_df):
    frame["trending_date"] = pd.to_datetime(
        frame["trending_date"],
        format="%y.%d.%m",
        errors="coerce",
    )


In [None]:
# Per-video like/dislike ratio, computed column-wise rather than with a row-wise
# apply: this avoids a Python-level call per row and also works on a frame with no rows.
# `where` keeps only the non-zero dislike counts (zeros become NaN), so videos with
# zero dislikes get a NaN ratio instead of a division by zero.
us_df["like_dislike_ratio"] = us_df["likes"] / us_df["dislikes"].where(us_df["dislikes"] != 0)
gb_df["like_dislike_ratio"] = gb_df["likes"] / gb_df["dislikes"].where(gb_df["dislikes"] != 0)


In [None]:
# Country-level ratio: mean of `likes` divided by mean of `dislikes`
# (a ratio of averages, not an average of the per-video ratios).
us_mean_likes, us_mean_dislikes = us_df["likes"].mean(), us_df["dislikes"].mean()
gb_mean_likes, gb_mean_dislikes = gb_df["likes"].mean(), gb_df["dislikes"].mean()

us_avg_ratio = us_mean_likes / us_mean_dislikes
gb_avg_ratio = gb_mean_likes / gb_mean_dislikes

# Report both ratios rounded to two decimals.
print(f"US Avg Like-Dislike Ratio (mean likes / mean dislikes): {us_avg_ratio:.2f}")
print(f"GB Avg Like-Dislike Ratio (mean likes / mean dislikes): {gb_avg_ratio:.2f}")


In [None]:
# Row masks selecting entries whose `trending_date` year is 2018
# (rows with a missing date compare unequal and are left out).
us_in_2018 = us_df["trending_date"].dt.year.eq(2018)
gb_in_2018 = gb_df["trending_date"].dt.year.eq(2018)

us_2018 = us_df.loc[us_in_2018]
gb_2018 = gb_df.loc[gb_in_2018]

# Mean of `likes` over the 2018 rows of each country.
us_2018_avg_likes = us_2018["likes"].mean()
gb_2018_avg_likes = gb_2018["likes"].mean()

print(f"2018 US Avg Likes: {us_2018_avg_likes:.2f}")
print(f"2018 GB Avg Likes: {gb_2018_avg_likes:.2f}")


In [None]:
def is_polarizing(row, lower=0.8, upper=1.2):
    """Return whether a row's like/dislike ratio lies in a band around an even split.

    Args:
        row: Mapping-like object (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with a 'like_dislike_ratio' entry.
        lower: Inclusive lower bound of the band. Defaults to 0.8.
        upper: Inclusive upper bound of the band. Defaults to 1.2.

    Returns:
        True when ``lower <= ratio <= upper``; False when the ratio is outside
        the band or is missing (NaN / None).
    """
    ratio = row['like_dislike_ratio']
    # A missing ratio is never counted as polarizing.
    if pd.isna(ratio):
        return False
    return lower <= ratio <= upper

# Flag each row of both frames by calling `is_polarizing` once per row (axis=1)
# and storing the result in a new `is_polarizing` column.
for frame in (us_df, gb_df):
    frame['is_polarizing'] = frame.apply(is_polarizing, axis=1)


In [None]:
# Use the `is_polarizing` flag directly as a row mask; `~` selects the complement.
# NOTE(review): assumes the flag column has boolean dtype — confirm if it is built differently.
us_flag = us_df['is_polarizing']
gb_flag = gb_df['is_polarizing']

us_polarizing, us_nonpolarizing = us_df[us_flag], us_df[~us_flag]
gb_polarizing, gb_nonpolarizing = gb_df[gb_flag], gb_df[~gb_flag]

# Each pair printed below is (polarizing mean, non-polarizing mean).
# NOTE(review): only the US subsets are reported in this cell; the GB subsets are
# built but not printed here.
print("US - Avg Views:")
print(us_polarizing["views"].mean(), us_nonpolarizing["views"].mean())

print("US - Avg Comments:")
print(
    us_polarizing["comment_count"].mean(),
    us_nonpolarizing["comment_count"].mean(),
)


In [None]:
# Box plot of `views` for the US frame, one box per value of the `is_polarizing` flag.
fig, ax = plt.subplots()
sns.boxplot(data=us_df, x='is_polarizing', y='views', ax=ax)
ax.set_title("Views by Polarization (US)")
plt.show()
