In [None]:
# Libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Directory that every plt.savefig call in this notebook writes into.
# The trailing slash is required: file names are appended with plain string concatenation.
save_dir = './Trending Video Likes to Views Ratio Analysis PLOTS/'
# savefig does not create missing directories, so make sure the folder exists up front.
os.makedirs(save_dir, exist_ok=True)

## Trending Videos by Year

In [None]:
# Load the two yearly '_all' CSV files, 2018 first and 2020 second.
total_2018_df, total_2020_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './transformed-data/data_2018_all.csv',
        './transformed-data/data_2020_all.csv',
    )
)

### Total Trending Videos

In [None]:
# Yearly trending-video totals: one bar per year, bar height = number of rows
# in that year's frame.
years = ['2018', '2020']
totals = [len(total_2018_df), len(total_2020_df)]
x_pos = np.arange(len(years))
title = 'Total Trending Videos in 2018 and 2020'

fig, ax = plt.subplots(figsize=(16, 8), facecolor='#FFFFFF')
ax.bar(x_pos, totals, align='center')
ax.set_xticks(x_pos)
ax.set_xticklabels(years)
ax.set(xlabel='Years', ylabel='Total Trending Videos', title=title)
ax.yaxis.grid(True)
fig.tight_layout()

# Saving is left disabled for this figure:
# file_title = title + '.png'
# plt.savefig(save_dir + file_title, dpi=100,  transparent=False)

### Total Trending Videos by Category by Year

In [None]:
# Total trending videos by category by year (all data).
# Count titles per category on the selected column only, instead of counting
# every column of the frame and discarding all but one.
cats_count_2018_series = total_2018_df.groupby('cat_name')['title'].count()
cats_count_2020_series = total_2020_df.groupby('cat_name')['title'].count()

# Every category seen in either year, sorted so the bar order is stable between runs.
cats = sorted(set(cats_count_2018_series.index) | set(cats_count_2020_series.index))

# A category missing from one year is plotted as an explicit 0.
# The earlier `else: 0` branch was a bare expression that assigned nothing, so a
# missing category silently reused the previous category's value (or raised
# NameError when it was the first one).
counts_2018 = cats_count_2018_series.reindex(cats, fill_value=0)
counts_2020 = cats_count_2020_series.reindex(cats, fill_value=0)

fig, ax = plt.subplots()
fig.patch.set_facecolor('#FFFFFF')
fig.set_size_inches(16, 8)
width = 3
# One group every (3 * width + 1) x-units; the two bars of a group touch at the tick.
x_ticks = np.arange(len(cats)) * (width * 3 + 1)
ax.bar(x_ticks - width / 2, counts_2018, width=width, color='r', align='center', label='2018')
ax.bar(x_ticks + width / 2, counts_2020, width=width, color='b', align='center', label='2020')

ax.set_xticks(x_ticks)
ax.set_xticklabels(cats)
ax.legend()
ax.yaxis.grid(True)
ax.set_ylabel('Total Trending Videos')
ax.set_xlabel('Video Categories')
title = 'Total Trending Videos by Category and Year'
ax.set_title(title)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.tight_layout()

file_title = title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

### Number of Videos per Category

In [None]:
# Side-by-side count of trending videos per category for 2018 and 2020.
# merge() defaults to an inner join, so only categories present in both years are listed.
total_cat_count_merged = cats_count_2018_series.to_frame().merge(cats_count_2020_series, on='cat_name')
# Display table. Fix: the 2020 column ("title_y") used to be labelled
# "Trending Videos 2018" as well, producing two identically named columns.
(
    total_cat_count_merged
    .sort_values(by='cat_name')
    .reset_index()
    .rename(columns={
        "cat_name": "Category",
        "title_x": "Trending Videos 2018",
        "title_y": "Trending Videos 2020",
    })
)

### Average Likes to Views Ratio by Year

In [None]:
# Mean of the per-video likes_to_views column for each year, one bar per year.
# Despite the `_series` suffix, both values are scalars (the mean of a column).
ratio_2018_series = total_2018_df['likes_to_views'].mean()
ratio_2020_series = total_2020_df['likes_to_views'].mean()
data = [ratio_2018_series, ratio_2020_series]
years = ['2018', '2020']
x_pos = np.arange(len(years))
title = 'Average Likes to View Ratio by Year'

fig, ax = plt.subplots(figsize=(16, 8), facecolor='#FFFFFF')
ax.bar(x_pos, data, align='center')
ax.set_xticks(x_pos)
ax.set_xticklabels(years)
ax.set(xlabel='Year', ylabel='Average Likes to View Ratio', title=title)
ax.yaxis.grid(True)
fig.tight_layout()

# Saving is left disabled for this figure:
# file_title = title + '.png'
# plt.savefig(save_dir + file_title, dpi=100)

### Summary of All Data

In [None]:
# Summary of views, likes and likes-to-views ratio for the combined yearly data.
# The first two figures are sums over all rows, so they are labelled as totals
# (the old "Total Views per video" label did not match the .sum() it printed).
print("Total Views 2018: " + str(total_2018_df['views'].sum()))
print("Total Views 2020: " + str(total_2020_df['views'].sum()))
print("Average Likes per video 2018: " + str(total_2018_df['likes'].mean()))
print("Average Likes per video 2020: " + str(total_2020_df['likes'].mean()))
# The ratios are recomputed here instead of read from ratio_2018_series /
# ratio_2020_series: later cells rebind those names to per-category Series, so
# re-running this cell afterwards would print the wrong values.
print("Average Likes to Views ratio 2018: {}" .format(total_2018_df['likes_to_views'].mean()))
print("Average Likes to Views ratio 2020: {}" .format(total_2020_df['likes_to_views'].mean()))

### Average Likes to Views Ratio by Category by Year

In [None]:
# Average likes-to-views ratio by category by year (all data).
# Select the column before averaging: calling .mean() on the whole grouped frame
# tries to average every column and raises TypeError on pandas >= 2.0 when the
# frame contains text columns.
ratio_2018_series = total_2018_df.groupby('cat_name')['likes_to_views'].mean()
ratio_2020_series = total_2020_df.groupby('cat_name')['likes_to_views'].mean()

# Every category seen in either year, sorted so the bar order is stable between runs.
cats = sorted(set(ratio_2018_series.index) | set(ratio_2020_series.index))

# A category missing from one year is plotted as an explicit 0.
# The earlier `else: 0` branch was a bare expression that assigned nothing, so a
# missing category silently reused the previous category's value (or raised
# NameError when it was the first one).
ratios_2018 = ratio_2018_series.reindex(cats, fill_value=0)
ratios_2020 = ratio_2020_series.reindex(cats, fill_value=0)

fig, ax = plt.subplots()
fig.patch.set_facecolor('#FFFFFF')
fig.set_size_inches(16, 8)
width = 3
# One group every (3 * width + 1) x-units; the two bars of a group touch at the tick.
x_ticks = np.arange(len(cats)) * (width * 3 + 1)
ax.bar(x_ticks - width / 2, ratios_2018, width=width, color='r', align='center', label='2018')
ax.bar(x_ticks + width / 2, ratios_2020, width=width, color='b', align='center', label='2020')

ax.set_xticks(x_ticks)
ax.set_xticklabels(cats)
ax.legend()
title = 'Total Average Likes to Views Ratio by Category and Year'
ax.set_title(title)
ax.set_ylabel('Average Likes to Views Ratio')
ax.set_xlabel('Video Category')
ax.yaxis.grid(True)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.tight_layout()

file_title = title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

In [None]:
# Table of the per-category average ratio for both years, ordered by category.
# merge() suffixes the two identically named columns with _x (2018) and _y (2020).
avg_cat_ratio_merged = (
    ratio_2018_series.to_frame()
    .merge(ratio_2020_series, on='cat_name')
    .sort_values(by='cat_name')
    .reset_index()
    .rename(columns={
        "cat_name": "Category",
        "likes_to_views_x": "Avg L-T-V Ratio 2018",
        "likes_to_views_y": "Avg L-T-V Ratio 2020",
    })
)
avg_cat_ratio_merged

### Save Data for Analysis

In [None]:
# Per-category aggregates for each year: row count, summed views/likes/dislikes,
# and likes as a percentage of views computed from those sums.
# Movies and Shows are dropped from 2018 because YouTube appears to have removed
# those categories:
# https://techpostplus.com/youtube-video-categories-list-faqs-and-solutions/
cat_count_total_2018 = (
    total_2018_df
    .groupby('cat_name')
    .agg({'category_id': 'count', 'views' : 'sum', 'likes': 'sum', 'dislikes': 'sum'})
    .reset_index()
    .assign(likes_to_views=lambda agg: (agg['likes'] / agg['views']) * 100)
    .rename(columns={'category_id':'cat_count'})
    .loc[lambda agg: ~agg['cat_name'].isin(['Movies', 'Shows'])]
    .sort_values(by='likes_to_views')
)

cat_count_total_2020 = (
    total_2020_df
    .groupby('cat_name')
    .agg({'category_id': 'count', 'views' : 'sum', 'likes': 'sum', 'dislikes': 'sum'})
    .reset_index()
    .assign(likes_to_views=lambda agg: (agg['likes'] / agg['views']) * 100)
    .rename(columns={'category_id':'cat_count'})
    .sort_values(by='likes_to_views')
)

# Pair the two years per category; shared column names get _2018 / _2020 suffixes.
cat_count_total_merged = pd.merge(
    cat_count_total_2018,
    cat_count_total_2020,
    left_on='cat_name',
    right_on='cat_name',
    suffixes=("_2018","_2020"),
)

# Persist only the ratio columns for the statistics analysis.
total_merged_likes_to_views = cat_count_total_merged.loc[:, ['cat_name', 'likes_to_views_2018', 'likes_to_views_2020']]
total_merged_likes_to_views.to_csv('./transformed-data/total_merged_likes_to_views.csv', index=False)

### Total Likes by Year

In [None]:
# Total likes per category, 2018 next to 2020.
cat_count_total_merged = pd.merge(
    cat_count_total_2018,
    cat_count_total_2020,
    left_on='cat_name',
    right_on='cat_name',
    suffixes=("_2018","_2020"),
)

# The likes-only slice is printed and also saved for the statistics analysis.
total_merged_likes = cat_count_total_merged[['cat_name', 'likes_2018', 'likes_2020']]
print(total_merged_likes)
total_merged_likes.to_csv('./transformed-data/total_merged_likes.csv', index=False)

plt.figure(figsize=(15,8))
ax = plt.gca()

# Two bars per category, shifted left and right of the integer tick position.
ypos = np.arange(len(cat_count_total_merged))
ax.bar(ypos-0.2, cat_count_total_merged['likes_2018'], width=0.4, label='2018 Likes', color='red')
ax.bar(ypos+0.2, cat_count_total_merged['likes_2020'], width=0.4, label='2020 Likes', color='blue')

ax.set_xticks(ypos)
ax.set_xticklabels(cat_count_total_merged['cat_name'])
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.yaxis.grid(True)
ax.set_xlabel('Video Category')
ax.set_ylabel('Likes')
ax.legend()
title = "Total Likes by Category by Year"
ax.set_title(title)
plt.tight_layout()

file_title = title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

## Canada Data

In [None]:
# Load the Canada CSV files, 2018 first and 2020 second.
ca_2018_df, ca_2020_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './transformed-data/2018_CA_data.csv',
        './transformed-data/2020_CA_data.csv',
    )
)

### Total Trending Videos in Canada

In [None]:
# Canada: trending-video totals per year (row count of each year's frame),
# printed and drawn as one bar per year.
years = ['2018', '2020']
totals = [len(ca_2018_df), len(ca_2020_df)]
print("Trending Video Count Canada 2018: " + str(totals[0]))
print("Trending Video Count Canada 2020: " + str(totals[1]))
x_pos = np.arange(len(years))
title = 'Total Trending Videos Canada'

fig, ax = plt.subplots(figsize=(16, 8), facecolor='#FFFFFF')
ax.bar(x_pos, totals, align='center')
ax.set_xticks(x_pos)
ax.set_xticklabels(years)
ax.set(xlabel='Year', ylabel='Total trending videos', title=title)
ax.yaxis.grid(True)
fig.tight_layout()

# Saving is left disabled for this figure:
# file_title ='CA - ' + title + '.png'
# plt.savefig(save_dir + file_title, dpi=100)

### Total Trending Videos by Category by Year in Canada

In [None]:
# Canada: total trending videos by category by year.
# Count titles per category on the selected column only, instead of counting
# every column of the frame and discarding all but one.
cats_count_2018_series = ca_2018_df.groupby('cat_name')['title'].count()
cats_count_2020_series = ca_2020_df.groupby('cat_name')['title'].count()

# Every category seen in either year, sorted so the bar order is stable between runs.
cats = sorted(set(cats_count_2018_series.index) | set(cats_count_2020_series.index))

# A category missing from one year is plotted as an explicit 0.
# The earlier `else: 0` branch was a bare expression that assigned nothing, so a
# missing category silently reused the previous category's value (or raised
# NameError when it was the first one).
counts_2018 = cats_count_2018_series.reindex(cats, fill_value=0)
counts_2020 = cats_count_2020_series.reindex(cats, fill_value=0)

fig, ax = plt.subplots()
fig.patch.set_facecolor('#FFFFFF')
fig.set_size_inches(16, 8)
width = 3
# One group every (3 * width + 1) x-units; the two bars of a group touch at the tick.
x_ticks = np.arange(len(cats)) * (width * 3 + 1)
ax.bar(x_ticks - width / 2, counts_2018, width=width, color='r', align='center', label='2018')
ax.bar(x_ticks + width / 2, counts_2020, width=width, color='b', align='center', label='2020')

ax.set_xticks(x_ticks)
ax.set_xticklabels(cats)
ax.legend()
ax.yaxis.grid(True)
ax.set_ylabel('Total Trending Videos')
ax.set_xlabel('Video Categories')
title = 'Total Trending Videos by Category and Year Canada'
ax.set_title(title)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.tight_layout()

file_title ='CA - ' + title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

### Number of Videos per Category

In [None]:
# Canada: side-by-side count of trending videos per category for 2018 and 2020.
# merge() defaults to an inner join, so only categories present in both years are listed.
ca_cat_count_merged = cats_count_2018_series.to_frame().merge(cats_count_2020_series, on='cat_name')
# Display table. Fix: the 2020 column ("title_y") used to be labelled
# "Trending Videos 2018" as well, producing two identically named columns.
(
    ca_cat_count_merged
    .sort_values(by='cat_name')
    .reset_index()
    .rename(columns={
        "cat_name": "Category",
        "title_x": "Trending Videos 2018",
        "title_y": "Trending Videos 2020",
    })
)

### Average Likes to Views Ratio by Year

In [None]:
# Canada: mean of the per-video likes_to_views column for each year, one bar per year.
# Despite the `_series` suffix, both values are scalars (the mean of a column).
ratio_2018_series = ca_2018_df['likes_to_views'].mean()
ratio_2020_series = ca_2020_df['likes_to_views'].mean()
data = [ratio_2018_series, ratio_2020_series]
years = ['2018', '2020']
x_pos = np.arange(len(years))
title = 'Average Likes to Views Ratio by Year Canada'

fig, ax = plt.subplots(figsize=(16, 8), facecolor='#FFFFFF')
ax.bar(x_pos, data, align='center')
ax.set_xticks(x_pos)
ax.set_xticklabels(years)
ax.set(xlabel='Year', ylabel='Average Likes to Views Ratio', title=title)
ax.yaxis.grid(True)
fig.tight_layout()

file_title ='CA - ' + title + '.png'
fig.savefig(save_dir + file_title, dpi=100)

### Summary of Canada Data

In [None]:
# Summary of views, likes and likes-to-views ratio for Canada.
# The first two figures are sums over all rows, so they are labelled as totals
# (the old "Total Views per video" label did not match the .sum() it printed).
print("Total Views 2018: " + str(ca_2018_df['views'].sum()))
print("Total Views 2020: " + str(ca_2020_df['views'].sum()))
print("Average Likes per video 2018: " + str(ca_2018_df['likes'].mean()))
print("Average Likes per video 2020: " + str(ca_2020_df['likes'].mean()))
# The ratios are recomputed here instead of read from ratio_2018_series /
# ratio_2020_series: other cells rebind those names (per-category Series, other
# countries), so re-running this cell out of order would print the wrong values.
print("Average Like to view ratio 2018: {}" .format(ca_2018_df['likes_to_views'].mean()))
print("Average Like to view ratio 2020: {}" .format(ca_2020_df['likes_to_views'].mean()))

### Average Likes to Views Ratio by Category by Year

In [None]:
# Canada: average likes-to-views ratio by category by year.
# Select the column before averaging: calling .mean() on the whole grouped frame
# tries to average every column and raises TypeError on pandas >= 2.0 when the
# frame contains text columns.
ratio_2018_series = ca_2018_df.groupby('cat_name')['likes_to_views'].mean()
ratio_2020_series = ca_2020_df.groupby('cat_name')['likes_to_views'].mean()

# Every category seen in either year, sorted so the bar order is stable between runs.
cats = sorted(set(ratio_2018_series.index) | set(ratio_2020_series.index))

# A category missing from one year is plotted as an explicit 0.
# The earlier `else: 0` branch was a bare expression that assigned nothing, so a
# missing category silently reused the previous category's value (or raised
# NameError when it was the first one).
ratios_2018 = ratio_2018_series.reindex(cats, fill_value=0)
ratios_2020 = ratio_2020_series.reindex(cats, fill_value=0)

fig, ax = plt.subplots()
fig.patch.set_facecolor('#FFFFFF')
fig.set_size_inches(16, 8)
width = 3
# One group every (3 * width + 1) x-units; the two bars of a group touch at the tick.
x_ticks = np.arange(len(cats)) * (width * 3 + 1)
ax.bar(x_ticks - width / 2, ratios_2018, width=width, color='r', align='center', label='2018')
ax.bar(x_ticks + width / 2, ratios_2020, width=width, color='b', align='center', label='2020')

ax.set_xticks(x_ticks)
ax.set_xticklabels(cats)
ax.legend()
title = 'Average Likes to Views Ratio by Category Canada'
ax.set_title(title)
ax.set_ylabel('Average Likes to Views Ratio')
ax.set_xlabel('Video Category')
ax.yaxis.grid(True)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.tight_layout()

file_title ='CA - ' + title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

In [None]:
# Canada: table of the per-category average ratio for both years, ordered by category.
# merge() suffixes the two identically named columns with _x (2018) and _y (2020).
avg_cat_ratio_merged = (
    ratio_2018_series.to_frame()
    .merge(ratio_2020_series, on='cat_name')
    .sort_values(by='cat_name')
    .reset_index()
    .rename(columns={
        "cat_name": "Category",
        "likes_to_views_x": "Avg L-T-V Ratio 2018",
        "likes_to_views_y": "Avg L-T-V Ratio 2020",
    })
)
avg_cat_ratio_merged

### Save Data for Analysis

In [None]:
# Canada: per-category aggregates for each year: row count, summed
# views/likes/dislikes, and likes as a percentage of views computed from those sums.
# Movies and Shows are dropped from 2018 because YouTube appears to have removed
# those categories:
# https://techpostplus.com/youtube-video-categories-list-faqs-and-solutions/
cat_count_ca_2018 = (
    ca_2018_df
    .groupby('cat_name')
    .agg({'category_id': 'count', 'views' : 'sum', 'likes': 'sum', 'dislikes': 'sum'})
    .reset_index()
    .assign(likes_to_views=lambda agg: (agg['likes'] / agg['views']) * 100)
    .rename(columns={'category_id':'cat_count'})
    .loc[lambda agg: ~agg['cat_name'].isin(['Movies', 'Shows'])]
    .sort_values(by='likes_to_views')
)

cat_count_ca_2020 = (
    ca_2020_df
    .groupby('cat_name')
    .agg({'category_id': 'count', 'views' : 'sum', 'likes': 'sum', 'dislikes': 'sum'})
    .reset_index()
    .assign(likes_to_views=lambda agg: (agg['likes'] / agg['views']) * 100)
    .rename(columns={'category_id':'cat_count'})
    .sort_values(by='likes_to_views')
)

# Pair the two years per category; shared column names get _2018 / _2020 suffixes.
cat_count_ca_merged = pd.merge(
    cat_count_ca_2018,
    cat_count_ca_2020,
    left_on='cat_name',
    right_on='cat_name',
    suffixes=("_2018","_2020"),
)

# Persist only the ratio columns for the statistics analysis.
ca_merged_likes_to_views = cat_count_ca_merged.loc[:, ['cat_name', 'likes_to_views_2018', 'likes_to_views_2020']]
ca_merged_likes_to_views.to_csv('./transformed-data/ca_merged_likes_to_views.csv', index=False)

### Total Likes by Year Canada

In [None]:
# Canada: total likes per category, 2018 next to 2020.
cat_count_ca_merged = pd.merge(
    cat_count_ca_2018,
    cat_count_ca_2020,
    left_on='cat_name',
    right_on='cat_name',
    suffixes=("_2018","_2020"),
)

# The likes-only slice is printed and also saved for the statistics analysis.
ca_merged_likes = cat_count_ca_merged[['cat_name', 'likes_2018', 'likes_2020']]
print(ca_merged_likes)
ca_merged_likes.to_csv('./transformed-data/ca_merged_likes.csv', index=False)

plt.figure(figsize=(15,8))
ax = plt.gca()

# Two bars per category, shifted left and right of the integer tick position.
ypos = np.arange(len(cat_count_ca_merged))
ax.bar(ypos-0.2, cat_count_ca_merged['likes_2018'], width=0.4, label='2018 Likes', color='red')
ax.bar(ypos+0.2, cat_count_ca_merged['likes_2020'], width=0.4, label='2020 Likes', color='blue')

ax.set_xticks(ypos)
ax.set_xticklabels(cat_count_ca_merged['cat_name'])
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.yaxis.grid(True)
ax.set_xlabel('Video Category')
ax.set_ylabel('Likes')
ax.legend()
title = "Likes by Category by Year Canada"
ax.set_title(title)
plt.tight_layout()

file_title ='CA - ' + title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

## Great Britain Data

In [None]:
# Load the Great Britain CSV files, 2018 first and 2020 second.
gb_2018_df, gb_2020_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './transformed-data/2018_GB_data.csv',
        './transformed-data/2020_GB_data.csv',
    )
)

### Total Trending Videos in GB

In [None]:
# Great Britain: trending-video totals per year (row count of each year's frame),
# printed and drawn as one bar per year.
totals = [gb_2018_df.shape[0], gb_2020_df.shape[0]]  # number of rows of each year
print("Trending Video Count Great Britain 2018: " + str(gb_2018_df.shape[0]))
print("Trending Video Count Great Britain 2020: " + str(gb_2020_df.shape[0]))

fig, ax = plt.subplots()
fig.patch.set_facecolor('#FFFFFF')
fig.set_size_inches(16, 8)
years = ['2018', '2020']
x_pos = np.arange(len(years))
ax.bar(x_pos, totals, align='center')
ax.set_ylabel('Total Trending Videos')
ax.set_xlabel('Year')
ax.set_xticks(x_pos)
ax.set_xticklabels(years)
# Assign `title` like the other plotting cells do. It was previously passed
# inline, so the disabled save snippet below would have picked up a stale title
# left over from an earlier cell.
title = 'Total Trending Videos Great Britain'
ax.set_title(title)
ax.yaxis.grid(True)
plt.tight_layout()

# Saving is left disabled for this figure:
# file_title = 'GB - ' + title + '.png'
# plt.savefig(save_dir + file_title, dpi=100)

### Total Trending Videos by Category by Year

In [None]:
# Great Britain: total trending videos by category by year.
# Count titles per category on the selected column only, instead of counting
# every column of the frame and discarding all but one.
cats_count_2018_series = gb_2018_df.groupby('cat_name')['title'].count()
cats_count_2020_series = gb_2020_df.groupby('cat_name')['title'].count()

# Every category seen in either year, sorted so the bar order is stable between runs.
cats = sorted(set(cats_count_2018_series.index) | set(cats_count_2020_series.index))

# A category missing from one year is plotted as an explicit 0.
# The earlier `else: 0` branch was a bare expression that assigned nothing, so a
# missing category silently reused the previous category's value (or raised
# NameError when it was the first one).
counts_2018 = cats_count_2018_series.reindex(cats, fill_value=0)
counts_2020 = cats_count_2020_series.reindex(cats, fill_value=0)

fig, ax = plt.subplots()
fig.patch.set_facecolor('#FFFFFF')
fig.set_size_inches(16, 8)
width = 3
# One group every (3 * width + 1) x-units; the two bars of a group touch at the tick.
x_ticks = np.arange(len(cats)) * (width * 3 + 1)
ax.bar(x_ticks - width / 2, counts_2018, width=width, color='r', align='center', label='2018')
ax.bar(x_ticks + width / 2, counts_2020, width=width, color='b', align='center', label='2020')

ax.set_xticks(x_ticks)
ax.set_xticklabels(cats)
ax.legend()
ax.yaxis.grid(True)
ax.set_ylabel('Total trending videos')
ax.set_xlabel('Video Categories')
title = 'Total Trending Videos by Category and Year Great Britain'
ax.set_title(title)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.tight_layout()

file_title = 'GB - ' + title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

### Number of Videos by Category

In [None]:
# Great Britain: side-by-side count of trending videos per category for 2018 and 2020.
# merge() defaults to an inner join, so only categories present in both years are listed.
gb_cat_count_merged = cats_count_2018_series.to_frame().merge(cats_count_2020_series, on='cat_name')
# Display table. Fix: the 2020 column ("title_y") used to be labelled
# "Trending Videos 2018" as well, producing two identically named columns.
(
    gb_cat_count_merged
    .sort_values(by='cat_name')
    .reset_index()
    .rename(columns={
        "cat_name": "Category",
        "title_x": "Trending Videos 2018",
        "title_y": "Trending Videos 2020",
    })
)

### Average Likes to Views Ratio by Year

In [None]:
# Great Britain: mean of the per-video likes_to_views column for each year,
# one bar per year.
# Despite the `_series` suffix, both values are scalars (the mean of a column).
ratio_2018_series = gb_2018_df['likes_to_views'].mean()
ratio_2020_series = gb_2020_df['likes_to_views'].mean()
data = [ratio_2018_series, ratio_2020_series]
years = ['2018', '2020']
x_pos = np.arange(len(years))
title = 'Average Likes to Views Ratio by Year Great Britain'

fig, ax = plt.subplots(figsize=(16, 8), facecolor='#FFFFFF')
ax.bar(x_pos, data, align='center')
ax.set_xticks(x_pos)
ax.set_xticklabels(years)
ax.set(xlabel='Year', ylabel='Average Likes to Views Ratio', title=title)
ax.yaxis.grid(True)
fig.tight_layout()

file_title = 'GB - ' + title + '.png'
fig.savefig(save_dir + file_title, dpi=100)

### Summary of Great Britain Data

In [None]:
# Summary of views, likes and likes-to-views ratio for Great Britain.
# The first two figures are sums over all rows, so they are labelled as totals
# (the old "Total Views per video" label did not match the .sum() it printed).
print("Total Views 2018: " + str(gb_2018_df['views'].sum()))
print("Total Views 2020: " + str(gb_2020_df['views'].sum()))
print("Average Likes per video 2018: " + str(gb_2018_df['likes'].mean()))
print("Average Likes per video 2020: " + str(gb_2020_df['likes'].mean()))
# The ratios are recomputed here instead of read from ratio_2018_series /
# ratio_2020_series: other cells rebind those names (per-category Series, other
# countries), so re-running this cell out of order would print the wrong values.
print("Like to views ratio 2018: {}" .format(gb_2018_df['likes_to_views'].mean()))
print("Like to views ratio 2020: {}" .format(gb_2020_df['likes_to_views'].mean()))

### Average Likes to Views Ratio by Category by Year

In [None]:
# Great Britain: average likes-to-views ratio by category by year.
# Select the column before averaging: calling .mean() on the whole grouped frame
# tries to average every column and raises TypeError on pandas >= 2.0 when the
# frame contains text columns.
ratio_2018_series = gb_2018_df.groupby('cat_name')['likes_to_views'].mean()
ratio_2020_series = gb_2020_df.groupby('cat_name')['likes_to_views'].mean()

# Every category seen in either year, sorted so the bar order is stable between runs.
cats = sorted(set(ratio_2018_series.index) | set(ratio_2020_series.index))

# A category missing from one year is plotted as an explicit 0.
# The earlier `else: 0` branch was a bare expression that assigned nothing, so a
# missing category silently reused the previous category's value (or raised
# NameError when it was the first one).
ratios_2018 = ratio_2018_series.reindex(cats, fill_value=0)
ratios_2020 = ratio_2020_series.reindex(cats, fill_value=0)

fig, ax = plt.subplots()
fig.patch.set_facecolor('#FFFFFF')
fig.set_size_inches(16, 8)
width = 3
# One group every (3 * width + 1) x-units; the two bars of a group touch at the tick.
x_ticks = np.arange(len(cats)) * (width * 3 + 1)
ax.bar(x_ticks - width / 2, ratios_2018, width=width, color='r', align='center', label='2018')
ax.bar(x_ticks + width / 2, ratios_2020, width=width, color='b', align='center', label='2020')

ax.set_xticks(x_ticks)
ax.set_xticklabels(cats)
ax.legend()
title = 'Average Likes to Views Ratio by Category Great Britain'
ax.set_title(title)
ax.set_ylabel('Average Likes to Views Ratio')
ax.set_xlabel('Video Category')
ax.yaxis.grid(True)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.tight_layout()

file_title = 'GB - ' + title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

In [None]:
# Great Britain: table of the per-category average ratio for both years,
# ordered by category.
# merge() suffixes the two identically named columns with _x (2018) and _y (2020).
avg_cat_ratio_merged = (
    ratio_2018_series.to_frame()
    .merge(ratio_2020_series, on='cat_name')
    .sort_values(by='cat_name')
    .reset_index()
    .rename(columns={
        "cat_name": "Category",
        "likes_to_views_x": "Avg L-T-V Ratio 2018",
        "likes_to_views_y": "Avg L-T-V Ratio 2020",
    })
)
avg_cat_ratio_merged

### Save Data for Analysis

In [None]:
# Great Britain: per-category aggregates for each year: row count, summed
# views/likes/dislikes, and likes as a percentage of views computed from those sums.
cat_count_gb_2018 = gb_2018_df.groupby('cat_name').agg({'category_id': 'count', 'views' : 'sum', 'likes': 'sum', 'dislikes': 'sum'}).reset_index()
cat_count_gb_2018['likes_to_views'] = ((cat_count_gb_2018['likes'] / cat_count_gb_2018['views'])*100)
cat_count_gb_2018 = cat_count_gb_2018.rename(columns={'category_id':'cat_count'})
# appears that YouTube removed Shows and Movies categories
# https://techpostplus.com/youtube-video-categories-list-faqs-and-solutions/
cat_count_gb_2018 = cat_count_gb_2018[(cat_count_gb_2018['cat_name'] != 'Movies') & (cat_count_gb_2018['cat_name'] != 'Shows')]
cat_count_gb_2018 = cat_count_gb_2018.sort_values(by='likes_to_views')

cat_count_gb_2020 = gb_2020_df.groupby('cat_name').agg({'category_id': 'count', 'views' : 'sum', 'likes': 'sum', 'dislikes': 'sum'}).reset_index()
cat_count_gb_2020['likes_to_views'] = ((cat_count_gb_2020['likes'] / cat_count_gb_2020['views'])*100)
# Fix: the rename key contained a stray quote ('cate"gory_id'), so the rename
# matched nothing and the 2020 frame kept 'category_id'. The merged frame then
# had 'cat_count' and 'category_id' instead of 'cat_count_2018' / 'cat_count_2020'.
cat_count_gb_2020 = cat_count_gb_2020.rename(columns={'category_id':'cat_count'})
cat_count_gb_2020 = cat_count_gb_2020.sort_values(by='likes_to_views')

# Pair the two years per category; shared column names get _2018 / _2020 suffixes.
cat_count_gb_merged = cat_count_gb_2018.merge(cat_count_gb_2020, left_on='cat_name', right_on='cat_name', suffixes=("_2018","_2020"))

# save df for statistics analysis
gb_merged_likes_to_views = cat_count_gb_merged[['cat_name', 'likes_to_views_2018', 'likes_to_views_2020']]
gb_merged_likes_to_views.to_csv('./transformed-data/gb_merged_likes_to_views.csv', index=False)

### Great Britain Likes by Year

In [None]:
# Great Britain: total likes per category, 2018 next to 2020.
cat_count_gb_merged = pd.merge(
    cat_count_gb_2018,
    cat_count_gb_2020,
    left_on='cat_name',
    right_on='cat_name',
    suffixes=("_2018","_2020"),
)

# The likes-only slice is printed and also saved for the statistics analysis.
gb_merged_likes = cat_count_gb_merged[['cat_name', 'likes_2018', 'likes_2020']]
print(gb_merged_likes)
gb_merged_likes.to_csv('./transformed-data/gb_merged_likes.csv', index=False)

plt.figure(figsize=(15,8))
ax = plt.gca()

# Two bars per category, shifted left and right of the integer tick position.
ypos = np.arange(len(cat_count_gb_merged))
ax.bar(ypos-0.2, cat_count_gb_merged['likes_2018'], width=0.4, label='2018 Likes', color='red')
ax.bar(ypos+0.2, cat_count_gb_merged['likes_2020'], width=0.4, label='2020 Likes', color='blue')

ax.set_xticks(ypos)
ax.set_xticklabels(cat_count_gb_merged['cat_name'])
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.yaxis.grid(True)
ax.set_xlabel('Video Category')
ax.set_ylabel('Likes')
ax.legend()
title = "Likes by Category by Year Great Britain"
ax.set_title(title)
plt.tight_layout()

file_title ='GB - ' + title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

## USA Data

In [None]:
# Load the USA CSV files, 2018 first and 2020 second.
us_2018_df, us_2020_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './transformed-data/2018_US_data.csv',
        './transformed-data/2020_US_data.csv',
    )
)

### Total Trending Videos

In [None]:
# USA: trending-video totals per year (row count of each year's frame),
# printed and drawn as one bar per year.
years = ['2018', '2020']
totals = [len(us_2018_df), len(us_2020_df)]
print("Trending Video Count USA 2018: " + str(totals[0]))
print("Trending Video Count USA 2020: " + str(totals[1]))
x_pos = np.arange(len(years))
title = 'Total Trending Videos USA'

fig, ax = plt.subplots(figsize=(16, 8), facecolor='#FFFFFF')
ax.bar(x_pos, totals, align='center')
ax.set_xticks(x_pos)
ax.set_xticklabels(years)
ax.set(xlabel='Year', ylabel='Total Trending Videos', title=title)
ax.yaxis.grid(True)
fig.tight_layout()

# Saving is left disabled for this figure:
# file_title = 'US - ' + title + '.png'
# plt.savefig(save_dir + file_title, dpi=100)

### Total Trending Videos by Category by Year

In [None]:
# USA: total trending videos by category by year.
# Count titles per category on the selected column only, instead of counting
# every column of the frame and discarding all but one.
cats_count_2018_series = us_2018_df.groupby('cat_name')['title'].count()
cats_count_2020_series = us_2020_df.groupby('cat_name')['title'].count()

# Every category seen in either year, sorted so the bar order is stable between runs.
cats = sorted(set(cats_count_2018_series.index) | set(cats_count_2020_series.index))

# A category missing from one year is plotted as an explicit 0.
# The earlier `else: 0` branch was a bare expression that assigned nothing, so a
# missing category silently reused the previous category's value (or raised
# NameError when it was the first one).
counts_2018 = cats_count_2018_series.reindex(cats, fill_value=0)
counts_2020 = cats_count_2020_series.reindex(cats, fill_value=0)

fig, ax = plt.subplots()
fig.patch.set_facecolor('#FFFFFF')
fig.set_size_inches(16, 8)
width = 3
# One group every (3 * width + 1) x-units; the two bars of a group touch at the tick.
x_ticks = np.arange(len(cats)) * (width * 3 + 1)
ax.bar(x_ticks - width / 2, counts_2018, width=width, color='r', align='center', label='2018')
ax.bar(x_ticks + width / 2, counts_2020, width=width, color='b', align='center', label='2020')

ax.set_xticks(x_ticks)
ax.set_xticklabels(cats)
ax.legend()
ax.yaxis.grid(True)
ax.set_ylabel('Total trending videos')
ax.set_xlabel('Video Categories')
title = 'Total Trending Videos by Category and Year USA'
ax.set_title(title)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.tight_layout()

file_title = 'US - ' + title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

### Number of Videos by Category

In [None]:
# USA: side-by-side count of trending videos per category for 2018 and 2020.
# merge() defaults to an inner join, so only categories present in both years are listed.
us_cat_count_merged = cats_count_2018_series.to_frame().merge(cats_count_2020_series, on='cat_name')
# Display table. Fix: the 2020 column ("title_y") used to be labelled
# "Trending Videos 2018" as well, producing two identically named columns.
(
    us_cat_count_merged
    .sort_values(by='cat_name')
    .reset_index()
    .rename(columns={
        "cat_name": "Category",
        "title_x": "Trending Videos 2018",
        "title_y": "Trending Videos 2020",
    })
)

### Average Likes to Views Ratio by Year

In [None]:
# USA: mean of the per-video likes_to_views column for each year, one bar per year.
# Despite the `_series` suffix, both values are scalars (the mean of a column).
ratio_2018_series = us_2018_df['likes_to_views'].mean()
ratio_2020_series = us_2020_df['likes_to_views'].mean()
data = [ratio_2018_series, ratio_2020_series]
years = ['2018', '2020']
x_pos = np.arange(len(years))
title = 'Average Likes to Views Ratio by Year USA'

fig, ax = plt.subplots(figsize=(16, 8), facecolor='#FFFFFF')
ax.bar(x_pos, data, align='center')
ax.set_xticks(x_pos)
ax.set_xticklabels(years)
ax.set(xlabel='Year', ylabel='Average likes to view ratio', title=title)
ax.yaxis.grid(True)
fig.tight_layout()

file_title = 'US - ' + title + '.png'
fig.savefig(save_dir + file_title, dpi=100)

### Summary of USA data

In [None]:
# Summary of views, likes and likes-to-views ratio for the USA.
# The first two figures are sums over all rows, so they are labelled as totals
# (the old "Total Views per video" label did not match the .sum() it printed).
print("Total Views 2018: " + str(us_2018_df['views'].sum()))
print("Total Views 2020: " + str(us_2020_df['views'].sum()))
print("Average Likes per video 2018: " + str(us_2018_df['likes'].mean()))
print("Average Likes per video 2020: " + str(us_2020_df['likes'].mean()))
# The ratios are recomputed here instead of read from ratio_2018_series /
# ratio_2020_series: other cells rebind those names (per-category Series, other
# countries), so re-running this cell out of order would print the wrong values.
print("Like to view ratio 2018: {}" .format(us_2018_df['likes_to_views'].mean()))
print("Like to view ratio 2020: {}" .format(us_2020_df['likes_to_views'].mean()))

### Average Likes to Views Ratio by Category by Year

In [None]:
# USA: average likes-to-views ratio by category by year.
# Select the column before averaging: calling .mean() on the whole grouped frame
# tries to average every column and raises TypeError on pandas >= 2.0 when the
# frame contains text columns.
ratio_2018_series = us_2018_df.groupby('cat_name')['likes_to_views'].mean()
ratio_2020_series = us_2020_df.groupby('cat_name')['likes_to_views'].mean()

# Every category seen in either year, sorted so the bar order is stable between runs.
cats = sorted(set(ratio_2018_series.index) | set(ratio_2020_series.index))

# A category missing from one year is plotted as an explicit 0.
# The earlier `else: 0` branch was a bare expression that assigned nothing, so a
# missing category silently reused the previous category's value (or raised
# NameError when it was the first one).
ratios_2018 = ratio_2018_series.reindex(cats, fill_value=0)
ratios_2020 = ratio_2020_series.reindex(cats, fill_value=0)

fig, ax = plt.subplots()
fig.patch.set_facecolor('#FFFFFF')
fig.set_size_inches(16, 8)
width = 3
# One group every (3 * width + 1) x-units; the two bars of a group touch at the tick.
x_ticks = np.arange(len(cats)) * (width * 3 + 1)
ax.bar(x_ticks - width / 2, ratios_2018, width=width, color='r', align='center', label='2018')
ax.bar(x_ticks + width / 2, ratios_2020, width=width, color='b', align='center', label='2020')

ax.set_xticks(x_ticks)
ax.set_xticklabels(cats)
ax.legend()
title = 'Average Likes to Views Ratio by Category USA'
ax.set_title(title)
ax.set_ylabel('Average Likes to Views Ratio')
ax.set_xlabel('Video Category')
ax.yaxis.grid(True)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.tight_layout()

file_title = 'US - ' + title + '.png'
plt.savefig(save_dir + file_title, dpi=100)

In [None]:
# USA: table of the per-category average ratio for both years, ordered by category.
# merge() suffixes the two identically named columns with _x (2018) and _y (2020).
avg_cat_ratio_merged = (
    ratio_2018_series.to_frame()
    .merge(ratio_2020_series, on='cat_name')
    .sort_values(by='cat_name')
    .reset_index()
    .rename(columns={
        "cat_name": "Category",
        "likes_to_views_x": "Avg L-T-V Ratio 2018",
        "likes_to_views_y": "Avg L-T-V Ratio 2020",
    })
)
avg_cat_ratio_merged

### Save Data for Analysis

In [None]:
# USA: per-category aggregates for each year: row count, summed
# views/likes/dislikes, and likes as a percentage of views computed from those sums.
cat_count_us_2018 = us_2018_df.groupby('cat_name').agg({'category_id': 'count', 'views' : 'sum', 'likes': 'sum', 'dislikes': 'sum'}).reset_index()
cat_count_us_2018['likes_to_views'] = ((cat_count_us_2018['likes'] / cat_count_us_2018['views'])*100)
cat_count_us_2018 = cat_count_us_2018.rename(columns={'category_id':'cat_count'})
# appears that YouTube removed Shows and Movies categories
# https://techpostplus.com/youtube-video-categories-list-faqs-and-solutions/
cat_count_us_2018 = cat_count_us_2018[(cat_count_us_2018['cat_name'] != 'Movies') & (cat_count_us_2018['cat_name'] != 'Shows')]
cat_count_us_2018 = cat_count_us_2018.sort_values(by='likes_to_views')

cat_count_us_2020 = us_2020_df.groupby('cat_name').agg({'category_id': 'count', 'views' : 'sum', 'likes': 'sum', 'dislikes': 'sum'}).reset_index()
cat_count_us_2020['likes_to_views'] = ((cat_count_us_2020['likes'] / cat_count_us_2020['views'])*100)
# Fix: the rename key contained a stray quote ('cate"gory_id'), so the rename
# matched nothing and the 2020 frame kept 'category_id'. The merged frame then
# had 'cat_count' and 'category_id' instead of 'cat_count_2018' / 'cat_count_2020'.
cat_count_us_2020 = cat_count_us_2020.rename(columns={'category_id':'cat_count'})
cat_count_us_2020 = cat_count_us_2020.sort_values(by='likes_to_views')

# Pair the two years per category; shared column names get _2018 / _2020 suffixes.
cat_count_us_merged = cat_count_us_2018.merge(cat_count_us_2020, left_on='cat_name', right_on='cat_name', suffixes=("_2018","_2020"))

# save df for statistics analysis
us_merged_likes_to_views = cat_count_us_merged[['cat_name', 'likes_to_views_2018', 'likes_to_views_2020']]
us_merged_likes_to_views.to_csv('./transformed-data/us_merged_likes_to_views.csv', index=False)

### USA Likes

In [None]:
# USA: total likes per category, 2018 next to 2020.
cat_count_us_merged = pd.merge(
    cat_count_us_2018,
    cat_count_us_2020,
    left_on='cat_name',
    right_on='cat_name',
    suffixes=("_2018","_2020"),
)

# The likes-only slice is printed and also saved for the statistics analysis.
us_merged_likes = cat_count_us_merged[['cat_name', 'likes_2018', 'likes_2020']]
print(us_merged_likes)
us_merged_likes.to_csv('./transformed-data/us_merged_likes.csv', index=False)

plt.figure(figsize=(15,8))
ax = plt.gca()

# Two bars per category, shifted left and right of the integer tick position.
ypos = np.arange(len(cat_count_us_merged))
ax.bar(ypos-0.2, cat_count_us_merged['likes_2018'], width=0.4, label='2018 Likes', color='red')
ax.bar(ypos+0.2, cat_count_us_merged['likes_2020'], width=0.4, label='2020 Likes', color='blue')

ax.set_xticks(ypos)
ax.set_xticklabels(cat_count_us_merged['cat_name'])
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.yaxis.grid(True)
ax.set_xlabel('Video Category')
ax.set_ylabel('Likes')
ax.legend()
title = "Likes by Category by Year USA"
ax.set_title(title)
plt.tight_layout()

file_title ='US - ' + title + '.png'
plt.savefig(save_dir + file_title, dpi=100)