# Statistical Analysis

In [None]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [None]:
# Set to see all rows (https://piazza.com/class/keelp7u2rd65ms?cid=29)
# pd.set_option("display.max_rows", None, "display.max_columns", None)

In [None]:
# Load the transformed trending-video CSVs, one frame per (country, year).
# map() reads the files lazily in the listed order, which matches the
# order of the assignment targets on the left.
(ca_data_2018, ca_data_2020,
 gb_data_2018, gb_data_2020,
 us_data_2018, us_data_2020) = map(pd.read_csv, (
    './transformed-data/2018_CA_data.csv',
    './transformed-data/2020_CA_data.csv',
    './transformed-data/2018_GB_data.csv',
    './transformed-data/2020_GB_data.csv',
    './transformed-data/2018_US_data.csv',
    './transformed-data/2020_US_data.csv',
))

# Folder that the plots of this notebook are written to
save_dir = './Trending Video Likes to Views Ratio Analysis PLOTS/'

## ANOVA on Likes to Views Canada

In [None]:
# Filter for L-T-V Ratio
# Keep only the category name and likes-to-views ratio of the rows whose ratio
# is strictly positive. .copy() yields an independent frame, so assigning the
# 'cat_name' column below does not raise pandas' SettingWithCopyWarning.
ca_data_2018_filter = ca_data_2018.loc[
    ca_data_2018['likes_to_views'] > 0, ['cat_name', 'likes_to_views']
].copy()
# NOTE(review): 'Shows' rows are not removed for Canada (that filter was
# commented out in this cell) -- confirm this is intended.
# Tag every category with its year so 2018 and 2020 groups stay distinct
# once both years are combined.
ca_data_2018_filter['cat_name'] = ca_data_2018_filter['cat_name'].astype(str) + '_2018'

ca_data_2020_filter = ca_data_2020.loc[
    ca_data_2020['likes_to_views'] > 0, ['cat_name', 'likes_to_views']
].copy()
ca_data_2020_filter['cat_name'] = ca_data_2020_filter['cat_name'].astype(str) + '_2020'

In [None]:
# Transform the data
# Replace the ratio column of both Canadian frames with its square root
ca_data_2020_filter['likes_to_views'] = ca_data_2020_filter['likes_to_views'].apply(np.sqrt)
ca_data_2018_filter['likes_to_views'] = ca_data_2018_filter['likes_to_views'].apply(np.sqrt)

In [None]:
# L-T-V ratios by Category 2018
# One Series of transformed ratios per year-tagged category; x1..x14 are
# filled in the order the labels are listed.
x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14 = (
    ca_data_2018_filter.loc[ca_data_2018_filter['cat_name'] == cat_label, 'likes_to_views']
    for cat_label in (
        'Film & Animation_2018', 'Music_2018', 'News & Politics_2018', 'Pets & Animals_2018',
        'Entertainment_2018', 'Education_2018', 'Howto & Style_2018', 'People & Blogs_2018',
        'Comedy_2018', 'Sports_2018', 'Science & Technology_2018', 'Travel & Events_2018',
        'Autos & Vehicles_2018', 'Gaming_2018',
    )
)

# L-T-V ratios by Category 2020
# Same categories in the same order, taken from the 2020 frame (y1..y14)
y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14 = (
    ca_data_2020_filter.loc[ca_data_2020_filter['cat_name'] == cat_label, 'likes_to_views']
    for cat_label in (
        'Film & Animation_2020', 'Music_2020', 'News & Politics_2020', 'Pets & Animals_2020',
        'Entertainment_2020', 'Education_2020', 'Howto & Style_2020', 'People & Blogs_2020',
        'Comedy_2020', 'Sports_2020', 'Science & Technology_2020', 'Travel & Events_2020',
        'Autos & Vehicles_2020', 'Gaming_2020',
    )
)

In [None]:
# Histogram of transformed CA 2018 data
# Overlay one semi-transparent histogram per 2018 category on a single figure
title = 'CA - L-T-V Ratio 2018 Transformed'
bars = 20
plt.figure(figsize=(8, 6))

# Series and legend labels are paired positionally
for ltv_values, ltv_label in zip(
    (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14),
    (
        'Film & Animation_2018', 'Music_2018', 'News & Politics_2018', 'Pets & Animals_2018',
        'Entertainment_2018', 'Education_2018', 'Howto & Style_2018', 'People & Blogs_2018',
        'Comedy_2018', 'Sports_2018', 'Science & Technology_2018', 'Travel & Events_2018',
        'Autos & Vehicles_2018', 'Gaming_2018',
    ),
):
    plt.hist(ltv_values, bins=bars, alpha=0.5, label=ltv_label)

plt.title(title)
plt.xlabel('L-T-V Ratio')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()

# Save plot to file
file_title = title + '.png'
plt.savefig(save_dir + file_title)

In [None]:
# Histogram of transformed CA 2020 data
# Overlay one semi-transparent histogram per 2020 category on a single figure
bars = 20
plt.figure(figsize=(8, 6))

# The y* series hold 2020 data, so the legend labels carry the _2020 suffix
plt.hist(y1, bars, alpha=0.5, label='Film & Animation_2020')
plt.hist(y2, bars, alpha=0.5, label='Music_2020')
plt.hist(y3, bars, alpha=0.5, label='News & Politics_2020')
plt.hist(y4, bars, alpha=0.5, label='Pets & Animals_2020')
plt.hist(y5, bars, alpha=0.5, label='Entertainment_2020')
plt.hist(y6, bars, alpha=0.5, label='Education_2020')
plt.hist(y7, bars, alpha=0.5, label='Howto & Style_2020')
plt.hist(y8, bars, alpha=0.5, label='People & Blogs_2020')
plt.hist(y9, bars, alpha=0.5, label='Comedy_2020')
plt.hist(y10, bars, alpha=0.5, label='Sports_2020')
plt.hist(y11, bars, alpha=0.5, label='Science & Technology_2020')
plt.hist(y12, bars, alpha=0.5, label='Travel & Events_2020')
plt.hist(y13, bars, alpha=0.5, label='Autos & Vehicles_2020')
plt.hist(y14, bars, alpha=0.5, label='Gaming_2020')

title = 'CA - L-T-V Ratio 2020 Transformed'
plt.title(title)
plt.xlabel('L-T-V Ratio')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()

# Save plot to file
file_title = title + '.png'
plt.savefig(save_dir + file_title)

In [None]:
# Perform ANOVA Test
# One-way ANOVA across every (category, year) group of the transformed ratio.
# Groups with no observations are left out, because scipy's f_oneway returns
# NaN as soon as any of its inputs is empty.
anova_groups = [
    group
    for group in (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
                  y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14)
    if len(group) > 0
]
anova_results = stats.f_oneway(*anova_groups)

# The header names the metric actually tested (likes-to-views ratio)
print('\n-> ANOVA p-value for Likes-to-Views ratio of trending videos per each Year, Category pair\n')
print(anova_results)
print('P-value: {0:.15f}'.format(anova_results.pvalue))

## Post Hoc Tukey's Test on Likes to Views Canada

In [None]:
# Append 2018 and 2020 data together
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with ignore_index=True stacks the rows the same way and renumbers the index.
ca_data_years_joined = pd.concat([ca_data_2018_filter, ca_data_2020_filter], ignore_index=True)

In [None]:
# Perform Tukey's Analysis per category
# Pairwise Tukey HSD on the ratio, grouped by year-tagged category, alpha = 0.05
posthoc = pairwise_tukeyhsd(
    endog=ca_data_years_joined['likes_to_views'],
    groups=ca_data_years_joined['cat_name'],
    alpha=0.05,
)

# Header line, then the result table on the following line
print('PostHoc Analysis for L-T-V ratio Per Categories for 2018 vs 2020:\n', posthoc, sep='\n')

In [None]:
# Save Tukey's data
# Render the plot_simultaneous figure of the Tukey result and write it to a PNG
title = 'CA - Tukeys Test for L-T-V Ratio per Categories of 2018 and 2020'
file_title = title + '.png'

fig = posthoc.plot_simultaneous()
fig.set_size_inches((20, 20))
fig.set_facecolor('#FFFFFF')
# Leave room above the axes for the figure-level title
fig.subplots_adjust(top=0.95)
plt.suptitle(title, size=16)
plt.xlabel('L-T-V Ratio')
plt.ylabel('Categories')

# Save plot to file
fig.savefig(save_dir + file_title)

## ANOVA on Likes to Views Great Britain

In [None]:
# Filter for L-T-V Ratio
# Keep only the category name and likes-to-views ratio of the 2018 rows whose
# ratio is strictly positive and whose category is not 'Shows'. The 'Shows'
# condition filters rows of the frame; assigning the filtered frame to the
# 'cat_name' column (as was done before) does not drop any row.
# .copy() yields an independent frame, so the column assignment below does not
# raise pandas' SettingWithCopyWarning.
gb_data_2018_filter = gb_data_2018.loc[
    (gb_data_2018['likes_to_views'] > 0) & (gb_data_2018['cat_name'] != 'Shows'),
    ['cat_name', 'likes_to_views'],
].copy()
# Tag every category with its year so 2018 and 2020 groups stay distinct
# once both years are combined.
gb_data_2018_filter['cat_name'] = gb_data_2018_filter['cat_name'].astype(str) + '_2018'

gb_data_2020_filter = gb_data_2020.loc[
    gb_data_2020['likes_to_views'] > 0, ['cat_name', 'likes_to_views']
].copy()
gb_data_2020_filter['cat_name'] = gb_data_2020_filter['cat_name'].astype(str) + '_2020'

In [None]:
# Transform the data
# Replace the ratio column of both Great Britain frames with its square root
gb_data_2020_filter['likes_to_views'] = gb_data_2020_filter['likes_to_views'].apply(np.sqrt)
gb_data_2018_filter['likes_to_views'] = gb_data_2018_filter['likes_to_views'].apply(np.sqrt)

In [None]:
# L-T-V ratios by Category 2018
# One Series of transformed ratios per year-tagged category; x1..x14 are
# filled in the order the labels are listed.
x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14 = (
    gb_data_2018_filter.loc[gb_data_2018_filter['cat_name'] == cat_label, 'likes_to_views']
    for cat_label in (
        'Film & Animation_2018', 'Music_2018', 'News & Politics_2018', 'Pets & Animals_2018',
        'Entertainment_2018', 'Education_2018', 'Howto & Style_2018', 'People & Blogs_2018',
        'Comedy_2018', 'Sports_2018', 'Science & Technology_2018', 'Travel & Events_2018',
        'Autos & Vehicles_2018', 'Gaming_2018',
    )
)

# L-T-V ratios by Category 2020
# Same categories in the same order, taken from the 2020 frame (y1..y14)
y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14 = (
    gb_data_2020_filter.loc[gb_data_2020_filter['cat_name'] == cat_label, 'likes_to_views']
    for cat_label in (
        'Film & Animation_2020', 'Music_2020', 'News & Politics_2020', 'Pets & Animals_2020',
        'Entertainment_2020', 'Education_2020', 'Howto & Style_2020', 'People & Blogs_2020',
        'Comedy_2020', 'Sports_2020', 'Science & Technology_2020', 'Travel & Events_2020',
        'Autos & Vehicles_2020', 'Gaming_2020',
    )
)

In [None]:
# Histogram of transformed GB 2018 data
# Overlay one semi-transparent histogram per 2018 category on a single figure
title = 'GB - L-T-V Ratio 2018 Transformed'
bars = 20
plt.figure(figsize=(8, 6))

# Series and legend labels are paired positionally
for ltv_values, ltv_label in zip(
    (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14),
    (
        'Film & Animation_2018', 'Music_2018', 'News & Politics_2018', 'Pets & Animals_2018',
        'Entertainment_2018', 'Education_2018', 'Howto & Style_2018', 'People & Blogs_2018',
        'Comedy_2018', 'Sports_2018', 'Science & Technology_2018', 'Travel & Events_2018',
        'Autos & Vehicles_2018', 'Gaming_2018',
    ),
):
    plt.hist(ltv_values, bins=bars, alpha=0.5, label=ltv_label)

plt.title(title)
plt.xlabel('L-T-V Ratio')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()

# Save plot to file
file_title = title + '.png'
plt.savefig(save_dir + file_title)

In [None]:
# Histogram of transformed GB 2020 data
# Overlay one semi-transparent histogram per 2020 category on a single figure
bars = 20
plt.figure(figsize=(8, 6))

# The y* series hold 2020 data, so the legend labels carry the _2020 suffix
plt.hist(y1, bars, alpha=0.5, label='Film & Animation_2020')
plt.hist(y2, bars, alpha=0.5, label='Music_2020')
plt.hist(y3, bars, alpha=0.5, label='News & Politics_2020')
plt.hist(y4, bars, alpha=0.5, label='Pets & Animals_2020')
plt.hist(y5, bars, alpha=0.5, label='Entertainment_2020')
plt.hist(y6, bars, alpha=0.5, label='Education_2020')
plt.hist(y7, bars, alpha=0.5, label='Howto & Style_2020')
plt.hist(y8, bars, alpha=0.5, label='People & Blogs_2020')
plt.hist(y9, bars, alpha=0.5, label='Comedy_2020')
plt.hist(y10, bars, alpha=0.5, label='Sports_2020')
plt.hist(y11, bars, alpha=0.5, label='Science & Technology_2020')
plt.hist(y12, bars, alpha=0.5, label='Travel & Events_2020')
plt.hist(y13, bars, alpha=0.5, label='Autos & Vehicles_2020')
plt.hist(y14, bars, alpha=0.5, label='Gaming_2020')

title = 'GB - L-T-V Ratio 2020 Transformed'
plt.title(title)
plt.xlabel('L-T-V Ratio')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()

# Save plot to file
file_title = title + '.png'
plt.savefig(save_dir + file_title)

In [None]:
# Perform ANOVA Test
# One-way ANOVA across every (category, year) group of the transformed ratio.
# Groups with no observations are left out, because scipy's f_oneway returns
# NaN as soon as any of its inputs is empty.
anova_groups = [
    group
    for group in (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
                  y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14)
    if len(group) > 0
]
anova_results = stats.f_oneway(*anova_groups)

# The header names the metric actually tested (likes-to-views ratio)
print('\n-> ANOVA p-value for Likes-to-Views ratio of trending videos per each Year, Category pair\n')
print(anova_results)
print('P-value: {0:.15f}'.format(anova_results.pvalue))

## Post Hoc Tukey's Test on Likes to Views Great Britain

In [None]:
# Append 2018 and 2020 data together
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with ignore_index=True stacks the rows the same way and renumbers the index.
gb_data_years_joined = pd.concat([gb_data_2018_filter, gb_data_2020_filter], ignore_index=True)

In [None]:
# Perform Tukey's Analysis per category
# Pairwise Tukey HSD on the ratio, grouped by year-tagged category, alpha = 0.05
posthoc = pairwise_tukeyhsd(
    endog=gb_data_years_joined['likes_to_views'],
    groups=gb_data_years_joined['cat_name'],
    alpha=0.05,
)

# Header line, then the result table on the following line
print('PostHoc Analysis for L-T-V ratio Per Categories for 2018 vs 2020:\n', posthoc, sep='\n')

In [None]:
# Save Tukey's data
# Render the plot_simultaneous figure of the Tukey result and write it to a PNG
title = 'GB - Tukeys Test for L-T-V Ratio per Categories of 2018 and 2020'
file_title = title + '.png'

fig = posthoc.plot_simultaneous()
fig.set_size_inches((20, 20))
fig.set_facecolor('#FFFFFF')
# Leave room above the axes for the figure-level title
fig.subplots_adjust(top=0.95)
plt.suptitle(title, size=16)
plt.xlabel('L-T-V Ratio')
plt.ylabel('Categories')

# Save plot to file
fig.savefig(save_dir + file_title)

## ANOVA on Likes to Views USA

In [None]:
# Filter for L-T-V Ratio
# Keep only the category name and likes-to-views ratio of the 2018 rows whose
# ratio is strictly positive and whose category is not 'Shows'. The 'Shows'
# condition filters rows of the frame; assigning the filtered frame to the
# 'cat_name' column (as was done before) does not drop any row.
# .copy() yields an independent frame, so the column assignment below does not
# raise pandas' SettingWithCopyWarning.
us_data_2018_filter = us_data_2018.loc[
    (us_data_2018['likes_to_views'] > 0) & (us_data_2018['cat_name'] != 'Shows'),
    ['cat_name', 'likes_to_views'],
].copy()
# Tag every category with its year so 2018 and 2020 groups stay distinct
# once both years are combined.
us_data_2018_filter['cat_name'] = us_data_2018_filter['cat_name'].astype(str) + '_2018'

us_data_2020_filter = us_data_2020.loc[
    us_data_2020['likes_to_views'] > 0, ['cat_name', 'likes_to_views']
].copy()
us_data_2020_filter['cat_name'] = us_data_2020_filter['cat_name'].astype(str) + '_2020'

In [None]:
# Transform the data
# Replace the ratio column of both USA frames with its square root
us_data_2020_filter['likes_to_views'] = us_data_2020_filter['likes_to_views'].apply(np.sqrt)
us_data_2018_filter['likes_to_views'] = us_data_2018_filter['likes_to_views'].apply(np.sqrt)

In [None]:
# L-T-V ratios by Category 2018
# One Series of transformed ratios per year-tagged category; x1..x15 are
# filled in the order the labels are listed.
x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 = (
    us_data_2018_filter.loc[us_data_2018_filter['cat_name'] == cat_label, 'likes_to_views']
    for cat_label in (
        'Film & Animation_2018', 'Music_2018', 'News & Politics_2018', 'Pets & Animals_2018',
        'Entertainment_2018', 'Education_2018', 'Howto & Style_2018', 'People & Blogs_2018',
        'Comedy_2018', 'Sports_2018', 'Science & Technology_2018', 'Travel & Events_2018',
        'Autos & Vehicles_2018', 'Gaming_2018', 'Nonprofits & Activism_2018',
    )
)

# L-T-V ratios by Category 2020
# Same categories in the same order, taken from the 2020 frame (y1..y15)
y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15 = (
    us_data_2020_filter.loc[us_data_2020_filter['cat_name'] == cat_label, 'likes_to_views']
    for cat_label in (
        'Film & Animation_2020', 'Music_2020', 'News & Politics_2020', 'Pets & Animals_2020',
        'Entertainment_2020', 'Education_2020', 'Howto & Style_2020', 'People & Blogs_2020',
        'Comedy_2020', 'Sports_2020', 'Science & Technology_2020', 'Travel & Events_2020',
        'Autos & Vehicles_2020', 'Gaming_2020', 'Nonprofits & Activism_2020',
    )
)

In [None]:
# Histogram of transformed US 2018 data
# Overlay one semi-transparent histogram per 2018 category on a single figure
bars = 20
plt.figure(figsize=(8, 6))

# The x* series hold 2018 data, so every legend label carries the _2018 suffix
plt.hist(x1, bars, alpha=0.5, label='Film & Animation_2018')
plt.hist(x2, bars, alpha=0.5, label='Music_2018')
plt.hist(x3, bars, alpha=0.5, label='News & Politics_2018')
plt.hist(x4, bars, alpha=0.5, label='Pets & Animals_2018')
plt.hist(x5, bars, alpha=0.5, label='Entertainment_2018')
plt.hist(x6, bars, alpha=0.5, label='Education_2018')
plt.hist(x7, bars, alpha=0.5, label='Howto & Style_2018')
plt.hist(x8, bars, alpha=0.5, label='People & Blogs_2018')
plt.hist(x9, bars, alpha=0.5, label='Comedy_2018')
plt.hist(x10, bars, alpha=0.5, label='Sports_2018')
plt.hist(x11, bars, alpha=0.5, label='Science & Technology_2018')
plt.hist(x12, bars, alpha=0.5, label='Travel & Events_2018')
plt.hist(x13, bars, alpha=0.5, label='Autos & Vehicles_2018')
plt.hist(x14, bars, alpha=0.5, label='Gaming_2018')
plt.hist(x15, bars, alpha=0.5, label='Nonprofits & Activism_2018')

title = 'US - L-T-V Ratio 2018 Transformed'
plt.title(title)
plt.xlabel('L-T-V Ratio')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()

# Save plot to file
file_title = title + '.png'
plt.savefig(save_dir + file_title)

In [None]:
# Histogram of transformed US 2020 data
# Overlay one semi-transparent histogram per 2020 category on a single figure
bars = 20
plt.figure(figsize=(8, 6))

# The y* series hold 2020 data, so every legend label carries the _2020 suffix
plt.hist(y1, bars, alpha=0.5, label='Film & Animation_2020')
plt.hist(y2, bars, alpha=0.5, label='Music_2020')
plt.hist(y3, bars, alpha=0.5, label='News & Politics_2020')
plt.hist(y4, bars, alpha=0.5, label='Pets & Animals_2020')
plt.hist(y5, bars, alpha=0.5, label='Entertainment_2020')
plt.hist(y6, bars, alpha=0.5, label='Education_2020')
plt.hist(y7, bars, alpha=0.5, label='Howto & Style_2020')
plt.hist(y8, bars, alpha=0.5, label='People & Blogs_2020')
plt.hist(y9, bars, alpha=0.5, label='Comedy_2020')
plt.hist(y10, bars, alpha=0.5, label='Sports_2020')
plt.hist(y11, bars, alpha=0.5, label='Science & Technology_2020')
plt.hist(y12, bars, alpha=0.5, label='Travel & Events_2020')
plt.hist(y13, bars, alpha=0.5, label='Autos & Vehicles_2020')
plt.hist(y14, bars, alpha=0.5, label='Gaming_2020')
plt.hist(y15, bars, alpha=0.5, label='Nonprofits & Activism_2020')

title = 'US - L-T-V Ratio 2020 Transformed'
plt.title(title)
plt.xlabel('L-T-V Ratio')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()

# Save plot to file
file_title = title + '.png'
plt.savefig(save_dir + file_title)

In [None]:
# Perform ANOVA Test
# One-way ANOVA across every (category, year) group of the transformed ratio.
# Groups with no observations are left out, because scipy's f_oneway returns
# NaN as soon as any of its inputs is empty.
anova_groups = [
    group
    for group in (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,
                  y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15)
    if len(group) > 0
]
anova_results = stats.f_oneway(*anova_groups)

# The header names the metric actually tested (likes-to-views ratio)
print('\n-> ANOVA p-value for Likes-to-Views ratio of trending videos per each Year, Category pair\n')
print(anova_results)
print('P-value: {0:.15f}'.format(anova_results.pvalue))

## Post Hoc Tukey's Test on Likes to Views USA

In [None]:
# Append 2018 and 2020 data together
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with ignore_index=True stacks the rows the same way and renumbers the index.
us_data_years_joined = pd.concat([us_data_2018_filter, us_data_2020_filter], ignore_index=True)

In [None]:
# Perform Tukey's Analysis per category
# Pairwise Tukey HSD on the ratio, grouped by year-tagged category, alpha = 0.05
posthoc = pairwise_tukeyhsd(
    endog=us_data_years_joined['likes_to_views'],
    groups=us_data_years_joined['cat_name'],
    alpha=0.05,
)

# Header line, then the result table on the following line
print('PostHoc Analysis for L-T-V ratio Per Categories for 2018 vs 2020:\n', posthoc, sep='\n')

In [None]:
# Save Tukey's data
# Render the plot_simultaneous figure of the Tukey result and write it to a PNG
title = 'US - Tukeys Test for L-T-V Ratio per Categories of 2018 and 2020'
file_title = title + '.png'

fig = posthoc.plot_simultaneous()
fig.set_size_inches((20, 20))
fig.set_facecolor('#FFFFFF')
# Leave room above the axes for the figure-level title
fig.subplots_adjust(top=0.95)
plt.suptitle(title, size=16)
plt.xlabel('L-T-V Ratio')
plt.ylabel('Categories')

# Save plot to file
fig.savefig(save_dir + file_title)

## Chi-Squared Statistical Test on Likes Data
Chi-square test of independence of variables in a contingency table.

### Total Likes by Year

In [None]:
# Load the merged likes tables: the total table plus one per country.
# map() reads the files in the listed order, matching the targets on the left.
total_merged_likes, ca_merged_likes, gb_merged_likes, us_merged_likes = map(pd.read_csv, (
    './transformed-data/total_merged_likes.csv',
    './transformed-data/ca_merged_likes.csv',
    './transformed-data/gb_merged_likes.csv',
    './transformed-data/us_merged_likes.csv',
))

In [None]:
# Order the total likes table by category name and renumber its rows from 0
likes_cats_total = total_merged_likes.sort_values('cat_name')
likes_cats_total = likes_cats_total.reset_index(drop=True)
likes_cats_total

In [None]:
# chi-squared for all categories between 2018 and 2020, total likes table
# Contingency table: one row per year (2018, 2020), one column per category
col1, col2 = (likes_cats_total[year_col].tolist() for year_col in ('likes_2018', 'likes_2020'))
contingency = np.array([col1, col2])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

print('p-value: \t\t{0:.15f}'.format(p))
print('test statistic: \t', chi2, sep='')
print('degrees of freedom: \t', dof, sep='')
print('expected: \n', expected, sep='')

### Canada Likes by Year

In [None]:
# Order the Canada likes table by category name and renumber its rows from 0
likes_cats_ca = ca_merged_likes.sort_values('cat_name')
likes_cats_ca = likes_cats_ca.reset_index(drop=True)
likes_cats_ca

In [None]:
# chi-squared for all categories between 2018 and 2020 Canada
# Contingency table: one row per year (2018, 2020), one column per category
col1, col2 = (likes_cats_ca[year_col].tolist() for year_col in ('likes_2018', 'likes_2020'))
contingency = np.array([col1, col2])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

print('p-value: \t\t{0:.15f}'.format(p))
print('test statistic: \t', chi2, sep='')
print('degrees of freedom: \t', dof, sep='')
print('expected: \n', expected, sep='')

### Great Britain Likes by Year

In [None]:
# Order the Great Britain likes table by category name and renumber its rows from 0
likes_cats_gb = gb_merged_likes.sort_values('cat_name')
likes_cats_gb = likes_cats_gb.reset_index(drop=True)
likes_cats_gb

In [None]:
# chi-squared for all categories between 2018 and 2020 Great Britain
# Contingency table: one row per year (2018, 2020), one column per category
col1, col2 = (likes_cats_gb[year_col].tolist() for year_col in ('likes_2018', 'likes_2020'))
contingency = np.array([col1, col2])

chi2, p, dof, expected = stats.chi2_contingency(contingency)
# Show the observed table ahead of the test results
print(contingency)

print('p-value: \t\t{0:.15f}'.format(p))
print('test statistic: \t', chi2, sep='')
print('degrees of freedom: \t', dof, sep='')
print('expected: \n', expected, sep='')

### USA Likes by Year

In [None]:
# Order the USA likes table by category name and renumber its rows from 0
likes_cats_us = us_merged_likes.sort_values('cat_name')
likes_cats_us = likes_cats_us.reset_index(drop=True)
likes_cats_us

In [None]:
# chi-squared for all categories between 2018 and 2020 USA
# Contingency table: one row per year (2018, 2020), one column per category
col1, col2 = (likes_cats_us[year_col].tolist() for year_col in ('likes_2018', 'likes_2020'))
contingency = np.array([col1, col2])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

print('p-value: \t\t{0:.15f}'.format(p))
print('test statistic: \t', chi2, sep='')
print('degrees of freedom: \t', dof, sep='')
print('expected: \n', expected, sep='')