# Statistical Analysis

In [17]:
# libraries
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

## Chi-Squared Statistical Test on Likes to Views Data
Chi-square test of independence of variables in a contingency table.


In [18]:
# Read the four likes-to-views tables (all regions combined, then Canada,
# Great Britain and USA) from the transformed-data folder, in that order.
(
    total_merged_likes_to_views,
    ca_merged_likes_to_views,
    gb_merged_likes_to_views,
    us_merged_likes_to_views,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './transformed-data/total_merged_likes_to_views.csv',
        './transformed-data/ca_merged_likes_to_views.csv',
        './transformed-data/gb_merged_likes_to_views.csv',
        './transformed-data/us_merged_likes_to_views.csv',
    )
)

### Total Likes to Views by Year

In [19]:
# Sort the rows by category name and renumber the index from zero,
# then display the resulting frame.
likes_to_views_cats_yearly = (
    total_merged_likes_to_views
    .sort_values(by='cat_name')
    .reset_index(drop=True)
)
likes_to_views_cats_yearly

Unnamed: 0,cat_name,likes_to_views_2018,likes_to_views_2020
0,Autos & Vehicles,0.946085,5.427101
1,Comedy,4.22331,9.078692
2,Education,3.891539,7.132108
3,Entertainment,2.567791,6.572832
4,Film & Animation,2.173623,5.291065
5,Gaming,4.165293,4.927249
6,Howto & Style,4.209038,6.166092
7,Music,2.394883,7.539095
8,News & Politics,2.2439,1.582537
9,Nonprofits & Activism,8.862345,4.146673


In [20]:
# Chi-squared test over all categories, 2018 vs 2020, on the total data set.
# NOTE(review): scipy documents chi2_contingency for observed frequencies
# (counts); these columns appear to hold likes-to-views ratios, so confirm the
# test is appropriate before relying on the p-value.
year_columns = ('likes_to_views_2018', 'likes_to_views_2020')
# one row per year, one column per category
contingency = np.array([likes_to_views_cats_yearly[year].tolist() for year in year_columns])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

print('p-value: \t\t', p, sep='')
print('test statistic: \t', chi2, sep='')
print('degrees of freedom: \t', dof, sep='')
print('expected: \n', expected, sep='')

p-value: 		0.7368277723908396
test statistic: 	10.34131793115176
degrees of freedom: 	14
expected: 
[[2.36004829 4.92585178 4.082156   3.38485542 2.7642411  3.36705058
  3.84200413 3.67864187 1.41696388 4.81735688 3.30074608 2.97159329
  2.2638123  1.78604053 2.19310433]
 [4.01313755 8.37615097 6.94149081 5.75576799 4.70044609 5.72549179
  6.53312524 6.25533634 2.40947229 8.19166111 5.61274449 5.05303754
  3.84949332 3.03706766 3.72925816]]


### Canada Likes to Views Data

In [21]:
# Sort the Canada rows by category name and renumber the index from zero,
# then display the resulting frame.
likes_to_views_cats_ca = (
    ca_merged_likes_to_views
    .sort_values(by='cat_name')
    .reset_index(drop=True)
)
likes_to_views_cats_ca

Unnamed: 0,cat_name,likes_to_views_2018,likes_to_views_2020
0,Autos & Vehicles,1.993554,5.251247
1,Comedy,5.102046,9.529652
2,Education,3.966004,7.010182
3,Entertainment,2.708456,6.905154
4,Film & Animation,2.740353,5.36674
5,Gaming,4.484776,4.999523
6,Howto & Style,3.321989,6.121261
7,Music,4.211576,7.507792
8,News & Politics,2.188774,1.678835
9,People & Blogs,2.851173,6.532402


In [22]:
# chi-squared for all categories between 2018 and 2020 Canada
col1 = likes_to_views_cats_ca['likes_to_views_2018'].tolist()
col2 = likes_to_views_cats_ca['likes_to_views_2020'].tolist()
contingency = np.array([col1, col2])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

print('p-value: \t\t' + str(p))
print('test statistic: \t' + str(chi2))
print('degrees of freedom: \t' + str(dof))
print('expected: \n' + str(expected))

p-value: 		0.9991913587616372
test statistic: 	2.5165029481482035
degrees of freedom: 	13
expected: 
[[2.65828599 5.36871001 4.02741773 3.52745715 2.97468126 3.48001022
  3.46494784 4.30010872 1.41911573 3.44305186 3.28919537 2.61198748
  1.67853758 2.85114493]
 [4.58651507 9.26298733 6.94876782 6.08615306 5.13241259 6.00428977
  5.97830165 7.41925946 2.4484934  5.94052309 5.67506439 4.50663324
  2.89609093 4.91926726]]


### Great Britain Likes to Views Data

In [23]:
# Sort the Great Britain rows by category name and renumber the index from
# zero, then display the resulting frame.
likes_to_views_cats_gb = (
    gb_merged_likes_to_views
    .sort_values(by='cat_name')
    .reset_index(drop=True)
)
likes_to_views_cats_gb

Unnamed: 0,cat_name,likes_to_views_2018,likes_to_views_2020
0,Autos & Vehicles,1.157833,5.606231
1,Comedy,3.612824,8.950285
2,Education,4.121799,7.479631
3,Entertainment,2.502034,6.550022
4,Film & Animation,1.872874,5.243819
5,Gaming,4.288928,5.042603
6,Howto & Style,4.89423,6.313202
7,Music,2.064774,8.059393
8,News & Politics,3.285271,1.536162
9,People & Blogs,1.920561,6.379379


In [24]:
# chi-squared for all categories between 2018 and 2020 Great Britain
col1 = likes_to_views_cats_gb['likes_to_views_2018'].tolist()
col2 = likes_to_views_cats_gb['likes_to_views_2020'].tolist()
contingency = np.array([col1, col2])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

print('p-value: \t\t' + str(p))
print('test statistic: \t' + str(chi2))
print('degrees of freedom: \t' + str(dof))
print('expected: \n' + str(expected))

p-value: 		0.88732015648416
test statistic: 	7.2769584934610085
degrees of freedom: 	13
expected: 
[[2.22436499 4.13138292 3.81513466 2.97677173 2.3403269  3.06867737
  3.68556818 3.32933594 1.58553002 2.72943825 2.62050475 1.80022144
  1.69809907 1.3068187 ]
 [4.53969945 8.4317263  7.78629623 6.07528399 4.77636575 6.26285392
  7.5218644  6.79483116 3.23590319 5.57050187 5.34817983 3.67406622
  3.46564501 2.66708214]]


### USA Likes to Views Data

In [25]:
# Sort the USA rows by category name and renumber the index from zero,
# then display the resulting frame.
likes_to_views_cats_us = (
    us_merged_likes_to_views
    .sort_values(by='cat_name')
    .reset_index(drop=True)
)
likes_to_views_cats_us

Unnamed: 0,cat_name,likes_to_views_2018,likes_to_views_2020
0,Autos & Vehicles,0.559971,5.413169
1,Comedy,4.090107,8.661528
2,Education,3.734761,6.948761
3,Entertainment,2.576842,6.278847
4,Film & Animation,2.38942,5.256809
5,Gaming,3.65201,4.785119
6,Howto & Style,4.247042,6.115934
7,Music,3.905155,7.220889
8,News & Politics,1.344845,1.520568
9,Nonprofits & Activism,8.862345,4.146673


In [26]:
# chi-squared for all categories between 2018 and 2020 USA
col1 = likes_to_views_cats_yearly['likes_to_views_2018'].tolist()
col2 = likes_to_views_cats_yearly['likes_to_views_2020'].tolist()
contingency = np.array([col1, col2])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

print('p-value: \t\t' + str(p))
print('test statistic: \t' + str(chi2))
print('degrees of freedom: \t' + str(dof))
print('expected: \n' + str(expected))

p-value: 		0.7368277723908396
test statistic: 	10.34131793115176
degrees of freedom: 	14
expected: 
[[2.36004829 4.92585178 4.082156   3.38485542 2.7642411  3.36705058
  3.84200413 3.67864187 1.41696388 4.81735688 3.30074608 2.97159329
  2.2638123  1.78604053 2.19310433]
 [4.01313755 8.37615097 6.94149081 5.75576799 4.70044609 5.72549179
  6.53312524 6.25533634 2.40947229 8.19166111 5.61274449 5.05303754
  3.84949332 3.03706766 3.72925816]]


## Chi-Squared Statistical Test on Likes Data
Chi-square test of independence of variables in a contingency table.

### Total Likes by Year

In [27]:
# Read the four likes tables (all regions combined, then Canada,
# Great Britain and USA) from the transformed-data folder, in that order.
(
    total_merged_likes,
    ca_merged_likes,
    gb_merged_likes,
    us_merged_likes,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './transformed-data/total_merged_likes.csv',
        './transformed-data/ca_merged_likes.csv',
        './transformed-data/gb_merged_likes.csv',
        './transformed-data/us_merged_likes.csv',
    )
)

In [28]:
# Sort the total-likes rows by category name and renumber the index from
# zero, then display the resulting frame.
likes_cats_total = (
    total_merged_likes
    .sort_values(by='cat_name')
    .reset_index(drop=True)
)
likes_cats_total

Unnamed: 0,cat_name,likes_2018,likes_2020
0,Autos & Vehicles,6595338,47126002
1,Comedy,280667076,345219549
2,Education,47858556,85911538
3,Entertainment,794700801,1308107137
4,Film & Animation,224788551,140937638
5,Gaming,102845682,515437678
6,Howto & Style,136562339,131067656
7,Music,2896947716,3280370080
8,News & Politics,47146279,28013042
9,Nonprofits & Activism,14716813,651744


In [29]:
# Chi-squared test over all categories, 2018 vs 2020, on the total likes
# (this cell reads likes_cats_total, not a single-country frame).
year_columns = ('likes_2018', 'likes_2020')
# one row per year, one column per category
contingency = np.array([likes_cats_total[year].tolist() for year in year_columns])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

# the p-value is printed with 15 fixed decimals; the other values use str()
print('p-value: \t\t{0:.15f}'.format(p))
print('test statistic: \t', chi2, sep='')
print('degrees of freedom: \t', dof, sep='')
print('expected: \n', expected, sep='')

p-value: 		0.000000000000000
test statistic: 	423678292.6448148
degrees of freedom: 	14
expected: 
[[2.29006902e+07 2.66807114e+08 5.70244056e+07 8.96398957e+08
  1.55904193e+08 2.63565944e+08 1.14087095e+08 2.63330813e+09
  3.20394154e+07 6.55141072e+06 2.88285898e+08 1.56870274e+07
  1.23336003e+08 1.60472516e+08 4.57810934e+06]
 [3.08206498e+07 3.59079511e+08 7.67456884e+07 1.20640898e+09
  2.09821996e+08 3.54717416e+08 1.53542900e+08 3.54400967e+09
  4.31199056e+07 8.81714628e+06 3.87986502e+08 2.11122186e+07
  1.65990445e+08 2.15970224e+08 6.16139966e+06]]


### Canada Likes by Year

In [30]:
# Sort the Canada likes rows by category name and renumber the index from
# zero, then display the resulting frame.
likes_cats_ca = (
    ca_merged_likes
    .sort_values(by='cat_name')
    .reset_index(drop=True)
)
likes_cats_ca

Unnamed: 0,cat_name,likes_2018,likes_2020
0,Autos & Vehicles,2700903,14315858
1,Comedy,93509004,142793712
2,Education,11700780,30975522
3,Entertainment,182113870,464265908
4,Film & Animation,44596360,49107274
5,Gaming,27060955,179046330
6,Howto & Style,22445667,49098406
7,Music,271667341,1124097488
8,News & Politics,19109785,10956205
9,People & Blogs,51085931,159616391


In [31]:
# Chi-squared test over all categories, 2018 vs 2020, Canada
# (this cell reads likes_cats_ca).
year_columns = ('likes_2018', 'likes_2020')
# one row per year, one column per category
contingency = np.array([likes_cats_ca[year].tolist() for year in year_columns])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

# the p-value is printed with 15 fixed decimals; the other values use str()
print('p-value: \t\t{0:.15f}'.format(p))
print('test statistic: \t', chi2, sep='')
print('degrees of freedom: \t', dof, sep='')
print('expected: \n', expected, sep='')

p-value: 		0.000000000000000
test statistic: 	128351788.65051049
degrees of freedom: 	13
expected: 
[[4.22980157e+06 5.87370064e+07 1.06079112e+07 1.60668543e+08
  2.32916111e+07 5.12314252e+07 1.77834802e+07 3.46940776e+08
  7.47340648e+06 5.23735988e+07 2.15581252e+06 2.39180250e+07
  2.63985773e+07 1.05975355e+06]
 [1.27869594e+07 1.77565710e+08 3.20683908e+07 4.85711235e+08
  7.04120229e+07 1.54875860e+08 5.37605928e+07 1.04882405e+09
  2.25925835e+07 1.58328723e+08 6.51715848e+06 7.23057120e+07
  7.98045797e+07 3.20370245e+06]]


### Great Britain Likes by Year

In [32]:
# Sort the Great Britain likes rows by category name and renumber the index
# from zero, then display the resulting frame.
likes_cats_gb = (
    gb_merged_likes
    .sort_values(by='cat_name')
    .reset_index(drop=True)
)
likes_cats_gb

Unnamed: 0,cat_name,likes_2018,likes_2020
0,Autos & Vehicles,1451367,16331335
1,Comedy,73381502,88483377
2,Education,13267679,27293277
3,Entertainment,389668969,396094404
4,Film & Animation,101623250,37721937
5,Gaming,51505046,140806073
6,Howto & Style,37941970,32753194
7,Music,2071810219,917909891
8,News & Politics,19507058,6311471
9,People & Blogs,82185539,131985877


In [33]:
# Chi-squared test over all categories, 2018 vs 2020, Great Britain.
year_columns = ('likes_2018', 'likes_2020')
# one row per year, one column per category
contingency = np.array([likes_cats_gb[year].tolist() for year in year_columns])

chi2, p, dof, expected = stats.chi2_contingency(contingency)
# show the observed table next to the expected one printed below
print(contingency)

# the p-value is printed with 15 fixed decimals; the other values use str()
print('p-value: \t\t{0:.15f}'.format(p))
print('test statistic: \t', chi2, sep='')
print('degrees of freedom: \t', dof, sep='')
print('expected: \n', expected, sep='')

[[   1451367   73381502   13267679  389668969  101623250   51505046
    37941970 2071810219   19507058   82185539    8610883   19433136
    68406117     693766]
 [  16331335   88483377   27293277  396094404   37721937  140806073
    32753194  917909891    6311471  131985877    3877654   67278880
    72748632     971463]]
p-value: 		0.000000000000000
test statistic: 	394003344.06823325
degrees of freedom: 	13
expected: 
[[1.07113595e+07 9.74988453e+07 2.44317754e+07 4.73302313e+08
  8.39341735e+07 1.15838051e+08 4.25830291e+07 1.80084933e+09
  1.55517168e+07 1.29005538e+08 7.52243442e+06 5.22307340e+07
  8.50241580e+07 1.00304591e+06]
 [7.07134249e+06 6.43660337e+07 1.61291806e+07 3.12461060e+08
  5.54110135e+07 7.64730684e+07 2.81121349e+07 1.18887078e+09
  1.02668122e+07 8.51658782e+07 4.96610258e+06 3.44812820e+07
  5.61305910e+07 6.62183091e+05]]


### USA Likes by Year

In [34]:
# Sort the USA likes rows by category name and renumber the index from zero,
# then display the resulting frame.
likes_cats_us = (
    us_merged_likes
    .sort_values(by='cat_name')
    .reset_index(drop=True)
)
likes_cats_us

Unnamed: 0,cat_name,likes_2018,likes_2020
0,Autos & Vehicles,2443068,16478809
1,Comedy,113776570,113942460
2,Education,22890097,27642739
3,Entertainment,222917962,447746825
4,Film & Animation,78568941,54108427
5,Gaming,24279681,195585275
6,Howto & Style,76174702,49216056
7,Music,553470156,1238362701
8,News & Politics,8529436,10745366
9,Nonprofits & Activism,14716813,651744


In [35]:
# Chi-squared test over all categories, 2018 vs 2020, USA.
year_columns = ('likes_2018', 'likes_2020')
# one row per year, one column per category
contingency = np.array([likes_cats_us[year].tolist() for year in year_columns])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

# the p-value is printed with 15 fixed decimals; the other values use str()
print('p-value: \t\t{0:.15f}'.format(p))
print('test statistic: \t', chi2, sep='')
print('degrees of freedom: \t', dof, sep='')
print('expected: \n', expected, sep='')

p-value: 		0.000000000000000
test statistic: 	219942841.9608277
degrees of freedom: 	14
expected: 
[[6.58130931e+06 7.92040543e+07 1.75760694e+07 2.33267155e+08
  4.61471553e+07 7.64722909e+07 4.36127644e+07 6.23226029e+08
  6.70406186e+06 5.34541194e+06 8.74401813e+07 5.43903708e+06
  3.70042608e+07 4.48976188e+07 1.67327590e+06]
 [1.23405677e+07 1.48514976e+08 3.29567666e+07 4.37397632e+08
  8.65302127e+07 1.43392665e+08 8.17779936e+07 1.16860683e+09
  1.25707401e+07 1.00231451e+07 1.63958481e+08 1.01987009e+07
  6.93864342e+07 8.41872152e+07 3.13754810e+06]]


### Chi-squared for Single Categories for 2018 and 2020
###### Is there a difference between views, likes, and dislikes in 2018 vs 2020?

In [37]:
# Read the 2018 and 2020 data files for each country, one country per
# statement, in the order Canada, Great Britain, USA.
ca_2018_df, ca_2020_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './transformed-data/2018_CA_data.csv',
        './transformed-data/2020_CA_data.csv',
    )
)

gb_2018_df, gb_2020_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './transformed-data/2018_GB_data.csv',
        './transformed-data/2020_GB_data.csv',
    )
)

us_2018_df, us_2020_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './transformed-data/2018_US_data.csv',
        './transformed-data/2020_US_data.csv',
    )
)

In [38]:
# Canada 2018, grouped by video category: count of category_id plus summed
# views, likes and dislikes, with likes expressed as a percentage of views.
cat_count_ca_2018 = (
    ca_2018_df
    .groupby('cat_name')
    .agg({'category_id': 'count', 'views': 'sum', 'likes': 'sum', 'dislikes': 'sum'})
    .reset_index()
    .assign(likes_to_views=lambda totals: (totals['likes'] / totals['views']) * 100)
    .rename(columns={'category_id': 'cat_count'})
    # appears that YouTube removed Shows and Movies categories
    # https://techpostplus.com/youtube-video-categories-list-faqs-and-solutions/
    .loc[lambda totals: (totals['cat_name'] != 'Movies') & (totals['cat_name'] != 'Shows')]
    .sort_values(by='likes_to_views')
)

# Canada 2020: same aggregation; no categories are filtered out for this year.
cat_count_ca_2020 = (
    ca_2020_df
    .groupby('cat_name')
    .agg({'category_id': 'count', 'views': 'sum', 'likes': 'sum', 'dislikes': 'sum'})
    .reset_index()
    .assign(likes_to_views=lambda totals: (totals['likes'] / totals['views']) * 100)
    .rename(columns={'category_id': 'cat_count'})
    .sort_values(by='likes_to_views')
)

# Join the two years on category name; columns that exist in both frames get
# a _2018 / _2020 suffix.
cat_count_ca_merged = cat_count_ca_2018.merge(
    cat_count_ca_2020,
    on='cat_name',
    suffixes=("_2018", "_2020"),
)

In [39]:
# Select the 'Gaming' row of the merged Canada frame and keep only the yearly
# view / like / dislike totals.
count_columns = ['views_2018', 'views_2020', 'likes_2018', 'likes_2020', 'dislikes_2018', 'dislikes_2020']
is_gaming = cat_count_ca_merged['cat_name'] == 'Gaming'
single_cat_total_merged = cat_count_ca_merged.loc[is_gaming, count_columns]

# Take the first matching row's value for each column, grouped by year
# (.iloc[0] raises IndexError if no row matched).
views_2018, likes_2018, dislikes_2018 = (
    single_cat_total_merged[column].iloc[0]
    for column in ('views_2018', 'likes_2018', 'dislikes_2018')
)
views_2020, likes_2020, dislikes_2020 = (
    single_cat_total_merged[column].iloc[0]
    for column in ('views_2020', 'likes_2020', 'dislikes_2020')
)

# rows: 2018 and 2020; columns: views, likes, dislikes
contingency = [
    [views_2018, likes_2018, dislikes_2018],
    [views_2020, likes_2020, dislikes_2020],
]
print(contingency)
chi2, p, dof, expected = stats.chi2_contingency(contingency)

print('p-value: \t\t{0:.15f}'.format(p))
print('test statistic: \t', chi2, sep='')
print('degrees of freedom: \t', dof, sep='')
print('expected: \n', expected, sep='')

# The larger the test statistic, the easier it is to tell that the two years differ.

[[603395844, 27060955, 1130707], [3581267901, 179046330, 4679824]]
p-value: 		0.000000000000000
test statistic: 	388321.4786478772
degrees of freedom: 	2
expected: 
[[6.01144617e+08 2.96081818e+07 8.34707313e+05]
 [3.58351913e+09 1.76499103e+08 4.97582369e+06]]
