In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pprint import pprint
import math
from pprint import pprint

from matplotlib.pyplot import figure, show
from numpy.random import rand

import mpld3
mpld3.enable_notebook()

%matplotlib notebook


In order to determine popular words used in well-liked Youtube videos, we first need to determine which metric is most useful for our comedy videos.  Using the Youtube trending video data set (https://www.kaggle.com/quannguyen135/what-is-trending-on-youtube-eda-with-python/data), we will do the following:

1. See the average number of likes, dislikes, and comments by category to determine which metric is most useful for entertainment and comedy.
2. two
3. three


In [2]:
#load trending videos
trending_videos = pd.read_csv('USvideos.csv')

#update types as follows:
#integer:  views, like,s dislikes, and comment_count
#string:  category_id
type_int_list = ['views', 'likes', 'comment_count', 'dislikes']
for column in type_int_list:
    trending_videos[column] = trending_videos[column].astype(int)

type_str_list = ['category_id']
for column in type_str_list:
    trending_videos[column] = trending_videos[column].astype(str)

#group trending videos by days trending
top_trending_videos_days = trending_videos.groupby(['title']).size().reset_index(name='trending_days_count').sort_values(['trending_days_count'], ascending=[False])

top_trending_videos_days.head(10)

Unnamed: 0,title,trending_days_count
3618,"Selena Gomez, Marshmello - Wolves",18
2760,Maroon 5 - Wait,18
2622,Lucas the Spider - Polar Bear,16
849,Chris Young - Hangin' On,16
51,*cough*,16
1425,Fluffy Sleepy Whispers ASMR,15
4705,Why Bridges Move...,15
1571,Gotta Catch ‘Em All!,15
2691,Made in Miami (Artist Spotlight Story) - Camil...,15
3364,Ralph Breaks The Internet: Wreck-It Ralph 2 Of...,15


In [183]:
# load id category
# creates a dictionary that maps `category_id` to `category`
id_to_category = {}

with open('US_category_id.json', 'r') as f:
    data = json.load(f)
    for category in data['items']:
        id_to_category[category['id']] = category['snippet']['title']

#set a new 'category' column using the newly-created dictionary
trending_videos['category'] = trending_videos['category_id'].map(id_to_category)

#set a new 'like_score' column that represents the ratio likes:views
trending_videos['like_score'] = trending_videos['likes']/trending_videos['views']


In [4]:
#function to reject outliers
def reject_outliers(data,col, m):
    return data[abs(data[col] - np.mean(data[col])) < m * np.std(data[col])]

In [None]:
def stacked_plot(df, type_plot_list, category_list, barwidth, adj_bottom):

    plt.figure(figsize=(10, 8))

    cat_count = len(category_list)
    ind = np.arange(cat_count)
    
    #loop through to create lists for bars, and cumulative bottom for each stack
    list_counter = 0
    levels=[]
    stacks=[]
    for i in type_plot_list:
        stacks.append(list(df[i]))
        if list_counter == 0:
            levels.append([0]*cat_count)
        else:
            levels.append([x + y for x, y in zip(levels[list_counter-1], stacks[list_counter-1])])

        plt.bar(ind, stacks[list_counter],bottom=levels[list_counter], width=barwidth, edgecolor='white')

        list_counter = list_counter + 1



    plt.ylabel('User Interactions')
    plt.title('Interactions by type')
    plt.xticks(ind, category_list)
    plt.legend(type_plot_list)

    plt.xticks(rotation=90)
    plt.show()
    plt.gcf().subplots_adjust(bottom=adj_bottom)  

In [5]:
#barplot function
def barplot_youtube(df, x_axis, y_axis,num_data, cat, figsize_x, figsize_y, adj_bottom):
    
    plt.figure(figsize=(figsize_x, figsize_y))
    
    if cat == 'ALL':
            result = df.groupby([x_axis])[y_axis].aggregate(np.mean).reset_index().sort_values(y_axis,ascending=[False])
            sns.barplot(x=x_axis, y=y_axis, data=df.head(num_data), order=result[x_axis])
            plt.title('Top ' + x_axis.capitalize() + ' by ' + y_axis.capitalize())
    else:
        result = df[df['category'] == cat].sort_values([y_axis], ascending = [False]).head(num_data).groupby([x_axis])[y_axis].aggregate(np.mean).reset_index().sort_values(y_axis,ascending=[False])
        sns.barplot(x=x_axis, y=y_axis, data=df[df['category'] == cat].sort_values([y_axis], ascending = [False]).head(num_data), order=result[x_axis])
        plt.title('Top ' + x_axis.capitalize() + ' by ' + y_axis.capitalize() + ' - ' + cat)

    sns.set(style="darkgrid")
    
    plt.xticks(rotation=90)
    plt.show()
    plt.gcf().subplots_adjust(bottom=adj_bottom)  

In [6]:
#histogram function
def histogram_youtube(df, x_axis, y_axis,num_data, cat, figsize_x, figsize_y, adj_bottom):
    
    plt.figure(figsize=(figsize_x, figsize_y))
    
    if cat == 'ALL':            
            plt.hist(df[y_axis], color='red',label=y_axis, bins=100)  
            plt.xlabel(y_axis.capitalize())
            plt.legend(loc='upper right')
            plt.title('Histogram - Youtube Trending Video ' + y_axis.capitalize())    
    else:
        plt.hist(df[df['category'] == cat][y_axis], color='red',label=y_axis, bins=100)  
        plt.xlabel(y_axis.capitalize())
        plt.legend(loc='upper right')
        plt.title('Histogram - Youtube Trending Video ' + y_axis.capitalize() + ' - ' + cat.capitalize())
    
    plt.xticks(rotation=90)
    plt.show()
    plt.gcf().subplots_adjust(bottom=adj_bottom)  

In [7]:
#boxplot function
def boxplot_youtube(df, x_axis, y_axis,num_data, cat, figsize_x, figsize_y, adj_bottom):
    
    plt.figure(figsize=(figsize_x, figsize_y))
    
    if cat == 'ALL':            
            sns.boxplot(x='category', y=y_axis,data=df) 
            sns.despine(offset=10, trim=True)            
            plt.ylabel(y_axis.capitalize())
            plt.legend(loc='upper right')
            plt.title('Boxplot - Youtube Trending Video ' + y_axis.capitalize())    
    
    plt.xticks(rotation=90)
    plt.show()
    plt.gcf().subplots_adjust(bottom=adj_bottom)  

In [8]:
#create scatterplot function
def scatterplot_youtube_old(df, x_axis, y_axis,num_data, cat, figsize_x, figsize_y):

    fig, ax = plt.subplots(subplot_kw=dict(facecolor='#EEEEEE'))
    #fig.set_figheight(figsize_y)
    #fig.set_figwidth(figsize_x)

    
    if cat == 'ALL':            
        scatter = ax.scatter(df[x_axis],df[y_axis])   
        labels = list(df['title'])
        ax.set_xlabel(x_axis.capitalize())
        ax.set_ylabel(y_axis.capitalize())
        ax.set_title(x_axis.capitalize() + ' vs. ' + y_axis.capitalize())

    else:
        scatter = ax.scatter(df[df['category'] == cat][x_axis], df[df['category'] == cat][y_axis])
        labels = list(df[df['category'] == cat]['title'])
        ax.set_xlabel(x_axis.capitalize())
        ax.set_ylabel(y_axis.capitalize())
        ax.set_title(x_axis.capitalize() + ' vs. ' + y_axis.capitalize() + ' - ' + cat.capitalize())
        
    tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
    mpld3.plugins.connect(fig, tooltip)
    #mpld3.enable_notebook()
    mpld3.show()

In [9]:
#create scatterplot function
def scatterplot_youtube(df, x_axis, y_axis,num_data, cat, figsize_x, figsize_y):

    plt.figure(figsize=(figsize_x, figsize_y))
    
    if cat == 'ALL':            
        plt.scatter(df[x_axis],df[y_axis])  
        plt.xlabel(x_axis.capitalize())
        plt.ylabel(y_axis.capitalize())
        plt.title(x_axis.capitalize() + ' vs. ' + y_axis.capitalize())    
    else:
        plt.scatter(df[df['category'] == cat][x_axis], df[df['category'] == cat][y_axis])  
        plt.xlabel(x_axis.capitalize())
        plt.ylabel(y_axis.capitalize())
        plt.title(x_axis.capitalize() + ' vs. ' + y_axis.capitalize() + ' - ' + cat.capitalize())
    
    plt.show()

In [10]:
#create plot functions 
def plot_youtube(data_orig, x_axis, y_axis,num_data, cat, figsize_x, figsize_y, adj_bottom, plot_type, is_reject_outliers):
    
    #reject outliers if parameter is set
    if is_reject_outliers == 1:
        df = reject_outliers(data_orig,y_axis,2)
    else:
        df = data_orig
    
    #plot types:  barplot, historgram, boxplot
    if plot_type == 'barplot':
        barplot_youtube(df, x_axis, y_axis,num_data, cat, figsize_x, figsize_y, adj_bottom)
    
    elif plot_type == 'hist':
        histogram_youtube(df, x_axis, y_axis,num_data, cat, figsize_x, figsize_y, adj_bottom)

    elif plot_type == 'boxplot':
        boxplot_youtube(df, x_axis, y_axis,num_data, cat, figsize_x, figsize_y, adj_bottom)
    
    elif plot_type == 'scatterplot':
        scatterplot_youtube(df, x_axis, y_axis,num_data, cat, figsize_x, figsize_y)

In [189]:
scatterplot_youtube_old(trending_videos,'like_score','likes', 24592,'ALL',10,8)


#def scatterplot_youtube_old(df, x_axis, y_axis,num_data, cat, figsize_x, figsize_y):

<IPython.core.display.Javascript object>


Note: if you're in the IPython notebook, mpld3.show() is not the best command
      to use. Consider using mpld3.display(), or mpld3.enable_notebook().
      See more information at http://mpld3.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8891/    [Ctrl-C to exit]


127.0.0.1 - - [01/Apr/2018 19:19:00] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Apr/2018 19:19:01] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [01/Apr/2018 19:19:01] "GET /mpld3.js HTTP/1.1" 200 -



stopping Server...


In [186]:
#plots based on the 4 metrics
top_trending_videos_by_category = trending_videos.groupby(['category'])[["views", "likes", "comment_count","dislikes"]].mean().reset_index()
type_plot_list = ['views', 'likes', 'comment_count', 'dislikes']
category_list = list(top_trending_videos_by_category.category.unique())


stacked_plot(top_trending_videos_by_category, type_plot_list, category_list, .35, .3)



for column in type_plot_list:
    #plot_youtube(trending_videos,'category',column, 24592,'ALL',10,6, .3, 'barplot',0)
    #plot_youtube(trending_videos,'title',column, 300,'Comedy',10,10, .6, 'barplot',0)
    #plot_youtube(trending_videos,'',column, 24592,'ALL',10,6, .3, 'hist',0)
    #plot_youtube(trending_videos,'',column, 24592,'Comedy',10,6, .3, 'hist',0)
    #plot_youtube(trending_videos,'',column, 24592,'ALL',10,8, .3, 'boxplot',0)
    #plot_youtube(trending_videos,'',column, 24592,'ALL',10,8, .3, 'boxplot',1)
    plot_youtube(trending_videos,'like_score',column, 24592,'ALL',10,8, .3, 'scatterplot',0)




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [165]:
def tf(word, blob):
    return blob.count(word) / len(blob)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

In [166]:
titles = {
    'my horibal speling': 1000,
    'dance dance, revolution': 100,
    'Maria Carey - Yo yo': 5000,
    'The cat goes to town': 100000,
    'The cat goes to market': 100000,
    'The cat goes wee wee wee all the way home': 100000,
    'Maria carey - what does the cat say': 50,
}

def cleanup(title):
    word_mapping = {
        
    }
    words = title.split()
    words = [word.lower() for word in words if word.isalpha()]
    words = [word_mapping.get(word, word) for word in words]
    
    return tuple(words)

In [178]:
cleaned_titles = [cleanup(t) for t in titles]
words={}
words = {word for words in cleaned_titles for word in words}

#for words in cleaned_titles:
#        for word in words
#            word



{'all',
 'carey',
 'cat',
 'dance',
 'does',
 'goes',
 'home',
 'horibal',
 'maria',
 'market',
 'my',
 'revolution',
 'say',
 'speling',
 'the',
 'to',
 'town',
 'way',
 'wee',
 'what',
 'yo'}

In [181]:
for t, cleaned in zip(titles, cleaned_titles):
    print(t)
    print(cleaned)

my horibal speling
('my', 'horibal', 'speling')
dance dance, revolution
('dance', 'revolution')
Maria Carey - Yo yo
('maria', 'carey', 'yo', 'yo')
The cat goes to town
('the', 'cat', 'goes', 'to', 'town')
The cat goes to market
('the', 'cat', 'goes', 'to', 'market')
The cat goes wee wee wee all the way home
('the', 'cat', 'goes', 'wee', 'wee', 'wee', 'all', 'the', 'way', 'home')
Maria carey - what does the cat say
('maria', 'carey', 'what', 'does', 'the', 'cat', 'say')


In [167]:
cleaned_titles = [cleanup(t) for t in titles]
words = {word for words in cleaned_titles for word in words}

for t, cleaned in zip(titles, cleaned_titles):
    print(t)
    for word in set(cleaned):
        print("{}: {}".format(word, tfidf(word, cleaned, cleaned_titles)))
    print()

my horibal speling
horibal: 0.41758765616512267
speling: 0.41758765616512267
my: 0.41758765616512267

dance dance, revolution
dance: 0.626381484247684
revolution: 0.626381484247684

Maria Carey - Yo yo
yo: 0.626381484247684
carey: 0.21182446509680092
maria: 0.21182446509680092

The cat goes to town
goes: 0.11192315758708454
to: 0.16945957207744075
the: 0.06729444732424258
cat: 0.06729444732424258
town: 0.2505525936990736

The cat goes to market
goes: 0.11192315758708454
to: 0.16945957207744075
the: 0.06729444732424258
cat: 0.06729444732424258
market: 0.2505525936990736

The cat goes wee wee wee all the way home
goes: 0.05596157879354227
all: 0.1252762968495368
way: 0.1252762968495368
home: 0.1252762968495368
the: 0.06729444732424258
wee: 0.3758288905486104
cat: 0.03364722366212129

Maria carey - what does the cat say
does: 0.17896613835648115
the: 0.04806746237445898
carey: 0.12104255148388623
what: 0.17896613835648115
cat: 0.04806746237445898
say: 0.17896613835648115
maria: 0.12104255

In [None]:
word_values = {word: 0.0 for word in words}

for word in words:
    for t, cleaned in zip(titles, cleaned_titles):
        word_values[word] += titles[t] * tfidf(word, cleaned, cleaned_titles)

In [None]:
pprint(list(sorted([(v, w) for w, v in word_values.items()], reverse=True)))

In [184]:
trending_videos.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category,like_score
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,People & Blogs,0.076869
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",Entertainment,0.040179
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,Comedy,0.045758
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...,Entertainment,0.029641
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,Entertainment,0.063097
