In [1]:
import pandas as pd
import json

In [2]:
file = 'watch-history.json'
with open(file, encoding='utf8') as wh_file:
    wh_dict = json.load(wh_file)

In [3]:
# Convert json to dataframe
wh = pd.DataFrame.from_dict(wh_dict)

In [4]:
# Drop columns except title and time
wh = wh[['title', 'time']]

In [5]:
# Remove "Watched" from title
import string
wh['title'] = wh['title'].apply(lambda x: x[7:])
wh['title'] = wh['title'].apply(lambda x: x.lower())
wh = wh.drop(wh[wh['title'].str.startswith('https://www.youtube.com')].index)
wh['title'] = wh['title'].apply(lambda x: x.strip())
wh['title'] = wh['title'].apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation)))

In [6]:
# Remove deleted videos
wh = wh.drop(wh[wh['title'].str.startswith('HTTPS://WWW.YOUTUBE.COM')].index)

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Oliver\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
stop_words = stopwords.words('english')

In [9]:
def find_ngrams(input_list, n):
    if n > 1:
        return zip(*(input_list[i:] for i in range(n)))
    else:
        return input_list

In [170]:
wh['unigrams'] = wh['title'].apply(lambda x: [word for word in x.split(' ') if word not in stop_words])
wh['unigrams'] = wh['unigrams'].apply(lambda x: [word for word in x if word != ''])
wh['bigrams'] = wh['unigrams'].apply(lambda x: list(find_ngrams(x, 2)))
wh['unigrams'] = wh['unigrams'].apply(lambda x: list(find_ngrams(x, 1)))
# wh['key'] = wh['key'].apply(find_ngrams([word for word in wh['key']]) for word in x if word != ""])
from collections import Counter
bag_1 = Counter()
for keywords in wh['unigrams']:
    for keyword in keywords:
        bag_1[keyword] += 1
bag_remove = []
for word in bag_1.most_common(500):
    word = word[0]
    if word.endswith('s'):
        singular = word[:-1]
        plural = word
    else:
        singular = word
        plural = word + 's'
    if bag_1[plural] >= bag_1[singular]:
        bag_1[plural] += bag_1[singular]
        bag_remove.append(singular)
        wh['unigrams'] = wh['unigrams'].apply(lambda x: [plural if unigram == singular else unigram for unigram in x])
    else:
        bag_1[singular] += bag_1[plural]
        bag_remove.append(plural)
        wh['unigrams'] = wh['unigrams'].apply(lambda x: [singular if unigram == plural else unigram for unigram in x])
for removal in bag_remove:
    del bag_1[removal]
bag_2 = Counter()
for keywords in wh['bigrams']:
    for keyword in keywords:
        bag_2[keyword] += 1

In [172]:
nltk.is_noun("word")

AttributeError: module 'nltk' has no attribute 'is_noun'

In [169]:
THRESHOLD = 0.3
for bigram in bag_2.most_common(1000):
    # print("{}: {}".format(bigram[0][0], bag_1[bigram[0][0]] * 0.75))
    if (bag_1[bigram[0][0]] * THRESHOLD) <= bag_2[bigram[0]]:
        del bag_1[bigram[0][0]]
        if (bag_1[bigram[0][1]] * THRESHOLD) <= bag_2[bigram[0]]:
            del bag_1[bigram[0][1]]
    else:
        del bag_2[bigram[0]]
print(bag_1[('video',)])
wh['ngrams'] = wh['unigrams'] + wh['bigrams']
bag_1_2 = (bag_1 + bag_2)

0


In [11]:
import datetime as dt
wh['time'] = wh['time'].apply(lambda x: dt.datetime.strptime(x.split('T')[0], '%Y-%m-%d'))

In [12]:
import functools
import operator
def avg_datetime(series):
    dt_min = series.min()
    deltas = [(x - dt_min).days for x in series]
    if len(deltas) == 0:
        print(series)
    return dt_min + timedelta(functools.reduce(operator.add, deltas) // len(deltas))

In [153]:
def median_datetime(series):
    dt_min = series.min()
    deltas = [(x - dt_min).days for x in series]
    return dt_min + timedelta(days = deltas[len(deltas)//2])

In [163]:
import plotly.offline as py
import plotly.graph_objs as go
from datetime import datetime, timedelta

import random
import math
from plotly import colors

NUM_KEYWORDS = 350
palette = ['darkturquoise', 'darkorange', 'darkorchid', 'mediumseagreen', 'royalblue', 'saddlebrown', 'tomato']
plotly_colors = [palette[random.randrange(0, len(palette))] for i in range(NUM_KEYWORDS)]

# group_labels = ['minecraft', 'calisthenics', 'sex', 'dating', 'brexit', 'fail', 'minimalist', 'compilation', 'london', 'vegan', 'world', 'trump', 'tinder', 'react', 'flutter', 'summer', 'gopro']
removals = ["video", "trailer", "new", "best", "official", "removed", "music", "ft", "feat"] + [str(x) for x in list(range(100))]
removals += [("official",  "video"), ("official", "trailer"), ("music", "video"), ("official", "music")]
group_labels = list(set([x[0] for x in bag_1_2.most_common(NUM_KEYWORDS)]) - set(removals))
data = pd.DataFrame([], columns=["keyword", "x", "y", "freq"])
for keyword in group_labels:
    dates = wh[wh['ngrams'].apply(lambda x: (True if keyword in x else False))]['time']
    data = data.append({
        "keyword": keyword if isinstance(keyword, str) else " ".join(keyword) ,
        "x": avg_datetime(dates),
        "y": 0,
        "freq": len(dates)
    }, ignore_index=True)
import numpy as np
for year in [2014, 2015, 2016, 2017, 2018, 2019]:
    for month in [1, 4, 7, 10]:
        selected_year = data['x'].apply(lambda x: (True if datetime(year, month, 1) < x <= datetime(year, month, 1) + timedelta(days=90) else False))
        num_selected = len(data[selected_year == True])
        print(num_selected)
        print(data[selected_year]['keyword'])
        ys = [x + 1/(num_selected + 1) + random.uniform(-0.02, +0.02) for x in np.linspace(0,1,num_selected+2)][:-2]
        random.shuffle(ys)
        data.loc[selected_year, 'y'] = ys

0
Series([], Name: keyword, dtype: object)
0
Series([], Name: keyword, dtype: object)
1
157    ukulele
Name: keyword, dtype: object
0
Series([], Name: keyword, dtype: object)
2
189    failarmy
303        2013
Name: keyword, dtype: object
4
48       360
171     2014
174    fails
176    remix
Name: keyword, dtype: object
5
27             hd
35     university
58     comparison
154         gopro
243     minecraft
Name: keyword, dtype: object
12
46          halo
66         sound
97        comedy
99         lapse
135      todoist
147        funny
194         baby
199      android
217    trailer 1
258         play
262         2012
313       speech
Name: keyword, dtype: object
9
31         seconds
36            free
44           cover
73            face
76          action
153        warwick
169    compilation
187        episode
240       ultimate
Name: keyword, dtype: object
25
4             epic
10           super
18            game
19           crazy
32               r
55       animation
65 

In [155]:
data

Unnamed: 0,keyword,x,y,freq
0,programming,2017-08-22,0.818949,34
1,part 1,2017-05-04,0.091583,40
2,netflix,2018-06-18,0.304779,45
3,review,2016-10-16,0.738069,78
4,test,2016-11-20,0.245195,35
5,student,2017-08-24,0.419609,33
6,v,2017-07-07,0.230701,168
7,game,2016-03-24,0.323794,63
8,youtube,2017-08-02,0.634769,37
9,date,2018-01-01,0.000000,29


In [164]:
# Plot
trace = go.Scatter(
    x = data["x"],
    y = data["y"],
    mode = "text",
    text = [x.upper() for x in data["keyword"]],
    opacity=0.75,
    textfont={
        'size': [x // 3.5 for x in list(data["freq"])],
        'color': plotly_colors,
        'family': 'Roboto'
    }
)


data = [trace]
py.plot(data)

'temp-plot.html'

In [81]:
ys

array([0.99428571, 0.04      , 0.15428571, 0.41714286, 0.02285714,
       0.50285714, 0.12      , 0.36      , 0.95428571, 0.42857143,
       0.33714286, 0.89714286, 0.56571429, 0.53142857, 0.2       ,
       0.54285714, 0.74857143, 0.55428571, 0.94857143, 0.58857143,
       0.04571429, 0.56      , 0.79428571, 0.35428571, 0.34285714,
       0.16      , 0.24      , 0.93142857, 0.72      , 0.02857143,
       0.69142857, 0.45714286, 0.68571429, 0.90285714, 0.49142857,
       0.83428571, 0.98285714, 0.08      , 0.87428571, 0.44571429,
       0.64      , 0.00571429, 0.76571429, 0.43428571, 0.98857143,
       0.77142857, 0.72571429, 0.65714286, 0.10857143, 0.29142857,
       0.14285714, 0.88      , 0.21142857, 0.70857143, 0.65142857,
       0.84      , 0.53714286, 0.90857143, 0.61142857, 0.86857143,
       0.40571429, 0.66285714, 0.28      , 0.38285714, 0.11428571,
       0.85714286, 0.4       , 0.16571429, 0.25142857, 0.96571429,
       0.91428571, 0.30857143, 0.57142857, 0.76      , 0.82857

In [90]:
import numpy as np
for year in [2015, 2016, 2017, 2018, 2019]:
    selected_year = data['x'].apply(lambda x: (True if datetime(year, 1, 1) < x < datetime(year+1, 1, 1) else False))
    num_selected = len(data[selected_year == True])
    ys = np.linspace(0,1,num_selected)
    random.shuffle(ys)
    print(num_selected)
    data.loc[selected_year, 'y'] = ys

14
59
86
15
1


In [88]:
data

Unnamed: 0,keyword,x,y,freq
0,programming,2017-08-16,0.869552,34
1,part 1,2016-12-23,0.344828,40
2,netflix,2017-10-18,0.991760,44
3,thenx,2018-05-22,0.735074,26
4,review,2016-05-21,0.706897,73
5,test,2016-07-05,0.982759,33
6,minutes,2017-06-06,0.418962,56
7,game,2016-06-27,0.568966,53
8,youtube,2017-05-20,0.221309,36
9,crazy,2016-06-22,0.500000,28


In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# generate some random data (approximately over 5 years)

# convert the epoch format to matplotlib date format 
mpl_data = wh['time']

# plot it
fig, ax = plt.subplots(1,1)
ax.hist(mpl_data, bins=100, color='lightblue')
locator = mdates.AutoDateLocator()
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(locator))
plt.show()

In [None]:
keyword = "date"
wh['freq'] = wh['key'].apply(lambda x: 1 if keyword in x else 0)

wh[wh['freq'] == 1]

In [None]:
from collections import Counter
bag = Counter()
for keywords in wh['key']:
    for keyword in keywords:
        bag[keyword] += 1
#     for i in range(len(keywords)-1):
#         bag[" ".join(keywords[i:i+2])] += 1

In [None]:
import plotly.offline as py
import plotly.graph_objs as go

import numpy as np

x0 = np.random.randn(500)
x1 = np.random.randn(500)+1

trace1 = go.Histogram(
    x=x0,
    opacity=0.75
)
trace2 = go.Histogram(
    x=x1,
    opacity=0.75
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)
py.init_notebook_mode(connected=True)
py.iplot(fig, filename='overlaid histogram')

In [None]:
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
data = []

def to_unix_time(datet):
    epoch =  dt.datetime.utcfromtimestamp(0)
    return (datet - epoch).total_seconds() * 1000

group_labels = ['minecraft', 'calisthenics', 'sex', 'dating', 'brexit', 'fail', 'minimalist', 'compilation', 'london', 'vegan', 'world', 'trump', 'tinder', 'react', 'flutter', 'summer', 'gopro']
# group_labels = ['calisthenics', 'workout', 'vegan', 'dating', 'minimalist', 'mimalism', 'vegetarian']

# group_labels = [x for x,_ in bag.most_common(10)]
print(group_labels)
hist_data = []

for keyword in group_labels:
    print(keyword)
    data.append(
        go.Histogram(
            x=wh[wh['key'].apply(lambda x: (True if keyword in x else False))]['time'],
            xbins=dict(
                    start=dt.datetime(2013, 1, 1),
                    end=dt.datetime(2018, 12, 31),
                    size=(to_unix_time(dt.datetime(2019, 4, 1))-to_unix_time(dt.datetime(2019, 1, 1)))
                ),
            name=keyword,
            opacity=0.75
        )
    )

layout = go.Layout(barmode='stack')
fig = go.Figure(data=data, layout=layout)
py.init_notebook_mode(connected=True)
py.iplot(fig, filename='overlaid histogram')

In [None]:
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
data = []

def to_unix_time(datet):
    epoch =  dt.datetime.utcfromtimestamp(0)
    return (datet - epoch).total_seconds() * 1000

group_labels = ['minecraft', 'calisthenics', 'sex', 'dating', 'brexit', 'fail', 'minimalist', 'compilation', 'london', 'vegan', 'world', 'trump', 'tinder', 'react', 'flutter', 'summer', 'gopro']
# group_labels = ['calisthenics', 'workout', 'vegan', 'dating', 'minimalist', 'mimalism', 'vegetarian']

# group_labels = [x for x,_ in bag.most_common(10)]
print(group_labels)
hist_data = []


trace = go.Scatter(
    x = wh[wh['key'].apply(lambda x: (True if keyword in x else False))]['time'],
    y = random.sample(rang(0, 1), len(x))
    
)

for keyword in group_labels:
    data.append(
        go.Histogram(
            x=wh[wh['key'].apply(lambda x: (True if keyword in x else False))]['time'],
            xbins=dict(
                    start=dt.datetime(2013, 1, 1),
                    end=dt.datetime(2018, 12, 31),
                    size=(to_unix_time(dt.datetime(2019, 4, 1))-to_unix_time(dt.datetime(2019, 1, 1)))
                ),
            name=keyword,
            opacity=0.75
        )
    )

layout = go.Layout(barmode='stack')
fig = go.Figure(data=data, layout=layout)
py.init_notebook_mode(connected=True)
py.iplot(fig, filename='overlaid histogram')

In [None]:
import plotly.offline as py
import plotly.figure_factory as ff

import numpy as np
# group_labels = ['minecraft', 'calisthenics', 'sex', 'dating', 'brexit', 'fail', 'minimalist', 'compilation', 'london', 'vegan', 'world', 'trump', 'tinder', 'react', 'flutter', 'summer', 'gopro']
group_labels = ['flutter', 'android', 'react', 'python', 'java']
hist_data=[]
for keyword in group_labels:
    hist_data.append(wh[wh['key'].apply(lambda x: (True if keyword in x else False))]['time'].apply(lambda x: 1970 + (to_unix_time(x))/31536000000))

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, show_hist=False)

# Add title
fig['layout'].update(title='Curve and Rug Plot')

# Plot!
py.iplot(fig, filename='Curve and Rug')

In [None]:
import plotly.figure_factory as ff
# Group data together
group_labels = [x for x,_ in bag.most_common(10)]
hist_data = []
for keyword in group_labels:
    reply_list = list(wh[wh['key'].apply(lambda x: (True if keyword in x else False))]['time'])
    reply_list = [to_unix_time(x) for x in reply_list]
    hist_data.append(reply_list)

    
    
# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels)

# Plot!
py.iplot(fig, filename='Distplot with Multiple Datasets')

In [None]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# bins = np.arange(datetime(2013, 1, 1), datetime(2019, 1, 1), timedelta(days=30)).astype(datetime)
bins = np.linspace(0, 360, 10)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
# generate some random data (approximately over 5 years)

# convert the epoch format to matplotlib date format 
mpl_data = wh[wh['freq'] == 1]['time']


# plot it
fig, ax = plt.subplots(1,1)
for keyword in ['london', 'calisthenics', 'fails']:
    mpl_data = wh[wh['key'].apply(lambda x: (True if keyword in x else False))]['time']
    ax.hist(mpl_data, bins=50, alpha=0.5, label=keyword)
locator = mdates.AutoDateLocator()
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(locator))
plt.show()

In [None]:
wh[wh['key'].apply(lambda x: (True if 'halo' in x else False))]

In [None]:
def levenshtein(seq1, seq2):  
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros ((size_x, size_y))
    for x in range(size_x):
        matrix [x, 0] = x
    for y in range(size_y):
        matrix [0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            if seq1[x-1] == seq2[y-1]:
                matrix [x,y] = min(
                    matrix[x-1, y] + 1,
                    matrix[x-1, y-1],
                    matrix[x, y-1] + 1
                )
            else:
                matrix [x,y] = min(
                    matrix[x-1,y] + 1,
                    matrix[x-1,y-1] + 1,
                    matrix[x,y-1] + 1
                )
    return (matrix[size_x - 1, size_y - 1])

In [None]:
import itertools
list_words = [x for x,_ in bag.most_common(200)]
for a,b in itertools.product(list_words, repeat=2):
    ld = levenshtein(a, b)
    if ld != 0 and ld < 3:
        print(a,b)

In [None]:
1970 + (to_unix_time(dt.datetime(2019,1,1)))/31536000000

In [None]:
from collections import defaultdict
d = defaultdict(list)
d['programming'] = ['flutter', 'react', 'python', 'code', 'ansible', 'programming', 'coding']
d['diet'] = ['vegan', 'food', 'cooking', 'vegetarian']
d['politics'] = ['trump', 'brexit']
d['dating'] = ['date', 'dating', 'tinder', 'bumble', 'girlfriend']
d['minimalism'] = ['minimalism', 'essentialism', 'minimalist']
d['exercise'] = ['calisthenics', 'gym', 'freeletics', 'thenx']
d['videogames'] = ['halo', 'minecraft', 'xbox', 'gameplay']
d['compilations'] = ['compilation', 'fail', 'failarmy', 'fails']
d['routines'] = ['routine', 'habits']
for key, array in d.items():
    print(array)

In [None]:
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
data = []

def to_unix_time(datet):
    epoch =  dt.datetime.utcfromtimestamp(0)
    return (datet - epoch).total_seconds() * 1000

group_labels = ['minecraft', 'calisthenics', 'sex', 'dating', 'brexit', 'fail', 'minimalist', 'compilation', 'london', 'vegan', 'world', 'trump', 'tinder', 'react', 'flutter', 'summer', 'gopro']
# group_labels = ['calisthenics', 'workout', 'vegan', 'dating', 'minimalist', 'mimalism', 'vegetarian']

# group_labels = [x for x,_ in bag.most_common(10)]
print(group_labels)
hist_data = []

for key, array in d.items():
    set_obj = set()
    for idea in array:
        set_obj = set_obj.union(wh[wh['key'].apply(lambda x: (True if idea in x else False))]['time'])
    print(key)
    data.append(
        go.Histogram(
            x=list(set_obj),
            xbins=dict(
                    start=dt.datetime(2013, 1, 1),
                    end=dt.datetime(2018, 12, 31),
                    size=(to_unix_time(dt.datetime(2019, 4, 1))-to_unix_time(dt.datetime(2019, 1, 1)))
                ),
            name=key,
            opacity=0.75\
        )
    )

layout = go.Layout(barmode='stack')
fig = go.Figure(data=data, layout=layout)
py.init_notebook_mode(connected=True)
py.iplot(fig, filename='overlaid histogram')

In [None]:
import spacy

nlp = spacy.load('en_core_web_md')
tokens = nlp(u'dog cat banana afskfsd')

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)