In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from datetime import datetime as dt
%matplotlib inline
# Widen pandas' display limits so wide/long frames are not truncated in output.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 100)

Read json text

In [2]:
import json

# Load the raw API responses. The context manager guarantees the file handle
# is closed (the previous open(...).read() leaked it), and the explicit
# encoding avoids depending on the platform's default text encoding.
with open('../data/video_json.txt', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Peek at one entry to see the structure of a response.
json_data[1]

{'etag': '"DuHzAJ-eQIiCIp7p4ldoVcVAOeY/tAr09q1XjvqiiqWnGh8Xg2P8mzg"',
 'items': [{'etag': '"DuHzAJ-eQIiCIp7p4ldoVcVAOeY/0-ze2anWOfyPO-EEuYJurUB2bcs"',
   'id': 'lLWEXRAnQd0',
   'kind': 'youtube#video',
   'snippet': {'categoryId': '24',
    'channelId': 'UCxcnsr1R5Ge_fbTu5ajt8DQ',
    'channelTitle': 'Bob Ross',
    'defaultAudioLanguage': 'en',
    'defaultLanguage': 'en',
    'description': 'Take a walk with Bob Ross down a little lakeside path in a secluded place; you’ll delight in the discovery of a small uninhabited island. \n\nSeason 29 of The Joy of Painting with Bob Ross features the following wonderful painting instructions: island in the Wilderness, Autumn Oval, Seasonal Progression, Light at the Summit, Countryside Barn, Mountain Lake Falls, Cypress Creek, Trapper’s Cabin, Storm on the Horizon, Pot O’ Posies, A Perfect Winter Day, Aurora’s Dance, and Woodman’s Retreat.\n\nSubscribe to the official Bob Ross YouTube channel - http://bit.ly/BobRossSubscribe\n\nSeason 29 Playli

This sample entry suggests that the relevant information is in ```items```, which is a list with just one entry. Is that true for every entry?

In [3]:
# True when no response has an `items` list whose length differs from 1.
not any(len(entry['items']) != 1 for entry in json_data)

True

Define a function to extract parts from the ```items``` list

In [4]:
def extract_record(js):
    """Flatten one response dict into a single flat record.

    Reads the first element of ``js['items']`` and returns a new dict holding,
    in this order: its ``id``, its ``etag``, every entry of its ``statistics``
    mapping with the value converted via ``int``, and every entry of its
    ``snippet`` mapping unchanged. Later keys overwrite earlier ones on a
    name clash.

    Raises:
        KeyError / IndexError: if an expected key or the first item is missing.
        ValueError: if a statistics value cannot be converted by ``int``.
    """
    item = js['items'][0]
    # A single dict display keeps the same left-to-right evaluation and
    # insertion order as building the record step by step.
    return {
        'id': item['id'],
        'etag': item['etag'],
        **{name: int(raw) for name, raw in item['statistics'].items()},
        **item['snippet'],
    }

In [5]:
# Flatten every response, then build one DataFrame from all records at once.
records = list(map(extract_record, json_data))
record_df = pd.DataFrame.from_records(records)
print("Read %d video records" % len(record_df))
record_df.head(2)

Read 446 video records


Unnamed: 0,categoryId,channelId,channelTitle,commentCount,defaultAudioLanguage,defaultLanguage,description,dislikeCount,etag,favoriteCount,id,likeCount,liveBroadcastContent,localized,publishedAt,tags,thumbnails,title,viewCount
0,24,UCxcnsr1R5Ge_fbTu5ajt8DQ,Bob Ross,1589,en,,Bundle up with Bob Ross when you visit this to...,311,"""DuHzAJ-eQIiCIp7p4ldoVcVAOeY/YgvQsJek2VuEzG0fc...",0,TohG7F8M3Ls,14005,none,{'title': 'Bob Ross - Glacier Lake (Season 28 ...,2016-11-20T21:00:01.000Z,"[twitch, the joy of painting, asmr, happy trai...",{'default': {'url': 'https://i.ytimg.com/vi/To...,Bob Ross - Glacier Lake (Season 28 Episode 6),1160972
1,24,UCxcnsr1R5Ge_fbTu5ajt8DQ,Bob Ross,21209,en,en,Take a walk with Bob Ross down a little lakesi...,2249,"""DuHzAJ-eQIiCIp7p4ldoVcVAOeY/0-ze2anWOfyPO-EEu...",0,lLWEXRAnQd0,152477,none,{'title': 'Bob Ross - Island in the Wilderness...,2016-11-28T21:00:02.000Z,"[steven ross, bob ross inc, pastel, bob ross j...",{'default': {'url': 'https://i.ytimg.com/vi/lL...,Bob Ross - Island in the Wilderness (Season 29...,7529170


In [6]:
# Keep only the title, the id and the count columns, then index by video id.
slim_columns = [
    'title',
    'id',
    'viewCount',
    'likeCount',
    'favoriteCount',
    'dislikeCount',
    'commentCount',
]
record_df_slim = record_df[slim_columns]
df = record_df_slim.set_index('id')
df.head()

Unnamed: 0_level_0,title,viewCount,likeCount,favoriteCount,dislikeCount,commentCount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TohG7F8M3Ls,Bob Ross - Glacier Lake (Season 28 Episode 6),1160972,14005,0,311,1589
lLWEXRAnQd0,Bob Ross - Island in the Wilderness (Season 29...,7529170,152477,0,2249,21209
kJFB6rH3z2A,Bob Ross - Reflections of Calm (Season 31 Epis...,1432050,17979,0,334,2346
kasGRkfkiPM,Bob Ross - Mountain Summit (Season 13 Episode 10),3041171,22498,0,542,3310
tWoInh2USOs,Bob Ross - Delightful Meadow Home (Season 26 E...,453935,3930,0,96,457


In [7]:
import re

# Named groups become the column names of the extracted frame:
# episode_title, season_num and episode_num. Titles that do not match the
# pattern yield NaN in all three columns.
regex = r'^Bob Ross - (?P<episode_title>[A-Za-z0-9"\'& ]+) \(Season (?P<season_num>\d+) Episode (?P<episode_num>\d+)\)'

# Pass expand explicitly: omitting it triggers a FutureWarning on older pandas
# versions. With several groups the result is a DataFrame either way.
newcols = df['title'].str.extract(regex, expand=True)
newcols.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,episode_title,season_num,episode_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TohG7F8M3Ls,Glacier Lake,28,6
lLWEXRAnQd0,Island in the Wilderness,29,1
kJFB6rH3z2A,Reflections of Calm,31,1
kasGRkfkiPM,Mountain Summit,13,10
tWoInh2USOs,Delightful Meadow Home,26,2


In [8]:
# Attach the extracted title columns to the right of the stats columns.
df = pd.concat(objs=[df, newcols], axis='columns')

In [9]:
# Lower-case the episode titles; the column keeps its position in the frame.
df = df.assign(episode_title=df['episode_title'].str.lower())

In [11]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0_level_0,title,viewCount,likeCount,favoriteCount,dislikeCount,commentCount,episode_title,season_num,episode_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TohG7F8M3Ls,Bob Ross - Glacier Lake (Season 28 Episode 6),1160972,14005,0,311,1589,glacier lake,28,6
lLWEXRAnQd0,Bob Ross - Island in the Wilderness (Season 29...,7529170,152477,0,2249,21209,island in the wilderness,29,1
kJFB6rH3z2A,Bob Ross - Reflections of Calm (Season 31 Epis...,1432050,17979,0,334,2346,reflections of calm,31,1
kasGRkfkiPM,Bob Ross - Mountain Summit (Season 13 Episode 10),3041171,22498,0,542,3310,mountain summit,13,10
tWoInh2USOs,Bob Ross - Delightful Meadow Home (Season 26 E...,453935,3930,0,96,457,delightful meadow home,26,2


In [12]:
# Write the combined frame (index included) to CSV.
df.to_csv(path_or_buf='../data/video_stats2.csv')