In [2]:
import datetime
import os
import random
from urllib.parse import urlparse

import isodate
import numpy as np
import pandas as pd
from apiclient.discovery import build

In [3]:
# Read the API key from the environment so a real key never has to be pasted into the
# notebook; the original placeholder is kept as the fallback when the variable is unset.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY", "YOURAPIKEY")
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# Client object used for every API request further down.
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

# The API has a daily rate limit, so the results of previous days are stored in result_vN files.
# df holds the click data; results_v1 holds the rows already enriched on an earlier day
# (presumably one row per click -- confirm against the CSV).
df = pd.read_csv("DIRECTORY/youtube_clicks.csv")
results_v1 = pd.read_csv("DIRECTORY/results/result_v1.csv")

#### List of unique video IDs

In [4]:
# Some videos were watched repeatedly over the years; skip every ID that already has
# API info from an earlier run so no request quota is spent on it again.
done = results_v1["video_id"].unique()
already_fetched = df["Video_ID"].isin(done)
videos = df.loc[~already_fetched, "Video_ID"].unique()

In [5]:
# Number of distinct video IDs that already have API info from an earlier run
done.shape

(1408,)

In [6]:
# Number of distinct video IDs that still have to be requested from the API
videos.shape

(7213,)

#### Empty Dataframe

In [7]:
# Columns of the per-video table: the video ID plus every field read from the API response.
column_names = [
    "video_id",
    "categoryId",
    # contentDetails fields
    "duration", "dimension", "definition", "caption", "licensedContent", "projection",
    # snippet fields
    "publishedAt", "description", "tags", "defaultAudioLanguage",
    # statistics fields
    "viewCount", "likeCount", "dislikeCount", "favoriteCount", "commentCount",
]

# Start from an empty frame that already carries the expected columns.
df_video = pd.DataFrame(columns=column_names)

#### Get video info for each video_id

In [9]:
# Field names to read from each part of the API response for a video.
# Looked up under response["items"][0]["statistics"]
stats = [
    "viewCount",
    "likeCount",
    "dislikeCount",
    "favoriteCount",
    "commentCount",
]
# Looked up under response["items"][0]["snippet"]
snippet = [
    "publishedAt",
    "description",
    "tags",
    "defaultAudioLanguage",
    "categoryId",
]
# Looked up under response["items"][0]["contentDetails"]
content = [
    "duration",
    "caption",
    "definition",
    "dimension",
    "licensedContent",
    "projection",
]

In [10]:
def _video_record(vid, response):
    """Flatten one `videos().list` response into a single row (dict) for `vid`.

    Every field listed in `snippet`, `content` and `stats` is read from the first
    entry of response["items"]. A field that is absent -- including the case where
    "items" is empty -- is stored as the string "no data".
    """
    items = response.get("items") or [{}]
    item = items[0]
    record = {"video_id": vid, "got_info": "yes"}
    for part, fields in (("snippet", snippet), ("contentDetails", content), ("statistics", stats)):
        part_data = item.get(part) or {}
        for field in fields:
            record[field] = part_data.get(field, "no data")
    return record


records = []        # one dict per video whose request succeeded
failed_videos = []  # IDs whose request raised; they get no row in df_video

for vid in videos:
    try:
        response = youtube.videos().list(part='snippet, contentDetails, statistics', id=vid).execute()
    except Exception:
        # Catch Exception rather than using a bare except so Ctrl-C still stops the loop.
        # A failed request is skipped, as before, and its ID is remembered for inspection.
        failed_videos.append(vid)
        continue
    records.append(_video_record(vid, response))

# Build the frame once instead of appending row by row (quadratic, and DataFrame.append
# no longer exists in pandas >= 2.0). Columns are sorted alphabetically, which is the
# order the previous append(..., sort=True) produced.
df_video = pd.DataFrame(records, columns=sorted(column_names + ["got_info"]))

In [12]:
# Sanity check: rows = videos with fetched info, columns = column_names plus got_info
df_video.shape

(7213, 18)

In [44]:
# Save progress; meant to be run once all API requests for the day have finished
df_video.to_csv("YOUR/DIRECTORY/video_info_v2.csv", index=False)

In [45]:
# Load the video info that was fetched and saved on the previous day
df_video_v1 = pd.read_csv("YOUR/DIRECTORY/video_info_v1.csv")

In [46]:
# videos_total holds the video API info from all requests so far: previous days plus today.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0;
# like append, it keeps the original row labels and does not sort the columns.
videos_total = pd.concat([df_video_v1, df_video])
videos_total.shape

(2277, 18)

#### Join Video_ID Infos to Video_Clicks

We now merge the API info into our click info (date watched, watched after search, etc.)

In [47]:
# Left join on the video ID: every click row is kept, with API columns attached where available
result = df.merge(videos_total, how="left", left_on="Video_ID", right_on="video_id")

And keep only the videos whose API info was retrieved, which marks them as done

In [48]:
# Keep only the click rows whose video info was fetched successfully
has_info = result["got_info"] == "yes"
result = result[has_info]

In [50]:
# Persist the enriched click rows (index omitted so the CSV contains data columns only)
result.to_csv("DIRECTORY/result_v2.csv", index = False)

### Turn categoryID into category_name

In [23]:
# Category names keyed by their numeric categoryId; IDs that are not listed have no name.
_category_names_by_id = {
    1: 'Film & Animation',
    2: 'Autos & Vehicles',
    10: 'Music',
    15: 'Pets & Animals',
    17: 'Sports',
    18: 'Short Movies',
    19: 'Travel & Events',
    20: 'Gaming',
    21: 'Videoblogging',
    22: 'People & Blogs',
    23: 'Comedy',
    24: 'Entertainment',
    25: 'News & Politics',
    26: 'Howto & Style',
    27: 'Education',
    28: 'Science & Technology',
    29: 'Nonprofits & Activism',
    30: 'Movies',
    31: 'Anime/Animation',
    32: 'Action/Adventure',
    33: 'Classics',
    34: 'Comedy',
    35: 'Documentary',
    36: 'Drama',
    37: 'Family',
    38: 'Foreign',
    39: 'Horror',
    40: 'Sci-Fi/Fantasy',
    41: 'Thriller',
    42: 'Shorts',
    43: 'Shows',
    44: 'Trailers',
}

# Lookup list: position i holds the name for categoryId i, '' where no name is defined.
categories = [_category_names_by_id.get(i, '') for i in range(45)]

In [24]:
def find_cat(x, names=None):
    """Translate a categoryId into its category name.

    Parameters
    ----------
    x : str, int, float or None
        The categoryId of one video. May also be one of the placeholder strings
        "no data", "rate limit" or "no category", or a missing value (None / NaN).
    names : sequence of str, optional
        Lookup table indexed by categoryId. Defaults to the module-level `categories`.

    Returns
    -------
    str or None
        The entry of `names` at position `x`; None when `x` is a placeholder, is
        missing, cannot be read as a number, or lies outside the table.
    """
    if names is None:
        names = categories
    if x in ("no data", "rate limit", "no category"):
        return None
    try:
        # float() first so values such as "22.0" or 22.0 are accepted as well;
        # NaN, None and non-numeric strings raise here and fall through to None.
        idx = int(float(x))
    except (TypeError, ValueError, OverflowError):
        return None
    # Reject negatives explicitly: a negative index would silently wrap around the list.
    if not 0 <= idx < len(names):
        return None
    return names[idx]

In [25]:
# Look up the category name for every row's categoryId
category_ids = videos_total["categoryId"]
videos_total["category_name"] = category_ids.apply(find_cat)

### Convert duration to Timedelta and get total Seconds

In [62]:
# Load the click-info table from disk.
# NOTE(review): this rebinds videos_total, replacing the in-memory frame built above
# (including its category_name column) -- confirm that is intended.
videos_total = pd.read_csv("PATH/TO/DIRECTORY/youtube_clicks_info.csv")

In [57]:
# Keep only rows with a usable ISO 8601 duration. Comparing against the string "NaN" does
# not remove real missing values (pandas reads empty/"NaN" CSV cells as float NaN), and
# parse_duration fails on those, so they are dropped explicitly with notna().
has_duration = (
    results_final["duration"].notna()
    & (results_final["duration"] != "no data")
    & (results_final["duration"] != "NaN")
)
# .copy() makes the filtered frame independent, so the column assignments below do not
# trigger SettingWithCopyWarning.
results_final = results_final[has_duration].copy()

# ISO 8601 string (e.g. "PT4M13S") -> duration object -> length in seconds
results_final["duration"] = results_final["duration"].apply(isodate.parse_duration)
results_final["duration in seconds"] = results_final["duration"].apply(lambda d: d.total_seconds())

In [65]:
# Count the rows whose Year is 2016.
# NOTE(review): the column listing printed further down shows `year` in lowercase --
# confirm which spelling the loaded frame actually uses.
results_final[results_final.Year == 2016].shape

(203, 30)

### Subtract PublishedAt from Clicked_Date

In [46]:
# List the available columns before deriving the time-based features below
results_final.columns

Index(['Video_Title', 'Video_URL', 'Video_Channel', 'video_id', 'Video_Date',
       'year', 'hour', 'month', 'dayofweek', 'category', 'category_name',
       'caption', 'commentCount', 'defaultAudioLanguage', 'definition',
       'description', 'dimension', 'dislikeCount', 'duration', 'favoriteCount',
       'got_info', 'licensedContent', 'likeCount', 'projection', 'publishedAt',
       'tags', 'viewCount', 'duration in seconds'],
      dtype='object')

In [47]:
# Parse both timestamps as timezone-aware UTC: subtracting a tz-naive series from a
# tz-aware one raises a TypeError. publishedAt values that cannot be parsed
# (e.g. "no data") become NaT, so their difference is NaT as well.
clicked_at = pd.to_datetime(results_final["Video_Date"], utc=True)
published_at = pd.to_datetime(results_final["publishedAt"], errors="coerce", utc=True)

# Time between the video's publication and the moment it was watched
results_final["time_online_until_watched"] = clicked_at - published_at

Convert time to days

In [53]:
# Whole-day component of each timedelta, taken with the vectorised .dt accessor instead of
# a per-row lambda; rows where the timedelta is NaT yield NaN.
results_final["days_online_until_watched"] = results_final["time_online_until_watched"].dt.days

In [67]:
# Load the exported results table.
# NOTE(review): results_final is first assigned here, yet the cells above already use it
# (the execution counts are out of order). Move this load before its first use so the
# notebook survives Restart Kernel -> Run All.
results_final = pd.read_csv("EXPORT/PATH/results/result_v6.csv")