# Creation of the dataset


## Cleaning the existing dataset

After the dataset was published, some of its videos became private, and we do not want those in the dataset. The first step is therefore to detect all private videos and remove them.

In [1]:
import youtube_comments as yc

In [2]:
%%time
# Baseline timing: run a single privacy check on a video id that is not
# private (the recorded result for this id is False). The %%time report of
# this cell is the cost of one such check.
yc.is_video_private("7zCIRPQ8qWc")

CPU times: user 38.7 ms, sys: 6.86 ms, total: 45.5 ms
Wall time: 556 ms


False

In [3]:
%%time
# Same timing as the previous check, but on a video id that is private
# (the recorded result for this id is True). The %%time report of this cell
# is the cost of one such check.
yc.is_video_private("fY_FQMQpjok")

CPU times: user 31.1 ms, sys: 3.83 ms, total: 35 ms
Wall time: 384 ms


True

In [9]:
%%time
# Smoke test: stream every comment of one known video and print its text so
# the result of the download can be inspected by eye.
youtube_id = "7zCIRPQ8qWc"
comments = yc.download_comments(youtube_id)
for entry in comments:
    print(entry['text'])

"Hilarious" isn't the word I would use. I would use "boring".﻿
great video !!!!!!!!!!﻿
that's so funny LLLLLLLOOOOOOOOLLLLLLLL!!!!!!!!﻿
Waha haha hahaha﻿
Waha haha hahaha﻿
i couldn't stop laughing and feeling bad for the cats
﻿
ik huilde bijna van het lachen﻿
OMG XD﻿
Суууууууууууууууупер﻿
We were ALL kids once! Adults are even suckier!﻿
True, but nows the time to teach him/her NOT to do that kinda stuff to kitty! :)
They learn quick!﻿
I agree! I saw another video like this and the 3 year old kid threw a cat into the pool! I would've whacked the parents upside the head and swatted the kid on the butt and tell him to NEVER ever be mean to any animal, especially throwing our kitty into water! Yeah, that "baby" looked like he was trying to drown the cat! The parents better watch out for that, it can lead to other things with animals that are hurtful to them! And he could have gotten badly scratched up too.﻿
Epic - Cats the song xD
/watch?v=Nxxu0i9UVMk﻿
The last two aren't really funny. the

# Reading the dataset

In [2]:
# Collect the unique video ids referenced by each split of the dataset.
train_unique = yc.unique_videos_in_dataset('data/comedy_comparisons.train')
test_unique = yc.unique_videos_in_dataset('data/comedy_comparisons.test')

# Persist each id list as plain text, one id per line (no trailing newline).
for out_path, video_ids in (('train_unique', train_unique), ('test_unique', test_unique)):
    with open(out_path, 'w') as out_file:
        out_file.write("\n".join(video_ids))

summary = "Found {0} train unique videos and {1} test videos."
print(summary.format(len(train_unique), len(test_unique)))

Found 18474 train unique videos and 4263 test videos.


In [5]:
# Count the unique, non-private videos in the dataset
# This cell is kept for demonstration purposes: it shows how the check was initially run

from tqdm import tqdm, tqdm_notebook, tnrange
from time import sleep

# Keep only the videos that are not private, for the train and the test split.
# Each split gets its own progress bar so that the bar's total matches the
# number of ids being checked (a single bar sized for train was previously
# reused for test).
pbar = tqdm_notebook(range(len(train_unique)), desc="Downloading video")
train = yc.non_private_videos(train_unique, pbar)
pbar = tqdm_notebook(range(len(test_unique)), desc="Downloading video")
test = yc.non_private_videos(test_unique, pbar)

# Write the surviving ids under data/, one id per line: the cells that
# consume these files open 'data/train_unique_non_private' and
# 'data/test_unique_non_private'.
with open('data/train_unique_non_private', 'w') as t:
    t.write("\n".join(train))
with open('data/test_unique_non_private', 'w') as t:
    t.write("\n".join(test))
print("There are {0} unique non-private videos in train and {1} unique non-private in test".format(len(train), len(test)))

KeyboardInterrupt: 

In [9]:
# Now create other train and test datasets without the private videos
import csv
def clean_dataset(keep_id_file, old_dataset, new_dataset):
    """
    Copy to new_dataset the rows of old_dataset whose first two columns
    are both ids listed in keep_id_file.

    Parameters
    ----------
    keep_id_file : str
        Path of a text file with one id per line. Surrounding whitespace is
        stripped and empty lines are ignored.
    old_dataset : str
        Path of the comma-separated input file. Only the first two columns
        of each row are checked; any further columns are copied unchanged.
    new_dataset : str
        Path of the output file. It is created, or overwritten if it exists.

    Rows with fewer than two columns (for example blank lines) are skipped
    instead of raising IndexError.
    """
    # A set gives the same O(1) membership test as the former dict of True
    # values; empty lines are dropped so that "" never counts as a valid id.
    with open(keep_id_file, "r") as f:
        to_keep = {video_id for video_id in (line.strip() for line in f) if video_id}

    # newline="" is what the csv module expects for files it reads or writes.
    with open(old_dataset, newline="") as old_file:
        with open(new_dataset, "w", newline="") as new_file:
            old_ds = csv.reader(old_file, delimiter=',')
            # csv.writer re-quotes any field that needs it, so the output can
            # always be parsed back; plain fields are written exactly as
            # ",".join(row) + "\n" would write them.
            new_ds = csv.writer(new_file, delimiter=',', lineterminator="\n")
            for row in old_ds:
                if len(row) < 2:
                    # csv.reader yields [] for a blank line.
                    continue
                if (row[0] in to_keep) and (row[1] in to_keep):
                    new_ds.writerow(row)

def num_of_lines(file):
    """Return how many lines the text file at path ``file`` contains."""
    count = 0
    with open(file, 'r') as handle:
        for _line in handle:
            count += 1
    return count
            
# Write the cleaned train and test comparison files, then report the number of
# lines before and after cleaning for each split.
clean_dataset('data/train_unique_non_private', 'data/comedy_comparisons.train', 'data/comedy_comparisons_clean.train')
print("Initial train dataset lines:", num_of_lines('data/comedy_comparisons.train'), "Clean dataset:", num_of_lines('data/comedy_comparisons_clean.train'))
clean_dataset('data/test_unique_non_private','data/comedy_comparisons.test', 'data/comedy_comparisons_clean.test')
# This line reports the test split; its label previously said "train".
print("Initial test dataset lines:", num_of_lines('data/comedy_comparisons.test'), "Clean dataset:", num_of_lines('data/comedy_comparisons_clean.test'))


Initial train dataset lines: 912969 Clean dataset: 874991
Initial train dataset lines: 225593 Clean dataset: 210750


In [11]:
# Select 30 videos at random to then measure the average time to download
import numpy as np
# Walk the cleaned train comparisons in file order. The first time an id is
# seen (in either of the first two columns) it is kept with probability 0.5.
# Reading stops as soon as 30 ids have been kept.
selected = []
file = './data/comedy_comparisons_clean.train'
unique = 0
with open(file) as csvfile:
    encountered = {}
    enough = False
    for row in csv.reader(csvfile, delimiter = ','):
        for column in (0, 1):
            video_id = row[column]
            if video_id in encountered:
                continue
            if np.random.random_sample() < 0.5:
                selected.append(video_id)
                if len(selected) >= 30:
                    # The 30th id ends the scan immediately.
                    enough = True
                    break
            encountered[video_id] = True
            unique += 1
        if enough:
            break

selected



['y2emSXSE-N4',
 'HZPUQQNRvOg',
 'vX95JgKGu0o',
 'wu4SU70w7LA',
 '2FHH-9teZP0',
 'SG2B7dIqbNQ',
 'rhjIStU0JvI',
 'bfU4rTa-PfQ',
 'Lr4SS1zxRYg',
 'g6lJ2k3TCFg',
 'UPTrASEx_p8',
 'plieAqK2a00',
 'JmtITSojOF8',
 'sfQEd-lVHCE',
 'wY-1KNh7-NQ',
 'YWzqfaX4ofU',
 'ot0YhG58PiM',
 'dRZIWb_Nzwo',
 'KPP5NbjW9eI',
 'Uomk60wP2mI',
 'cvoOHcAlY2o',
 'dbEq8b3y8ws',
 '2HFTEykZ8mY',
 'ap9Fp8lBtMo',
 'qrrAB-wu_8s',
 'w8UWgufUIv0',
 'l1Sj3HSXGxw',
 'wHOraxZ3RKM',
 'NA3I4aZk0fQ',
 'PtTwGA6uVcY']

In [33]:
%%time
import csv
import sys
import numpy as np
# Download the comments of one chunk of non-private train videos and append
# them, one CSV row per comment, to data/dataset_movies.csv.
import os
from time import sleep

from tqdm import tqdm_notebook

SIZE = 135   # number of videos per chunk
chunk = 1    # index of the chunk downloaded by this run
OUTPUT_PATH = 'data/dataset_movies.csv'
fieldnames = ['youtube_id', 'cid', 'text', 'time', 'author']

with open('data/train_unique_non_private') as f:
    unique_videos = [line.strip() for line in f]

# NOTE(review): because of the "+ 1", chunk 0 covers indices 1..SIZE, so
# unique_videos[0] is never part of any chunk. The offset is left unchanged so
# that chunk boundaries stay exactly as before - confirm whether it is intended.
selected = unique_videos[((chunk * SIZE) + 1):(((chunk + 1) * SIZE) + 1)]

# The bar is advanced manually, once per video, so its total is the number of
# videos actually selected (the last chunk may hold fewer than SIZE).
pbar = tqdm_notebook(total=len(selected), desc="Downloading comments")

# The output is opened in append mode and this cell is run once per chunk, so
# the header row is written only when the file is new or still empty;
# otherwise every run would insert another header line in the middle of the data.
write_header = (not os.path.exists(OUTPUT_PATH)) or os.path.getsize(OUTPUT_PATH) == 0

# newline='' lets the csv module control line endings (comment text can contain
# line breaks); utf-8 keeps non-ASCII comment text independent of the platform
# default encoding.
with open(OUTPUT_PATH, 'a', newline='', encoding='utf-8') as outputfile:
    writer = csv.DictWriter(outputfile, fieldnames=fieldnames)
    if write_header:
        writer.writeheader()

    def download_video_comments(youtube_id):
        """Append one row per comment of ``youtube_id`` to the open CSV file."""
        for comment in yc.download_comments(youtube_id):
            writer.writerow({'youtube_id': youtube_id, 'cid': comment['cid'], 'text': comment['text'], 'time': comment['time'], 'author': comment['author']})
            sleep(0.1)

    # NOTE(review): errors raised by yc.download_comments are not caught, so a
    # failure aborts the run after some rows were already appended; running the
    # same chunk again would append those rows a second time.
    try:
        for row in selected:
            download_video_comments(row)
            pbar.update(1)
            sleep(0.5)
    finally:
        pbar.close()



            




ReadTimeout: HTTPSConnectionPool(host='www.youtube.com', port=443): Read timed out. (read timeout=None)

In [31]:
# Check that the selected chunk is exactly unique_videos[136:271]: the result
# is False when the two lists are identical. The lists are compared directly;
# all(a) != all(b) would only compare two truthiness flags.
selected != unique_videos[136:271]

False

In [22]:
# Show the id stored at index 136, the lower bound of the slice
# unique_videos[136:271] checked against `selected`.
unique_videos[136]

'1bXeQ7baYEE'

# Notes about progress
* All comments of every video tested were downloaded successfully and can be saved to a CSV file
* However, many of the videos in the dataset are now private and must be removed
* Every row that contains a private video must be eliminated; then the average time to download a video's comments must be estimated (this code is already tested) in order to decide how many videos to use in the train and test datasets
* Finally, create the final script that downloads the dataset

In [10]:
from time import sleep
from tqdm import tqdm, tqdm_notebook, tnrange

# Sanity check of the notebook progress bar: 1000 steps with a 10 ms pause each.
test_bar = tnrange(1000, desc="test loop")
for i in test_bar:
    sleep(0.01)


