In [1]:
import pandas as pd
import csv
import sqlite3
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import urllib
from scipy.stats import mannwhitneyu
import re
import os.path
from langdetect import detect
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
import fastcluster
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from scipy.spatial.distance import squareform
import sys
from scipy.cluster.hierarchy import fcluster, dendrogram
from nltk.stem.porter import PorterStemmer
import math
import gensim 
import hdbscan
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import warnings
warnings.filterwarnings('ignore')
sys.setrecursionlimit(100000)
matplotlib.style.use('ggplot')

## YouTube - Exploratory Analyses
### Loading the dataset
First, let us load the dataset from the database.

In [3]:
con = sqlite3.connect('youtube.db')
video = pd.read_sql_query('''SELECT v.autoId as autoId, 
                                    v.id as id,
                                    v.categoryId as categoryId,
                                    v.channelId as channelId,
                                    v.publishedAt as publishedAt,
                                    v.title as title,
                                    v.description as description,
                                    v.viewCount as viewCount,
                                    v.likeCount as likeCount,
                                    v.dislikeCount as dislikeCount,
                                    v.favoriteCount as favoriteCount,
                                    v.commentCount as commentCount,
                                    v.duration as duration,
                                    v.defaultLanguage as defaultLanguage,
                                    c.title as channelTitle,
                                    c.description as channelDescription,
                                    c.publishedAt as channelPublishedAt,
                                    c.viewCount as channelViewCount,
                                    c.commentCount as channelCommentCount,
                                    c.subscriberCount as channelSubscriberCount,
                                    c.videoCount as channelVideoCount,
                                    c.country as channelCountry
                                    from video v left join channel c on v.channelId = c.id limit 130000''', con)

Next, let's examine the count of videos we collected. The tables are ``video``, ``url``, ``urlResolve``, and ``category``.

In [4]:
print video.shape

(130000, 22)


In [5]:
list(video.columns.values)

['autoId',
 'id',
 'categoryId',
 'channelId',
 'publishedAt',
 'title',
 'description',
 'viewCount',
 'likeCount',
 'dislikeCount',
 'favoriteCount',
 'commentCount',
 'duration',
 'defaultLanguage',
 'channelTitle',
 'channelDescription',
 'channelPublishedAt',
 'channelViewCount',
 'channelCommentCount',
 'channelSubscriberCount',
 'channelVideoCount',
 'channelCountry']

In [6]:
print video.shape

(130000, 22)


### Examining the affiliate video descriptions

What languages are these descriptions in?

In [7]:
def get_language(x):
    language = 'Unknown'
    try:
        language = detect(x.description.strip())
    except:
        pass
    return language

vids = video.apply(get_language, axis=1)
print vids.value_counts()

en         44354
Unknown    38227
es          6506
pt          5044
ru          4035
de          3323
ja          2737
fr          2730
ko          2194
ar          1936
it          1924
id          1566
tr          1155
ca          1067
ro          1033
vi          1027
nl           936
th           876
pl           874
tl           706
so           641
sv           549
et           500
af           479
no           478
da           421
hr           401
bg           394
cy           354
hu           343
fi           332
sw           330
sl           271
cs           244
el           224
he           214
zh-cn        188
uk           187
lt           184
sk           164
sq           158
zh-tw        143
mk           112
fa           107
bn            79
lv            78
hi            63
ur            30
ta            24
ne            15
ml            14
te            13
mr            11
pa             2
gu             2
kn             1
dtype: int64


In [8]:
videos_en = video[vids == 'en'].copy()

In [9]:
videos_en.shape

(44354, 22)

In [2]:
#could write to csv
# with open('videos_en.tsv', 'wb') as csvfile:
#     writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
#     count = 0
#     for index, row in videos_en.iterrows():
#         try:
#             writer.writerow([row["id"].encode('utf-8'), row["description"].encode('utf-8'), row["channelTitle"].encode('utf-8')])
#         except:
#             count += 1
#     print count

# videos = []
# with open("videos_en.tsv", 'rb') as csvfile:
#     reader = csv.reader(csvfile, delimiter='\t')
#     for row in reader:
#         videos.append([row[0], row[1], row[2]])
        
# videos_en = pd.DataFrame(videos, columns=['id', 'description', 'channelTitle'])
# videos = 0
        

In [4]:
# Add a sentence number value here
rows = []

def description_parse(desc):
    sentences = []
    for line in desc.splitlines():
        for sent in sent_tokenize(line):
            sentences.append(sent)
    
    return sentences

sentences = []
count = 0
for index, row in videos_en.iterrows():
    try:
        sentences = description_parse(row.description)
        for sent in sentences:
            if len(tokenize(sent)) != 0:
                rows.append([row['id'], row['channelTitle'], row['description'], sent])
    except:
        count+=1

videos_en_new = pd.DataFrame(rows, columns=['id', 'channelTitle', 'description', 'sentence'])
del videos_en #clear this from memory
print videos_en_new.shape
print count

(230749, 4)
871


In [7]:
with open('sentences_en.tsv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    count = 0
    for index, row in videos_en_new.iterrows():
        try:
            writer.writerow([row["id"].encode('utf-8'), row["description"].encode('utf-8'), row["channelTitle"].encode('utf-8'), row["sentence"].encode('utf-8')])
        except UnicodeDecodeError:
            count += 1
    print count

75404
