In [1]:
import pandas as pd
import csv
import sqlite3
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import urllib
from scipy.stats import mannwhitneyu
import re
import os.path
from langdetect import detect
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
import fastcluster
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from scipy.spatial.distance import squareform
import sys
from scipy.cluster.hierarchy import fcluster, dendrogram
from nltk.stem.porter import PorterStemmer
import math
import gensim 
import hdbscan
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import warnings
# Blanket filter: every warning category is suppressed for the rest of the
# session, including deprecation and numerical warnings from the libraries above.
warnings.filterwarnings('ignore')
# Raise the interpreter's recursion limit to 100000.
# NOTE(review): presumably needed for the hierarchical-clustering / dendrogram
# code imported above on large inputs — TODO confirm.
sys.setrecursionlimit(100000)
# Apply matplotlib's 'ggplot' style sheet to all subsequent figures.
matplotlib.style.use('ggplot')

## YouTube - Exploratory Analyses
### Loading the dataset
First, let us load the dataset from the database.

In [2]:
# Open the SQLite database file (path is relative to the notebook's working directory).
con = sqlite3.connect('youtube.db')
# Load one row per video. The LEFT JOIN keeps videos that have no matching
# row in `channel`; their channel* columns come back as NULL. Channel columns
# are aliased with a `channel` prefix so they do not clash with the video
# columns of the same name (title, description, publishedAt, viewCount, commentCount).
video = pd.read_sql_query('''SELECT v.autoId as autoId, 
                                    v.id as id,
                                    v.categoryId as categoryId,
                                    v.channelId as channelId,
                                    v.publishedAt as publishedAt,
                                    v.title as title,
                                    v.description as description,
                                    v.viewCount as viewCount,
                                    v.likeCount as likeCount,
                                    v.dislikeCount as dislikeCount,
                                    v.favoriteCount as favoriteCount,
                                    v.commentCount as commentCount,
                                    v.duration as duration,
                                    v.defaultLanguage as defaultLanguage,
                                    c.title as channelTitle,
                                    c.description as channelDescription,
                                    c.publishedAt as channelPublishedAt,
                                    c.viewCount as channelViewCount,
                                    c.commentCount as channelCommentCount,
                                    c.subscriberCount as channelSubscriberCount,
                                    c.videoCount as channelVideoCount,
                                    c.country as channelCountry
                                    from video v left join channel c on v.channelId = c.id''', con)

Next, let's examine how many videos we collected. The database tables are ``video``, ``channel`` (joined in the query above), ``url``, ``urlResolve``, and ``category``.

In [3]:
# (rows, columns) of the joined video/channel frame.
print(video.shape)

(515999, 22)


In [4]:
# Column names of the joined frame, in query order.
video.columns.tolist()

['autoId',
 'id',
 'categoryId',
 'channelId',
 'publishedAt',
 'title',
 'description',
 'viewCount',
 'likeCount',
 'dislikeCount',
 'favoriteCount',
 'commentCount',
 'duration',
 'defaultLanguage',
 'channelTitle',
 'channelDescription',
 'channelPublishedAt',
 'channelViewCount',
 'channelCommentCount',
 'channelSubscriberCount',
 'channelVideoCount',
 'channelCountry']

In [5]:
# Shape check repeated before the language analysis; no cell in between modifies `video`.
print(video.shape)

(515999, 22)


### Examining the affiliate video descriptions

What languages are these descriptions in?

In [6]:
def get_language(x):
    """Return the language code detected for one row's description.

    Parameters
    ----------
    x : row-like object
        Anything exposing a ``description`` attribute; in this notebook it is
        a row of ``video`` passed by ``video.apply(get_language, axis=1)``.

    Returns
    -------
    Whatever ``detect`` returns for the stripped description, or the string
    ``'Unknown'`` when the description is missing, empty, or detection fails.
    """
    language = 'Unknown'
    description = getattr(x, 'description', None)
    if not description:
        # NULL / empty descriptions give the detector nothing to work with.
        return language
    try:
        language = detect(description.strip())
    except Exception:
        # Best-effort labelling: any detection failure keeps 'Unknown'.
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate and the long-running apply() over the
        # whole frame can be interrupted.
        pass
    # NOTE(review): langdetect documents its results as non-deterministic
    # unless DetectorFactory.seed is set — consider seeding for reproducible counts.
    return language

# Label every row of `video` with the language detected for its description
# (row-wise apply, so get_language receives one row at a time).
vids = video.apply(get_language, axis=1)
# Tally of rows per language label.
print(vids.value_counts())

en         174699
Unknown    152337
es          26259
pt          20001
ru          16068
de          13311
fr          10916
ja          10804
ko           8359
ar           7724
it           7602
id           6305
tr           4588
vi           4210
ca           4182
ro           4163
nl           3797
pl           3586
th           3379
tl           2859
so           2554
sv           2136
et           2073
af           1917
no           1891
da           1637
hr           1598
bg           1561
cy           1483
fi           1381
sw           1342
hu           1331
cs           1059
sl           1048
el            898
he            831
sk            753
lt            716
zh-cn         695
uk            683
sq            595
zh-tw         555
mk            484
fa            415
lv            286
bn            268
hi            227
ta            101
ur             97
te             56
ml             51
ne             46
mr             40
pa             20
kn             14
gu        

In [7]:
# Keep only the rows labelled 'en'; .copy() gives an independent frame so
# later edits do not touch `video`.
videos_en = video.loc[vids == 'en'].copy()

In [8]:
# (rows, columns) of the English-only subset.
videos_en.shape

(174699, 22)

In [9]:
# Dump the English-only subset to a headerless, tab-separated file with the
# columns id, description, channelTitle.
with open('videos_en_1.tsv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    # Nothing below increments this: the error handler that used it is
    # disabled, so the value printed at the end is always 0.
    count = 0
    for index, row in videos_en.iterrows():
        record = []
        for column in ("id", "description", "channelTitle"):
            value = row[column]
            # Text is encoded to UTF-8 bytes before writing; falsy values
            # (None, empty string) are passed through unchanged.
            record.append(value.encode('utf-8') if value else value)
        writer.writerow(record)
    print(count)

# Disabled alternative: rebuild `videos_en` from a previously written TSV
# instead of recomputing it (note that it reads "videos_en.tsv", not the
# "videos_en_1.tsv" written above).
# videos = []
# with open("videos_en.tsv", 'rb') as csvfile:
#     reader = csv.reader(csvfile, delimiter='\t')
#     for row in reader:
#         videos.append([row[0], row[1], row[2]])

# videos_en = pd.DataFrame(videos, columns=['id', 'description', 'channelTitle'])
# videos = 0
        

0


In [10]:
# TODO: also record each sentence's position (sentence number) within its description.
rows = []

def description_parse(desc):
    """Split a description into sentences, one physical line at a time.

    Each line of ``desc`` (as produced by ``splitlines()``) is passed to
    ``sent_tokenize`` separately, so a line break always ends a sentence.

    Parameters
    ----------
    desc : text or None
        The full description.

    Returns
    -------
    list
        The sentences in order of appearance; an empty list when ``desc`` is
        None or empty.
    """
    if not desc:
        # A missing description has no .splitlines(); treat it like an empty one.
        return []
    return [sent for line in desc.splitlines() for sent in sent_tokenize(line)]

# Explode each English description into one output row per non-empty sentence.
sentences = []
count = 0   # never incremented (its error handler is disabled); still printed below
count1 = 0  # rows skipped because the description was missing or empty
for index, row in videos_en.iterrows():
    if not row.description:
        count1 += 1
        # Skip the row: there is nothing to tokenize, and a None description
        # would otherwise be handed to description_parse.
        continue
    sentences = description_parse(row.description)
    for sent in sentences:
        # Drop sentences that yield no tokens at all (e.g. whitespace-only).
        if word_tokenize(sent):
            rows.append([row['id'], row['channelTitle'], row['description'], sent])

videos_en_new = pd.DataFrame(rows, columns=['id', 'channelTitle', 'description', 'sentence'])
# `videos_en` could be deleted here to free memory; it is kept for now.
print(videos_en_new.shape)
print(count1)
print(count)

(1138630, 4)
0
0


In [11]:
# Dump the per-sentence frame to a headerless, tab-separated file with the
# columns id, description, channelTitle, sentence.
with open('sentences_en1.tsv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    count = 0  # rows skipped because a field could not be encoded
    for index, row in videos_en_new.iterrows():
        try:
            # Encoding happens inside the try: on Python 2, calling
            # .encode('utf-8') on a byte string with non-ASCII content first
            # decodes it as ASCII and raises UnicodeDecodeError — the error
            # this handler exists for. Falsy values (None, empty string) are
            # written unchanged.
            record = [value.encode('utf-8') if value else value
                      for value in (row["id"], row["description"],
                                    row["channelTitle"], row["sentence"])]
            writer.writerow(record)
        except UnicodeDecodeError:
            count += 1
    print(count)

0
