In [1]:
import pandas as pd
import csv
import sqlite3
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import urllib
from scipy.stats import mannwhitneyu
import re
import os.path
from langdetect import detect
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import fastcluster
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from scipy.spatial.distance import squareform
import sys
from scipy.cluster.hierarchy import fcluster, dendrogram
from nltk.stem.porter import PorterStemmer
import math
import gensim 
import hdbscan
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import warnings
# Silence every warning for the whole session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Raise the interpreter's recursion limit far above the default.
# NOTE(review): presumably needed for the dendrogram drawn later on a deep linkage tree — confirm.
sys.setrecursionlimit(100000)
# Use the ggplot style for all matplotlib figures.
matplotlib.style.use('ggplot')

## YouTube - Exploratory Analyses
### Loading the dataset
First, let us load the dataset from the database.

In [3]:
# Open the local SQLite database file.
# NOTE(review): the connection is not closed anywhere in this section.
con = sqlite3.connect('youtube.db')
# One row per video, LEFT JOINed to its channel on channelId, so videos without
# a matching channel row are still returned. The query is capped at 130000 rows.
video = pd.read_sql_query('''SELECT v.autoId as autoId, 
                                    v.id as id,
                                    v.categoryId as categoryId,
                                    v.channelId as channelId,
                                    v.publishedAt as publishedAt,
                                    v.title as title,
                                    v.description as description,
                                    v.viewCount as viewCount,
                                    v.likeCount as likeCount,
                                    v.dislikeCount as dislikeCount,
                                    v.favoriteCount as favoriteCount,
                                    v.commentCount as commentCount,
                                    v.duration as duration,
                                    v.defaultLanguage as defaultLanguage,
                                    c.title as channelTitle,
                                    c.description as channelDescription,
                                    c.publishedAt as channelPublishedAt,
                                    c.viewCount as channelViewCount,
                                    c.commentCount as channelCommentCount,
                                    c.subscriberCount as channelSubscriberCount,
                                    c.videoCount as channelVideoCount,
                                    c.country as channelCountry
                                    from video v left join channel c on v.channelId = c.id limit 130000''', con)

Next, let's examine the number of videos we loaded (the query above is capped at 130,000 rows). The tables are ``video``, ``channel`` (joined in the query above), ``url``, ``urlResolve``, and ``category``.

In [4]:
# (rows, columns) of the joined video/channel frame.
print video.shape

(130000, 22)


In [5]:
# Column names returned by the query, in order.
list(video.columns.values)

['autoId',
 'id',
 'categoryId',
 'channelId',
 'publishedAt',
 'title',
 'description',
 'viewCount',
 'likeCount',
 'dislikeCount',
 'favoriteCount',
 'commentCount',
 'duration',
 'defaultLanguage',
 'channelTitle',
 'channelDescription',
 'channelPublishedAt',
 'channelViewCount',
 'channelCommentCount',
 'channelSubscriberCount',
 'channelVideoCount',
 'channelCountry']

In [6]:
# Same shape check as In [4]; no cell in between modifies `video`.
print video.shape

(130000, 22)


### Examining the affiliate video descriptions

What languages are these descriptions in?

In [7]:
def get_language(x):
    """Return the detected language code of one row's description.

    Parameters
    ----------
    x : row-like object exposing a ``description`` attribute
        (a DataFrame row when used with ``video.apply(..., axis=1)``).

    Returns
    -------
    str
        The code returned by ``detect`` or ``'Unknown'`` when the description
        is missing, blank, or the detector raises.
    """
    language = 'Unknown'
    try:
        text = x.description.strip()
        # Blank text carries no signal; skip the detector call entirely.
        if text:
            language = detect(text)
    except Exception:
        # Best effort: a missing description (no .strip()) or a detector
        # failure maps to 'Unknown'.  `Exception` rather than a bare except so
        # that Ctrl-C can still interrupt the long row-wise apply.
        pass
    return language

# langdetect is randomized by default, so repeated runs can label the same
# (especially short) text differently.  Pin its seed so the counts printed
# below are reproducible.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# One detected language label per video row.
vids = video.apply(get_language, axis=1)
print(vids.value_counts())

en         44354
Unknown    38227
es          6506
pt          5044
ru          4035
de          3323
ja          2737
fr          2730
ko          2194
ar          1936
it          1924
id          1566
tr          1155
ca          1067
ro          1033
vi          1027
nl           936
th           876
pl           874
tl           706
so           641
sv           549
et           500
af           479
no           478
da           421
hr           401
bg           394
cy           354
hu           343
fi           332
sw           330
sl           271
cs           244
el           224
he           214
zh-cn        188
uk           187
lt           184
sk           164
sq           158
zh-tw        143
mk           112
fa           107
bn            79
lv            78
hi            63
ur            30
ta            24
ne            15
ml            14
te            13
mr            11
pa             2
gu             2
kn             1
dtype: int64


In [8]:
# Keep only the rows whose description was detected as English; .copy()
# detaches the result from `video`.
videos_en = video[vids == 'en'].copy()

In [9]:
# Number of English-language videos and columns.
videos_en.shape

(44354, 22)

In [2]:
# The TSV cache read below was produced once by this (now disabled) writer:
# with open('videos_en.tsv', 'wb') as csvfile:
#     writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
#     count = 0
#     for index, row in videos_en.iterrows():
#         try:
#             writer.writerow([row["id"].encode('utf-8'), row["description"].encode('utf-8'), row["channelTitle"].encode('utf-8')])
#         except:
#             count += 1
#     print count

# Reload the cached English subset.  Columns: 0 = id, 1 = description,
# 2 = channelTitle.  ('rb' is the Python 2 convention for the csv module.)
with open("videos_en.tsv", 'rb') as csvfile:
    videos = [[rec[0], rec[1], rec[2]]
              for rec in csv.reader(csvfile, delimiter='\t')]

videos_en = pd.DataFrame(videos, columns=['id', 'description', 'channelTitle'])
print(videos_en.shape)
# Drop the reference to the raw row list so it can be garbage collected.
videos = 0
        

(43653, 3)


# Cluster

In [3]:
stemmer = PorterStemmer()

# Built once rather than on every call: tokenize() runs for every sentence and
# again inside CountVectorizer.
_PRINTABLE = frozenset(string.printable)
# URL remover.  The dot after "www" is escaped: unescaped it matched "www"
# followed by ANY character, so text such as "www hello" lost the word "hello".
_URL_PATTERN = re.compile(
    r'(http[s]?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]*|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))*')
_WORD_TOKENIZER = nltk.RegexpTokenizer(r'[a-zA-Z]*\'[a-zA-Z]*|\w+')
_STOPWORDS_CACHE = []


def _english_stopwords():
    """Return NLTK's English stop words as a set, loading the corpus on first use."""
    if not _STOPWORDS_CACHE:
        _STOPWORDS_CACHE.append(frozenset(nltk.corpus.stopwords.words('english')))
    return _STOPWORDS_CACHE[0]


def tokenize(line):
    """Turn one line of text into a list of stemmed, lower-case word tokens.

    Steps, in order:
      1. ``None`` is treated as the empty string.
      2. Characters outside ``string.printable`` are dropped.
      3. URLs (``http://``, ``https://`` or ``www.`` prefixes) are removed and
         the text is lower-cased.
      4. The text is split into word tokens (apostrophes inside words kept).
      5. Surrounding punctuation is stripped from each token; empty tokens,
         single-character tokens, English stop words and all-digit tokens are
         discarded.
      6. The remaining tokens are Porter-stemmed.

    Parameters
    ----------
    line : str or None

    Returns
    -------
    list of str
    """
    if line is None:
        line = ''
    line = ''.join(ch for ch in line if ch in _PRINTABLE)
    line = _URL_PATTERN.sub('', line).lower()

    stopwords = _english_stopwords()
    stripped = (tok.strip(string.punctuation) for tok in _WORD_TOKENIZER.tokenize(line))
    # After stripping punctuation a token can never start with '-', so the
    # former "negative number" test was unreachable; isdigit() alone suffices.
    kept = [tok for tok in stripped
            if len(tok) > 1 and tok not in stopwords and not tok.isdigit()]
    return [stemmer.stem(tok) for tok in kept]


In [4]:
# TODO: add a sentence number value here
# Accumulates one [id, channelTitle, description, sentence] record per kept
# sentence; turned into a DataFrame after the loop below.
rows = []

def description_parse(desc):
    """Split a description into sentences.

    The text is first broken into lines, then each line is passed to
    ``sent_tokenize``; the sentences are returned in reading order as one flat
    list.
    """
    return [sent
            for line in desc.splitlines()
            for sent in sent_tokenize(line)]

# Explode every description into one record per sentence, keeping only
# sentences that still contain at least one token after tokenize().
sentences = []
count = 0  # descriptions during which sentence splitting or tokenizing raised
for index, row in videos_en.iterrows():
    try:
        sentences = description_parse(row.description)
        for sent in sentences:
            if tokenize(sent):
                rows.append([row['id'], row['channelTitle'], row['description'], sent])
    except Exception:
        # Best effort: count the failure and move on.  `Exception` (not a bare
        # except) so the loop can still be interrupted from the keyboard.
        # Sentences appended before the failure are kept.
        count += 1

videos_en_new = pd.DataFrame(rows, columns=['id', 'channelTitle', 'description', 'sentence'])
videos_en = [] #clear this from memory (the name now refers to an empty list)
print(videos_en_new.shape)
print(count)

(230749, 4)
871


In [9]:
# Subset used to fit the vocabulary.  .loc slicing is label-based and includes
# the end label, so this keeps index labels 0 through 50000.
# NOTE(review): the recorded output (100001, 4) does not match a :50000 slice —
# it looks stale from an earlier run with a different bound.
videos_en_new1 = videos_en_new.loc[:50000].copy()
videos_en_new1.shape

(100001, 4)

In [10]:
# Learn the vocabulary from the subset only, using tokenize() as the tokenizer.
# binary=True records presence/absence of a term instead of its count.
# NOTE(review): terms that occur only outside the subset get no column.
countVec = CountVectorizer(tokenizer=tokenize, binary=True).fit(videos_en_new1['sentence'])

In [11]:
# Vectorize ALL sentences (not just the subset) with the subset-fitted vocabulary.
lineVec = countVec.transform(videos_en_new['sentence'])

In [None]:
# Density-based clustering of the sentence vectors; clusters need at least 5
# members and the minimum spanning tree is kept on the fitted object.
# NOTE(review): no metric is passed here, whereas the repr pasted in the next
# cell shows metric='cosine' — confirm which one was intended.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True).fit(lineVec)

In [None]:
# NOTE(review): this looks like the printed repr of a fitted HDBSCAN object
# pasted into a code cell.  Neither `HDBSCAN` nor `Memory` is imported under
# these names in the import cell (only `import hdbscan`), so running this cell
# would raise NameError.
HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True, 
        gen_min_span_tree=True, leaf_size=40, memory=Memory(cachedir=None),
        metric='cosine', min_cluster_size=5, min_samples=None, p=None)

In [None]:
# tfIdfMatrix.shape

In [None]:
# Pairwise cosine distances between all rows of lineVec (square, n x n).
# NOTE(review): lineVec has one row per sentence of videos_en_new (230749 in
# the recorded output), so the dense result would be very large — confirm this
# was run on a smaller frame.
dist = cosine_distances(lineVec)
#pass tfIdfMatrix instead of lineVec
#look on sklearn

In [None]:
# Convert the square distance matrix to the condensed (upper-triangle) vector
# form expected by the linkage routine; checks=False skips input validation.
distVec = squareform(dist, checks = False)
# Release the square matrix.
dist = 0

In [None]:
# Agglomerative clustering with Ward linkage on the condensed distances.
# preserve_input=False lets fastcluster overwrite distVec to save memory.
# NOTE(review): Ward linkage is defined for Euclidean distances; the input here
# is cosine distance — confirm this combination is intended.
res = fastcluster.linkage(distVec, method = 'ward', preserve_input = False)
# distVec is no longer valid after the call above; drop the reference.
distVec = 0

In [None]:
# Draw the full dendrogram of the linkage result on one wide figure.
fig, ax = plt.subplots(figsize=(35, 10))
ax.set_xlabel('Line Index')
ax.set_ylabel('Distance')
dendrogram(res, leaf_rotation=90., leaf_font_size=8., ax=ax)
plt.show()

In [None]:
# Cut the linkage tree at distance 5 and store the resulting flat cluster label
# of every sentence as a new column (modifies videos_en_new in place).
videos_en_new['ward_cosine_cluster'] = fcluster(res, 5, criterion='distance')

In [None]:
# Export one TSV line per sentence: video id, sentence, channel title, cluster.
# The rows come from videos_en_new: it is the frame that holds the 'sentence'
# and 'ward_cosine_cluster' columns, whereas videos_en was rebound to an empty
# list earlier and cannot be iterated with iterrows().
with open('videos_en_new.tsv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    count = 0  # rows skipped because a field could not be encoded or written
    for index, row in videos_en_new.iterrows():
        try:
            writer.writerow([row["id"].encode('utf-8'), row["sentence"].encode('utf-8'), row["channelTitle"].encode('utf-8'), row.ward_cosine_cluster])
        except Exception:
            count += 1
    print(count)

In [None]:
# Print every sentence that looks like a promotion, with its cluster label:
#   * contains the phrase "use code"  (counted in `count`), otherwise
#   * mentions "<N>% off" and/or a dollar amount "$<N>".
# A plain membership test replaces the former str.index()/bare-except control
# flow, which also hid any unrelated error raised while printing.
percent_off = re.compile(r"[0-9]?[0-9]% off")
dollar_amount = re.compile(r"\$[0-9]?[0-9]")

count = 0
for index, row in videos_en_new.iterrows():
    if "use code" in row.sentence:
        print(row.sentence)
        print(row.ward_cosine_cluster)
        print('')
        count += 1
    else:
        # The two checks are independent, so a sentence matching both patterns
        # is printed twice (unchanged from the original behaviour).
        if percent_off.search(row.sentence):
            print(row.sentence)
            print(row.ward_cosine_cluster)
        if dollar_amount.search(row.sentence):
            print(row.sentence)
            print(row.ward_cosine_cluster)

# Check the clusters

In [None]:
# this needs to be changed on a re-run
# Label(s) from the 'ward_cosine_cluster' column that the cells below inspect.
cluster_numbers = [17]

In [None]:
# for index, row in videos_en_new.iterrows():
#     if row.ward_cosine_cluster in [677]:
#         print row.sentence


In [None]:
# video id -> [channel title, sentences of that video in the selected clusters]
info = {}

for _, rec in videos_en_new.iterrows():
    if rec.ward_cosine_cluster not in cluster_numbers:
        continue
    entry = info.setdefault(rec.id, [rec.channelTitle, []])
    entry[1].append(rec.sentence)

# One group per video: channel title, then its sentences indented by a tab.
for vidId in info:
    channel_title, channel_sentences = info[vidId]
    try:
        print(channel_title)
        for item in channel_sentences:
            print("\t" + item)
        print('')
    except:
        # Printing is best effort; a group that fails to print is abandoned.
        pass

In [None]:
# Fit on ALL sentences and keep the fitted vectorizer bound to `countVec`.
# The following cells read the vocabulary from `countVec` and use the indices
# as columns of `countVec1`; fitting a throwaway vectorizer here (while
# `countVec` was fitted on a subset) made those indices point at the wrong
# columns.
countVec = CountVectorizer(tokenizer=tokenize, binary=True)
countVec1 = countVec.fit_transform(videos_en_new['sentence'])

In [None]:
# Vocabulary terms of the fitted vectorizer in column order; print the column
# position of the term "code".
arr = countVec.get_feature_names()
for position, term in enumerate(arr):
    if term == "code":
        print(position)

In [None]:
# Dense copy of the document-term matrix.
arr1 = countVec1.toarray()

# Look the column of the term "code" up from the vocabulary instead of
# hard-coding the number printed by a previous run (it changes whenever the
# vocabulary changes).
# NOTE(review): assumes `arr` lists the columns of countVec1 in order — confirm
# both come from the same fitted vectorizer.
code_column = arr.index("code")

# Row positions of every sentence that contains the term "code".
indices = [i for i in range(len(arr1)) if arr1[i][code_column] == 1]
for i in indices:
    print(i)

In [None]:
# Collect the term vectors (and their row labels) of the sentences that belong
# to the selected clusters and are longer than 20 characters.
index_distances = []
distances_indices = []
for index, row in videos_en_new.iterrows():
    if row.ward_cosine_cluster in cluster_numbers and len(row.sentence) > 20:
        # arr1 is positional; this relies on the frame's default 0..n-1 index.
        index_distances.append(arr1[index])
        distances_indices.append(index)

# Per-term mean over the collected vectors.  Vectorized instead of a nested
# Python loop, and an empty selection now yields an empty list rather than an
# IndexError on index_distances[0].
if index_distances:
    average = list(np.mean(index_distances, axis=0))
else:
    average = []

In [None]:
def get_array_from_sentence(sentence):
    """Encode a free-text sentence as a binary term vector.

    The sentence goes through the same ``tokenize`` function that built the
    vocabulary (lower-casing, stop-word removal, stemming), so inflected or
    capitalised words map onto their vocabulary entry.  A plain ``\\W+`` split
    never matched stemmed terms such as "turtl" or "pleas".

    Parameters
    ----------
    sentence : str

    Returns
    -------
    list of int
        As long as one row of ``arr1``; position ``arr.index(term)`` is 1 for
        every vocabulary term found in the sentence and 0 elsewhere.
    """
    ret_val = [0] * len(arr1[0])

    for term in set(tokenize(sentence)):
        try:
            ret_val[arr.index(term)] = 1
        except ValueError:
            # Term is not part of the vocabulary: it has no column.
            pass
    return ret_val

def get_total_distance(sentence):
    """Return the pairwise cosine distances of the cluster vectors plus the query.

    The query sentence is encoded with ``get_array_from_sentence`` and placed
    after the vectors in ``index_distances`` (which is not modified), so the
    last row/column of the result belongs to the query.
    """
    stacked = index_distances + [get_array_from_sentence(sentence)]
    return cosine_distances(stacked)

def get_closest(sentence):
    """Print the cluster sentence nearest (by cosine distance) to `sentence`.

    Output is one line: the smallest distance, a tab, then the matching
    sentence from videos_en_new.  Nothing is returned.
    """
    twod = get_total_distance(sentence)
    # Last row = distances from the query to every vector (query is appended last).
    vals = twod[len(twod)-1]
    # Sentinel larger than any cosine distance (which cannot exceed 2).
    least = 100
    least_index = -1
    # The final entry is the query's distance to itself, hence len(vals)-1.
    for i in range(len(vals)-1):
        # '>=' means that on ties the LAST sentence with the minimum wins.
        if least >= vals[i]:
            least = vals[i]
            least_index = i
    print least, 
    print "\t",
    # Map the position within the cluster selection back to the frame's row label.
    # NOTE(review): with an empty selection least_index stays -1 and the lookup
    # below fails on the empty list.
    loc = distances_indices[least_index]
    print videos_en_new.loc[loc].sentence

# Probe sentences: four promotion-style phrasings followed by two unrelated
# sentences (one of which also contains the word "code").
get_closest("use my code in the chat please")
get_closest("use code \"turtle\" for 25% off at")
get_closest("get 20% using code \"turle\"")
get_closest("save $20 using code \"turle\"")
get_closest("I like turtles and potatoes")
get_closest("I like to code during my free time")

# Word Vectors

In [None]:
# Training sentences for the embedding models: one list of lower-cased word
# tokens per sentence of the selected clusters.  Any token containing a number
# (optionally with a leading '$' or trailing '%') is replaced by the
# placeholder "15" so that all amounts share one vocabulary entry.
number_token = re.compile(r"\$?[0-9]?[0-9]%?")

data = []

for index, row in videos_en_new.iterrows():
    if row.ward_cosine_cluster in cluster_numbers:
        # The pattern is tested against each WORD.  Testing it against the
        # whole sentence turned every token of any sentence containing a digit
        # into "15".
        sentence_array = ["15" if number_token.search(word) else word.lower()
                          for word in word_tokenize(row.sentence)]
        data.append(sentence_array)

# data.append()

In [None]:
# Number of training sentences collected from the selected clusters.
len(data)

In [None]:
# Train word embeddings on the cluster sentences: 100-dimensional vectors,
# context window of 5 tokens, min_count=1 keeps every token.
# NOTE(review): `size=` is the keyword used by gensim releases before 4.0;
# no seed is passed, so results may differ between runs.
model = gensim.models.Word2Vec(data, min_count = 1, size = 100, window = 5)

In [None]:
# Probe sentences (same idea as the get_closest() probes above).
temp = ["use my code in the chat please", "use code \"turtle\" for 25% off at", "get 20% off using code \"turle\"",
 "save $20 using code \"turle\"", "I like turtles and potatoes", "I like to code during my free time"]

# For every probe sentence print the mean absolute Word2Vec similarity between
# each pair of adjacent tokens, followed by the sentence itself.
for sentence in temp:
    avg = 0
    count = 0
    words = word_tokenize(sentence)
    for i in range(len(words)-1):
        try:
            a = words[i]
            b = words[i+1]
            # Tokens containing a number are mapped to the placeholder "15".
            if re.findall("\$?[0-9]?[0-9]%?", a):
                a = "15"
            if re.findall("\$?[0-9]?[0-9]%?", b):
                b = "15"
            # NOTE(review): tokens are not lower-cased here — confirm this is
            # intended.
            val = math.fabs(model.similarity(a, b))
            #print val
            avg += val
            count += 1
        except:
            # Pairs for which the similarity lookup raises are skipped
            # (presumably tokens unknown to the model — verify).
            pass
    if count > 0:
        print avg/count,
    else:
        # No pair could be scored: print a tab in place of the score.
        print "\t",
    print "\t",
    print sentence

In [None]:
# Wrap each training sentence as a TaggedDocument tagged with its position in `data`.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(data)]

In [None]:
# Train document embeddings: 5-dimensional vectors, context window of 2,
# min_count=1 keeps every token, 4 worker threads.
# NOTE(review): no seed is passed, so results may differ between runs.
model_doc = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [None]:
def check_sent(sent):
    """Print the training document most similar to `sent` under the Doc2Vec model.

    The sentence is tokenized the same way as the training data (word_tokenize,
    lower-casing, number-bearing tokens replaced by "15").  Splitting on single
    spaces left case and attached punctuation in place, so most tokens were
    unknown to the model.

    Parameters
    ----------
    sent : str

    Prints a one-element list of ``(document tag, similarity)`` and returns
    nothing.
    """
    new_sentence = ["15" if re.search(r"\$?[0-9]?[0-9]%?", word) else word.lower()
                    for word in word_tokenize(sent)]
    c = model_doc.infer_vector(new_sentence)
    print(model_doc.docvecs.most_similar(positive=[c], topn=1))
    
# Run the Doc2Vec nearest-document lookup for every probe sentence in `temp`.
for sent in temp:
    check_sent(sent)