In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import isodate
import urltools
import urllib
from scipy.stats import mannwhitneyu
import re
import os.path
from langdetect import detect
import string
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import fastcluster
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from scipy.spatial.distance import squareform
import sys
from scipy.cluster.hierarchy import fcluster, dendrogram
from nltk.stem.porter import PorterStemmer
import warnings
warnings.filterwarnings('ignore')
sys.setrecursionlimit(100000)
matplotlib.style.use('ggplot')

## YouTube - Exploratory Analyses
### Loading the dataset
First, let us load the dataset from the database.

In [2]:
# Open the SQLite database file that holds the collected data.
# NOTE(review): `con` is never closed in the cells visible here — confirm
# whether a later cell closes it.
con = sqlite3.connect('./youtube.db')
# One row per video, with the owning channel's fields added under channel*
# names. The LEFT JOIN keeps videos that have no matching channel row (their
# channel* columns are then NULL).
video = pd.read_sql_query('''SELECT v.autoId as autoId, 
                                    v.id as id,
                                    v.categoryId as categoryId,
                                    v.channelId as channelId,
                                    v.publishedAt as publishedAt,
                                    v.title as title,
                                    v.description as description,
                                    v.viewCount as viewCount,
                                    v.likeCount as likeCount,
                                    v.dislikeCount as dislikeCount,
                                    v.favoriteCount as favoriteCount,
                                    v.commentCount as commentCount,
                                    v.duration as duration,
                                    v.defaultLanguage as defaultLanguage,
                                    c.title as channelTitle,
                                    c.description as channelDescription,
                                    c.publishedAt as channelPublishedAt,
                                    c.viewCount as channelViewCount,
                                    c.commentCount as channelCommentCount,
                                    c.subscriberCount as channelSubscriberCount,
                                    c.videoCount as channelVideoCount,
                                    c.country as channelCountry
                                    from video v left join channel c on v.channelId = c.id''', con)
# The remaining tables are loaded whole. Note that the `url` frame comes from
# the `urlNumber` table, not from a table called `url`.
url = pd.read_sql_query('SELECT * from urlNumber', con)
urlResolve = pd.read_sql_query('SELECT * from urlResolve', con)
category = pd.read_sql_query('SELECT * from category', con)
channel = pd.read_sql_query('SELECT * from channel', con)
captions = pd.read_sql_query('SELECT * FROM captions', con)

Next, let's examine the size and columns of the tables we loaded. The tables inspected below are ``video``, ``url``, ``urlResolve``, and ``category`` (``channel`` and ``captions`` were loaded as well).

In [3]:
video.shape

(515999, 22)

In [4]:
list(video.columns.values)

['autoId',
 'id',
 'categoryId',
 'channelId',
 'publishedAt',
 'title',
 'description',
 'viewCount',
 'likeCount',
 'dislikeCount',
 'favoriteCount',
 'commentCount',
 'duration',
 'defaultLanguage',
 'channelTitle',
 'channelDescription',
 'channelPublishedAt',
 'channelViewCount',
 'channelCommentCount',
 'channelSubscriberCount',
 'channelVideoCount',
 'channelCountry']

In [5]:
url.shape

(405471, 4)

In [6]:
list(url.columns.values)

['autoId', 'videoId', 'url', 'lineNumber']

In [7]:
urlResolve.shape

(660009, 5)

In [8]:
list(urlResolve.columns.values)

['autoId', 'urlId', 'url', 'urlOrder', 'code']

In [9]:
category.shape

(32, 3)

In [10]:
list(category.columns.values)

['autoId', 'id', 'key']

### Examining the videos
Next, let's convert the video durations to seconds and examine their distribution.

In [11]:
# Parse each stored duration with isodate and replace it by its length in
# seconds (presumably the stored values are ISO 8601 strings such as
# 'PT4M13S' — TODO confirm against the database).
# NOTE(review): the column is overwritten in place, so this cell is not
# idempotent: on a second run parse_duration would receive the numbers
# produced by the first run. Reload `video` before re-running.
video['duration'] = video['duration'].map(lambda x: isodate.parse_duration(x).total_seconds())
video['duration'].describe()

count    515999.000000
mean        445.870275
std        1201.335379
min           0.000000
25%          61.000000
50%         172.000000
75%         366.000000
max       86400.000000
Name: duration, dtype: float64

### Manipulating the URLs

Define a function that dissects a URL into its domain, path, subdomain, and query string.

In [12]:
def manipulate_urls(x):
    """Split a row's URL into lower-cased components.

    Parameters
    ----------
    x : object with a ``url`` attribute (a row when used with
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    pandas.Series
        Four values, in this order: domain, path, subdomain, query string.
        Path and query are URL-unquoted before lower-casing. A leading
        ``'www.'`` is removed from the subdomain; a subdomain that is exactly
        ``'www'`` is left as is.
    """
    parts = urltools.parse(x.url)

    www_prefix = 'www.'
    subdomain = parts.subdomain.lower()
    if subdomain.startswith(www_prefix):
        subdomain = subdomain[len(www_prefix):]

    domain = parts.domain.lower()
    path = urllib.unquote(parts.path).lower()
    query = urllib.unquote(parts.query).lower()

    return pd.Series([domain, path, subdomain, query])

In [13]:
list(urlResolve.columns.values)

['autoId', 'urlId', 'url', 'urlOrder', 'code']

In [14]:
# Add one column per URL component. The column order here must match the order
# of the Series returned by manipulate_urls: domain, path, subdomain, query.
urlResolve[['urlDomain', 'urlPath', 'urlSubDomain', 'urlParams']] = urlResolve.apply(manipulate_urls, axis=1)

Mark all the following patterns as affiliate links.

In [25]:
# --- Affiliate-link rule tables ----------------------------------------------
# Built once instead of on every row. Each domain appears in exactly one table.

# Domains for which every link is treated as an affiliate link, regardless of
# path or query string.
_AFFILIATE_ANY_LINK_DOMAINS = frozenset([
    'rstyle', 'shopstyle', 'emjcd', 'dotomi', 'qksrv',
])

# domain -> query-parameter names that must ALL be present.
_AFFILIATE_QUERY_PARAMS = {
    'affiliaxe': ('aff_id',),
    'aliexpress': ('af',),
    # 'amazon': ('tag',),    # rule commented out in the original list
    'apessay': ('rid',),
    # 'apple': ('at',),      # rule commented out in the original list
    'audiojungle': ('ref',),
    'awin1': ('awinaffid',),
    'banggood': ('p',),
    'bookdepository': ('a_aid',),
    'booking': ('aid',),
    'ebay': ('campid',),
    'envato': ('ref',),
    'gtomegaracing': ('tracking',),
    'hotellook': ('marker',),
    'hotmart': ('a',),
    'kontrolfreek': ('a_aid',),
    'makeupgeek': ('acc',),
    'olymptrade': ('affiliate_id',),
    'videohive': ('ref',),
    'avantlink': ('pw',),
    'avangate': ('affiliate',),   # lower-case form of the AFFILIATE parameter
    'redirectingat': ('id',),
    'audiobooks': ('a_aid', 'a_bid'),
    'zaful': ('lkid',),
    'codecanyon': ('ref',),
    'graphicriver': ('ref',),
    'themeforest': ('ref',),
    'flipkart': ('affid',),
}

# domain -> paths that must match exactly.
_AFFILIATE_EXACT_PATHS = {
    'zanox': ('/ppc',),
    'zenaps': ('/rclick.php',),
    'shareasale': ('/r.cfm', '/m-pr.cfm', '/u.cfm'),
}

_CLICK_PATH = re.compile(r'/click-[0-9]+-[0-9]+')
_IMPACT_RADIUS_PATH = re.compile(r'/c/[0-9]+/[0-9]+/[0-9]+')
_PEPPERJAM_PATH = re.compile(r'/t/[0-9]-[0-9]+-[0-9]+-[0-9]+')

# domain -> patterns; the link is affiliate if ANY pattern is found in the path.
_AFFILIATE_PATH_PATTERNS = {
    'buyeasy': (re.compile(r'/cashback.*'), re.compile(r'/redirect.*')),
    'admitad': (re.compile(r'/g/.*'), re.compile(r'/goto/.*')),
    '7eer': (_IMPACT_RADIUS_PATH,),
    'evyy': (_IMPACT_RADIUS_PATH,),
    'ojrq': (_IMPACT_RADIUS_PATH,),
    'anrdoezrs': (_CLICK_PATH,),
    'dpbolvw': (_CLICK_PATH,),
    'kqzyfj': (_CLICK_PATH,),
    'jdoqocy': (_CLICK_PATH,),
    'tkqlhce': (_CLICK_PATH,),
    'pntra': (_PEPPERJAM_PATH,),
    'gopjn': (_PEPPERJAM_PATH,),
    'pjtra': (_PEPPERJAM_PATH,),
    'pjatr': (_PEPPERJAM_PATH,),
    'pntrs': (_PEPPERJAM_PATH,),
    'pntrac': (_PEPPERJAM_PATH,),
}

# The dot is escaped so only a literal '.hop' label matches. The previous
# pattern '.*.hop' let '.' match any character, so a subdomain such as 'shop'
# was flagged as an affiliate link.
_CLICKBANK_SUBDOMAIN = re.compile(r'\.hop')


def check_affiliate_link(x):
    """Return True if the resolved URL described by ``x`` matches a known
    affiliate-link pattern, False otherwise.

    Parameters
    ----------
    x : object with string attributes ``urlDomain``, ``urlPath``,
        ``urlSubDomain`` and ``urlParams`` (a row when used with
        ``DataFrame.apply(..., axis=1)``). ``urlParams`` is the raw query
        string, e.g. ``'a=1&b=2'``.

    Returns
    -------
    bool
    """
    domain = x.urlDomain
    # A set rather than map(): map() is a one-shot iterator on Python 3, which
    # made rules testing two parameter names depend on their order in the URL.
    param_names = set(pair.split('=')[0] for pair in x.urlParams.split('&'))

    if domain in _AFFILIATE_ANY_LINK_DOMAINS:
        return True

    required_params = _AFFILIATE_QUERY_PARAMS.get(domain)
    if required_params is not None:
        return all(name in param_names for name in required_params)

    if domain == 'linksynergy':
        # Only links on the 'click' subdomain that carry an id count.
        return x.urlSubDomain == 'click' and 'id' in param_names

    exact_paths = _AFFILIATE_EXACT_PATHS.get(domain)
    if exact_paths is not None:
        return x.urlPath in exact_paths

    path_patterns = _AFFILIATE_PATH_PATTERNS.get(domain)
    if path_patterns is not None:
        return any(pattern.search(x.urlPath) for pattern in path_patterns)

    if domain == 'clickbank':
        return _CLICKBANK_SUBDOMAIN.search(x.urlSubDomain) is not None

    return False

# Evaluate the affiliate rules once per resolved-URL row and store the
# True/False result in a new 'affiliateLink' column.
urlResolve['affiliateLink'] = urlResolve.apply(check_affiliate_link, axis=1)

How many of these resolved links were affiliate in nature?

In [26]:
urlResolve['affiliateLink'].value_counts()

False    651875
True       8134
Name: affiliateLink, dtype: int64

Which of the original URLs resolved to at least one affiliate link?

In [27]:
# Keep the original URLs whose autoId appears as the urlId of at least one
# resolved link that was flagged as affiliate.
affiliate_url_ids = urlResolve.loc[urlResolve['affiliateLink'], 'urlId'].tolist()
affiliate_urls = url[url['autoId'].isin(affiliate_url_ids)]

How many URLs were retained?

In [28]:
affiliate_urls.shape

(4503, 4)

How many videos do these URLs correspond to?

In [29]:
# Videos referenced by at least one affiliate URL; copied so that later
# changes to affiliate_videos do not touch `video`.
has_affiliate_url = video['id'].isin(affiliate_urls['videoId'].tolist())
affiliate_videos = video.loc[has_affiliate_url].copy()
affiliate_videos.shape

(1717, 22)

#### Affiliate video duration
How does the duration of the affiliate videos vary?

In [30]:
affiliate_videos['duration'].describe()

count     1717.000000
mean       335.694234
std        654.663628
min          0.000000
25%         75.000000
50%        146.000000
75%        379.000000
max      11718.000000
Name: duration, dtype: float64

In [31]:
def get_language(x):
    """Best-effort language detection for a row's description.

    Parameters
    ----------
    x : object with a ``description`` attribute (a row when used with
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The value returned by ``detect`` for the whitespace-stripped description,
    or the string ``'Unknown'`` if detection raises an error (for example a
    missing description, which has no ``strip`` method).
    """
    try:
        return detect(x.description.strip())
    except Exception:
        # Deliberately broad: any detection failure maps to 'Unknown'.
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # through, so a long-running apply can still be interrupted.
        return 'Unknown'

In [33]:
# `random` is not used by this cell; the import is kept in case later cells
# rely on it.
import random

# Walk the affiliate videos in order and print the watch URL plus the first
# 200 characters of the description for every video that is at most 300
# seconds long, has a description of at most 5000 characters and has at least
# 5000 views. The number of printed videos is shown at the end.
count = 0
for i in range(affiliate_videos.shape[0]):
    vid = affiliate_videos.iloc[i]
    skip = vid.duration > 300 or len(vid.description) > 5000 or vid.viewCount < 5000
    if skip:
        continue
    # Single-argument print(...) behaves the same as the print statement.
    print("https://www.youtube.com/watch?v=" + str(vid.id))
    print(vid.description[0:200])
    print("_______________________________________________________________________________")
    count += 1
print(count)

https://www.youtube.com/watch?v=C4bI-liLXws
👉Покупал Спиннер тут: http://ali.pub/1kcdmo
 💲 КЭШБЭК для Алиэкспресс: http://bit.ly/2hFzVY9
✅ Моя партнерка на YouTube: https://goo.gl/5HrRwB
☑️ Я ВКонтакте: https://vk.com/alexboykochannel
-------
_______________________________________________________________________________
https://www.youtube.com/watch?v=5btF-Re4K38
How to replace front brake pads on a Suzuki Ltz400
Front Brake Pads Replacement on a Suzuki LTZ 400 LTZ400
This is a tutorial video that shows you how to replace front brake pads on a Suzuki LTZ 400 a
_______________________________________________________________________________
https://www.youtube.com/watch?v=m_MUY-StJk0
Battlefield 4 (BF4) M16A4 loadout and military weapon setup to represent the USMC. Let's take a look at the  attachments and accessories used in real life to role play as this military force.
________
_______________________________________________________________________________
https://www.youtube.com/wat

https://www.youtube.com/watch?v=Gdu9-91n2m0
►India To Europe Trip 2016 All Videos In One Playlist - https://goo.gl/TtFt0Z

1. Oman Air - Bengaluru To Muscat - https://youtu.be/3o0GQArEjYQ
2. Oman Air - Muscat To London - https://youtu.be/KTsJCm
_______________________________________________________________________________
https://www.youtube.com/watch?v=uF7N-hRzrOU
sony xb50ap sound quality test Part -2 - https://www.youtube.com/watch?v=9kZbJhJAuGo

A Short and Quick video for sony xb 450 ap ( sony mdr xb450ap ) model headphones with mic from flipkart and also s
_______________________________________________________________________________
https://www.youtube.com/watch?v=zpkL-hgTVPk
Subscribe Now & Add Us As A Friend
http://www.MoTv1.com
http://FreeMaxoderm.com
http://SmokinOrJokin.com

All videos received that are broad casted through the MoTV1 broadcasting arms ( internet,
_______________________________________________________________________________
https://www.youtube.com/watch?

73
