# Offline TV Project

*Author: Andrew Pan*

**Reading in OfflineTV Channel Video Statistics**

Source: [SocialBlade OfflineTV](https://socialblade.com/youtube/channel/UCDK9qD5DAQML-pzrtA7A4oA/videos)



In [None]:
#imports
import csv
import datetime
import html
import re
import urllib

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import nltk
import numpy as np
import pandas as pd
from pandas.plotting import register_matplotlib_converters
import requests

In [None]:
#function to get statistics of the video
def get_stats(url):
    """Download a video page and scrape its statistics from the raw text.

    Every value is cut out of the page text with fixed markers and
    character offsets, so the result depends on the exact layout of the
    page that is served.

    Parameters
    ----------
    url : str
        Address of the video page to download.

    Returns
    -------
    list
        ``[title, thumbnail, tags, upload date, description, view count,
        duration, likes, dislikes]``. Every element is a string; the tags
        are joined with commas so the row can be written to a csv file and
        split on ``,`` again later.

    Raises
    ------
    AttributeError
        If the ``og:title`` or ``og:image`` meta tag is not in the page.
    ValueError
        If one of the text markers is not in the page.
    """
    def between(text, start_marker, stop_marker):
        #text from start_marker up to the first stop_marker that follows it;
        #searching from the start position keeps an earlier, unrelated
        #occurrence of the stop marker from producing an empty slice
        start = text.index(start_marker)
        stop = text.index(stop_marker, start)
        return text[start:stop]

    #download the page and decode its text once
    text = requests.get(url).text

    #get title
    #the capture runs to the end of the line; the last two characters are
    #dropped (presumably the closing '">' of the meta tag)
    title = re.search('property="og:title" content="([^\n]*)', text).group(1)[:-2]
    #get thumbnail
    image = re.search('property="og:image" content="([^\n]*)', text).group(1)[:-2]
    #get tags, joined into one comma-separated string
    tags = ",".join(
        found[:-2]
        for found in re.findall('property="og:video:tag" content="([^\n]*)', text)
    )

    #the offsets below trim the marker name and the surrounding punctuation
    #get upload date
    date = between(text, 'uploadDate', 'trackingParams')[15:-7]
    #get description
    desc = between(text, 'shortDescription', 'isCrawlable')[20:-5]
    #get view count
    views = between(text, 'viewCount', 'author')[14:-5]
    #get length of video
    duration = between(text, 'lengthSeconds', 'keywords')[18:-5]
    #get number of likes
    like = between(text, 'iconType":"LIKE"', 'likes')[80:]
    #get number of dislikes (stop marker searched after the DISLIKE icon)
    dislike = between(text, 'iconType":"DISLIKE"', 'dislikes')[83:]

    return [title, image, tags, date, desc, views, duration, like, dislike]


In [None]:
#watch-page addresses of the videos to scrape, one per line
url_list = [
    "https://www.youtube.com/watch?v=anFz_6oRwI8&t=77s",
    "https://www.youtube.com/watch?v=WdtaiyxMsbs",
    "https://www.youtube.com/watch?v=kET40pMzaYE",
    "https://www.youtube.com/watch?v=_P6YOzYhlmA",
    "https://www.youtube.com/watch?v=mJ76Ej4KsX0",
    "https://www.youtube.com/watch?v=4oyVsqIbMgc",
    "https://www.youtube.com/watch?v=o0nD40qMHHw",
    "https://www.youtube.com/watch?v=UQfNiiJgmno",
    "https://www.youtube.com/watch?v=qz64WP9Wb94",
    "https://www.youtube.com/watch?v=bFVF4n-JAFA",
    "https://www.youtube.com/watch?v=5EaE3kN7nUY",
    "https://www.youtube.com/watch?v=Ap5cnkclUsA",
    "https://www.youtube.com/watch?v=AuZJlroSSHY",
    "https://www.youtube.com/watch?v=P52mr4xRTh8",
    "https://www.youtube.com/watch?v=llAnpFV2W4g",
    "https://www.youtube.com/watch?v=Efe7oLo1sKc",
    "https://www.youtube.com/watch?v=cZgMiuIEA8E",
    "https://www.youtube.com/watch?v=1buDltZ6yzU",
    "https://www.youtube.com/watch?v=pfeKQMlFWNo",
    "https://www.youtube.com/watch?v=ft6W28g2KpE",
    "https://www.youtube.com/watch?v=4ScktvmY4GQ",
    "https://www.youtube.com/watch?v=GSgnFUPZOrI",
    "https://www.youtube.com/watch?v=nD40bUeU8OM",
    "https://www.youtube.com/watch?v=GfFVJyDVZuQ",
    "https://www.youtube.com/watch?v=4ZmrbPVqpNo",
    "https://www.youtube.com/watch?v=63oUHCoADLg",
    "https://www.youtube.com/watch?v=hVgcacw9hTE",
    "https://www.youtube.com/watch?v=DoS9Wjp1D9M",
    "https://www.youtube.com/watch?v=Ws6beaoG1mw",
    "https://www.youtube.com/watch?v=yZKMh2OejdY",
    "https://www.youtube.com/watch?v=99lyzSWePSw",
    "https://www.youtube.com/watch?v=1bOUY9KWwbc",
]

#scrape each page in turn; one list of statistics per video
data = list(map(get_stats, url_list))
    

In [None]:
#column names go in front of the scraped rows
header = ["title", "thumbnail", "tags", "upload-date", "description", "view-count", "duration", "likes", "dislikes"]
data.insert(0, header)

#write the header and every video row to the csv file
with open('otv_data.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv.writer(csv_file).writerows(data)
    


In [None]:
#load the scraped statistics back from the csv file into a dataframe
csv_path = 'otv_data.csv'
otv_df = pd.read_csv(csv_path)

## Does View Count Grow as Time Goes On?

### According to this plot, the number of views is seemingly unrelated to how long the channel has existed.
![Plot](graphs/views_over_time.png)

**Using linear regression to predict the number of views on the next video would not be effective.**

*run the cells below to get the plot*

In [None]:
#register the pandas date converters with matplotlib
register_matplotlib_converters()

#parse the upload dates into datetime values
otv_df['upload-date'] = pd.to_datetime(otv_df['upload-date'])

#scatter plot of the view count against the upload date
fig, ax = plt.subplots(figsize=(18, 10))
ax.scatter(otv_df['upload-date'], otv_df['view-count'])
fig.suptitle('Number of Views by Date', fontsize=26)
ax.set_xlabel('Date', fontsize=16)
ax.set_ylabel('Number of Views', fontsize=16)
plt.show()


## Does View Count vary with the Duration of the Video?

### According to this plot...
![Plot](graphs/views_for_seconds.png)

**No.**  
What's next? Likes, dislikes, comments, etc.  
  
*run the cells below for the plot*

In [None]:
#scatter plot of the view count against the video duration
fig, ax = plt.subplots(figsize=(18, 10))
ax.scatter(otv_df['duration'], otv_df['view-count'])
fig.suptitle('Number of Views in relation to Duration', fontsize=26)
ax.set_xlabel('Duration(seconds)', fontsize=16)
ax.set_ylabel('Number of Views', fontsize=16)
plt.show()


In [None]:
#the scraped like/dislike counts use commas as thousands separators;
#remove them so the values can be converted to numbers afterwards
for column in ('likes', 'dislikes'):
    otv_df[column] = otv_df[column].map(lambda count: count.replace(",", ""))

## Here are some statistics about the ratings.
### According to plots regarding number of likes and dislikes by video...
**There isn't any relationship between number of likes, dislikes, or total ratings and date uploaded.**  
However, there appears to be a slight positive relationship between the percentage of viewers who rate a video and the date it was uploaded.  
![Plot](graphs/rate_ratio_over_time.png)    
Equation of the Line: $y = (5.042 \times 10^{-7})x^2 - 0.7436x + (2.742 \times 10^{5})$, where $x$ is the upload date expressed as an ordinal day number  
Here are some general statistics about likes/dislikes
- Average Like Percentage: {{like_avg}}%
- Average Dislike Percentage: {{dislike_avg}}%  
  
*run the cells below for the plots and statistics* 

In [None]:
#converting the like/dislike counts from strings to numbers
otv_df['likes'] = pd.to_numeric(otv_df['likes'])
otv_df['dislikes'] = pd.to_numeric(otv_df['dislikes'])
#new columns
#share of ratings that are likes / dislikes
otv_df['like-ratio'] = otv_df['likes'] / (otv_df['likes'] + otv_df['dislikes'])
otv_df['dislike-ratio'] = 1 - otv_df['like-ratio']
#total number of ratings, and the share of views that left a rating
otv_df['ratings'] = otv_df['likes'] + otv_df['dislikes']
otv_df['rate-ratio'] = otv_df['ratings'] / otv_df['view-count']
#the scraped titles are still HTML-escaped; decode every character
#reference (&#39;, &amp;, &quot;, ...) instead of only two of them
otv_df['title'] = otv_df['title'].map(html.unescape)


In [None]:
#scatter plot of the number of likes against the upload date
fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(otv_df['upload-date'], otv_df['likes'])
fig.suptitle('Likes by Date', fontsize=26)
ax.set_xlabel('Date', fontsize=16)
ax.set_ylabel('Likes', fontsize=16)
plt.show()

In [None]:
#scatter plot of the number of dislikes against the upload date
fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(otv_df['upload-date'], otv_df['dislikes'])
fig.suptitle('Dislikes by Date', fontsize=26)
ax.set_xlabel('Date', fontsize=16)
ax.set_ylabel('Dislikes', fontsize=16)
plt.show()


In [None]:
#scatter plot of the total number of ratings against the upload date
fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(otv_df['upload-date'], otv_df['ratings'])
fig.suptitle('Ratings by Date', fontsize=26)
ax.set_xlabel('Date', fontsize=16)
ax.set_ylabel('Ratings', fontsize=16)
plt.show()

In [None]:
#perform polynomial fit on the datapoints
#the dates are turned into ordinal day numbers so numpy can fit against them
otv_df['upload-date-num'] = otv_df['upload-date'].map(datetime.datetime.toordinal)
trend = np.polyfit(otv_df['upload-date-num'].to_numpy(), otv_df['rate-ratio'].to_numpy(), 2)
trendpoly = np.poly1d(trend)
#evaluate the fit in chronological order so the curve is drawn left to right
#instead of jumping back and forth in row order
by_date = otv_df.sort_values('upload-date')
#plot share of viewers that rate with relation to time
fig = plt.figure(figsize=(12,8))
plt.scatter(otv_df['upload-date'], otv_df['rate-ratio'])
plt.suptitle('Percentage of Viewers that Rate by Date', fontsize=26)
plt.xlabel('Date', fontsize=16)
plt.ylabel('Percentage of Ratings', fontsize=16)
#draw the trend against the same datetime x-values as the scatter points;
#raw ordinals only line up with the date axis when matplotlib's date epoch
#happens to match the ordinal origin (NOTE(review): not the default in
#recent matplotlib versions - confirm against the installed version)
plt.plot(by_date['upload-date'], trendpoly(by_date['upload-date-num'].to_numpy()))
#save before show() so the file is written while the figure is certainly alive
fig.savefig('graphs/rate_ratio_over_time.png')
plt.show()


In [None]:
#mean like and dislike share over all videos, as percentage strings
#rounded to two decimals
like_avg = str(round(otv_df['like-ratio'].mean() * 100, 2))
dislike_avg = str(round(otv_df['dislike-ratio'].mean() * 100, 2))

## Lexical Statistics in Titles and Tags  

*NLTK: Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O’Reilly Media Inc.*

**Most Common Words in Title:**  
{{printDict(common_title)}}

**Most Common Words in Tags:**  
{{printDict(common_tag)}}  

**Lexical Diversity**  
*Percent of Unique Words in Title:* **{{unique_title}}%**  
*Percent of Unique Words in Tags:* **{{unique_tag}}%**

In [None]:
#every whitespace-separated word of every title, in order
titles = [word for title in otv_df['title'] for word in title.split()]
#frequency distribution of the title words and its ten most common entries
fddist_titles = nltk.FreqDist(titles)
common_title = fddist_titles.most_common(10)

#every comma-separated entry of every video's tags, in order
tags = [word for tag in otv_df['tags'] for word in tag.split(',')]
#frequency distribution of the tags and its twenty most common entries
fddist_tags = nltk.FreqDist(tags)
common_tag = fddist_tags.most_common(20)

In [None]:
#function to print out a dictionary 
def printDict(dictionary):
    """Print every key/value pair as one markdown-formatted line.

    Each line has the form ``*key* : **value**`` followed by two spaces.

    Parameters
    ----------
    dictionary : mapping or iterable of (key, value) pairs
        A real mapping (anything with ``.items()``) or a sequence of
        pairs such as the result of ``most_common``.

    Returns
    -------
    None
        The lines are written to standard output.
    """
    #a mapping iterates over its keys only, so take its items explicitly
    pairs = dictionary.items() if hasattr(dictionary, "items") else dictionary
    for key, value in pairs:
        #format() also accepts keys that are not strings
        print("*{}* : **{}**  ".format(key, value))


In [None]:
#unique title words: parentheses removed, then only purely alphabetic
#words are kept, lower-cased
words_titles = {
    cleaned.lower()
    for cleaned in (raw.replace('(', '').replace(')', '') for raw in set(titles))
    if cleaned.isalpha()
}
#unique tags: only purely alphabetic entries are kept, lower-cased
words_tags = {entry.lower() for entry in set(tags) if entry.isalpha()}

In [None]:
#lexical diversity: unique words as a percentage of all words,
#rounded to two decimals
#of title
unique_title = round(len(words_titles) / len(titles) * 100, 2)
#of tags
unique_tag = round(len(words_tags) / len(tags) * 100, 2)