# YouTube titles analysis 🎥
The purpose of this project is to analyze the video titles of popular YouTubers and to find patterns that can serve as guidelines for writing good titles. The dataset contains around 3K titles scraped from around 20 successful YouTubers, each with at least 120K subscribers. Most of the channels publish commentary, tech, or programming content.

## Install and import necessary libraries

### Seaborn 
Seaborn is a library for making statistical graphics in Python. It builds on top of matplotlib and integrates closely with pandas data structures.

### Pandas
pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.

### Matplotlib
Matplotlib is a comprehensive library for creating static, animated, and interactive visualizations in Python. Matplotlib makes easy things easy and hard things possible.

### nltk 
NLTK, or the Natural Language Toolkit, is a popular Python library used in natural language processing (NLP). It's a powerful tool for human language data analysis, providing easy-to-use interfaces and a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning.

In [None]:
%pip install seaborn pandas matplotlib nltk

In [None]:
import sys
# Print the path of the Python interpreter that is executing this code
print(sys.executable)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.ticker import FuncFormatter
from collections import Counter 
import nltk
import string 

In [None]:
# Default colour map for the notebook: seaborn's "flare" palette,
# requested as a colormap object (as_cmap=True) rather than a list of colours
palette = sns.color_palette(palette="flare", as_cmap=True)

In [None]:
# Read the titles dataset from the CSV file (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer="data_3.csv")


In [None]:
# Relationship between title length and views:
# one point per row of `data`; colour and marker size both follow word_count

plt.figure(figsize=(18, 10))
ax = sns.scatterplot(
    x="title_length",
    y="views",
    hue="word_count",
    size="word_count",
    data=data,
)
# Tick formatter: render a tick value as a plain integer string
def regular_formatter(x, pos):
    """Return the tick value *x* truncated to an integer, as a string.

    *pos* (the tick position) is part of the ``FuncFormatter`` callback
    signature and is not used.
    """
    return str(int(x))

# Show plain integer tick labels on the y-axis by routing them through regular_formatter
ax.yaxis.set_major_formatter(FuncFormatter(regular_formatter))

# Hide the top and right borders of the plot
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

# Title and axis labels (title typo "Totals Views" corrected to "Total Views")
plt.title("Total Views by Number of Characters Used in Title", fontsize=18)
plt.xlabel("Character Count", fontsize=10)
plt.ylabel("View Count", fontsize=10)

In [None]:
# Relationship between the Flesch reading ease score and views:
# one point per row of `data`; colour and marker size both follow stopword_count

cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)
plt.figure(figsize=(20,10))
ax = sns.scatterplot(
    data=data,
    x="flesch_reading_ease",
    y="views",
    hue="stopword_count",
    size="stopword_count",
    palette=cmap,
    sizes=(10, 200),  # marker size range passed to seaborn
)

# Show plain integer tick labels on the y-axis by routing them through regular_formatter
ax.yaxis.set_major_formatter(FuncFormatter(regular_formatter))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.title("Total Views by Flesch Reading Ease Score", fontsize=18)
# Label typo fixed ("Reasing" -> "Reading")
plt.xlabel("Flesch Reading Ease Score", fontsize=10)
# The y values are the raw per-row views (no aggregation happens in this cell),
# so the axis is labelled "View Count" like the other scatter plots
plt.ylabel("View Count", fontsize=10)

In [None]:
# Relationship between sentiment polarity and views:
# one point per row of `data`, coloured by the has_qmark column

plt.figure(figsize=(20, 10))
ax = sns.scatterplot(
    data=data,
    x="sentiment_polarity",
    y="views",
    hue="has_qmark",
    color="#7d5189",
)

# Plain integer tick labels on the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(regular_formatter))

# Hide the top and right borders of the plot
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

ax.set_title("Total Views by Sentiment Analysis", fontsize=18)
ax.set_xlabel("Sentiment Behind the Title", fontsize=10)
plt.ylabel("View Count", fontsize=10)

In [None]:
# Do popular videos have question marks?
# Mean views for each distinct question_mark_count value.

data_qm = data.groupby("question_mark_count").agg({"views": "mean"}).reset_index()

plt.figure(figsize=(20,10))
ax = sns.barplot(data=data_qm, x="question_mark_count", y="views", hue="question_mark_count")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Do popular videos have question marks?", fontsize=16)
plt.xlabel("Contains Question Mark", fontsize=8)
plt.ylabel("Average Views", fontsize=8)

# Build one tick label per bar from the counts actually present in data_qm,
# instead of assuming the groups are exactly 0, 1 and 2. The original wording
# is kept for those three values; any other count gets a generic "<n> QMs" label.
qm_names = {0: "No QM", 1: "One QM", 2: "Two QMs"}
qm_labels = [qm_names.get(count, f"{count} QMs") for count in data_qm["question_mark_count"]]
plt.xticks(list(range(len(qm_labels))), qm_labels)


In [None]:
# Do popular videos have question marks?
# Mean views for titles grouped by the has_qmark column.

data_has_qm = data.groupby("has_qmark").agg({"views": "mean"}).reset_index()

plt.figure(figsize=(20,10))
# Reference columns of the aggregated frame by name. Passing the raw
# `data.<column>` Series (as before) makes seaborn plot those vectors and
# ignore the aggregated frame given as `data=`.
ax = sns.barplot(data=data_has_qm, x="has_qmark", y="views", hue="has_qmark")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Do popular videos have question marks?", fontsize=16)
plt.xlabel("Contains Question Mark", fontsize=8)
plt.ylabel("Average Views", fontsize=8)
plt.xticks([0,1],["No QM","Has QM"])

In [None]:
# Do popular videos have digits?
# Mean views for each distinct digit_count value.

data_digits = data.groupby("digit_count").agg({"views": "mean"}).reset_index()

plt.figure(figsize=(32,8))
# Reference columns of the aggregated frame by name. Passing the raw
# `data.<column>` Series (as before) makes seaborn plot those vectors and
# ignore the aggregated frame given as `data=`.
ax = sns.barplot(data=data_digits, x="digit_count", y="views", hue="digit_count")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Do popular videos have digits?", fontsize=20)
plt.xlabel("Contains Digits", fontsize=12)
plt.ylabel("Average Views", fontsize=12)

In [None]:
# Do popular videos have dollar signs?
# Mean views for titles grouped by the has_currency column.

data_has_currency = data.groupby("has_currency").agg({"views": "mean"}).reset_index()

plt.figure(figsize=(20,10))
# Reference columns of the aggregated frame by name. Passing the raw
# `data.<column>` Series (as before) makes seaborn plot those vectors and
# ignore the aggregated frame given as `data=`.
ax = sns.barplot(data=data_has_currency, x="has_currency", y="views", hue="has_currency", palette="rocket")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Do popular videos have dollar signs?", fontsize=16)
plt.xlabel("Contains Dollar Sign", fontsize=8)
plt.ylabel("Average Views", fontsize=8)
plt.xticks([0,1],["No Dollar Sign","Has Dollar Sign"])

In [None]:
# Do popular videos contain digits?
# Mean views for titles grouped by the has_digit column.

data_has_digit = data.groupby("has_digit").agg({"views": "mean"}).reset_index()

plt.figure(figsize=(20,10))
# Reference columns of the aggregated frame by name. Passing the raw
# `data.<column>` Series (as before) makes seaborn plot those vectors and
# ignore the aggregated frame given as `data=`.
ax = sns.barplot(data=data_has_digit, x="has_digit", y="views", hue="has_digit", palette="flare")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Do popular videos contain digits?", fontsize=16)
plt.xlabel("Contains Digits", fontsize=8)
plt.ylabel("Average Views", fontsize=8)
plt.xticks([0,1],["No Digits","Has Digits"])

In [None]:
# Do popular videos contain more uppercase letters?
# Mean views for each distinct uppercase_count value.

data_upper = data.groupby("uppercase_count")["views"].mean().reset_index()

plt.figure(figsize=(20, 10))
ax = sns.barplot(
    x="uppercase_count",
    y="views",
    hue="uppercase_count",
    data=data_upper,
)

# Hide the top and right borders of the plot
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

ax.set_title("Do popular videos contain Uppercase letters?", fontsize=16)
ax.set_xlabel("Uppercase Letter Count", fontsize=8)
plt.ylabel("Average Views", fontsize=8)

In [None]:
# How does the uppercase percentage of a title relate to popularity?
# Mean views for each distinct percentage_uppercase_words value.
# NOTE(review): the grouped column is named percentage_uppercase_words while the
# plot labels say "letters" — confirm which of the two is actually measured.

cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)
data_upper = data.groupby("percentage_uppercase_words")["views"].mean().reset_index()

plt.figure(figsize=(20, 10))
ax = sns.barplot(
    x="percentage_uppercase_words",
    y="views",
    hue="percentage_uppercase_words",
    palette=cmap,
    data=data_upper,
)

# Hide the top and right borders of the plot
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

ax.set_title("Percentage of uppercase letters in each title and popularity", fontsize=16)
ax.set_xlabel("Uppercase Letter Percentage", fontsize=8)
plt.ylabel("Average Views", fontsize=8)

In [None]:
# The most common words in the titles

nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
punctuation = string.punctuation

# Normalise the titles into one lower-case string. Missing titles are dropped
# first, because str.join raises TypeError on non-string values such as NaN.
titles = data['title'].dropna().astype(str)
text = " ".join(titles).lower()
# Delete every punctuation character in a single pass
text = text.translate(str.maketrans("", "", punctuation))

# Keep tokens that are neither stop words nor pure digit strings.
# A set gives constant-time membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)
text = [t for t in text.split() if t not in stop_word_set and not t.isdigit()]

# Count once and split the (word, frequency) pairs into the two plot vectors
top_words = Counter(text).most_common(15)
x = [word for word, _ in top_words]
y = [count for _, count in top_words]

# hue=y is a list of counts, which seaborn maps as categorical levels when the
# palette is a list; size the palette to the number of distinct counts so the
# list length matches the number of hue levels (a fixed 12 did not).
palette = sns.cubehelix_palette(len(set(y)), start=.5, rot=-.75, reverse=True)

plt.figure(figsize=(20,10));
ax = sns.barplot(x=y, y=x, palette=palette, hue=y, legend=False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Most Frequent Keywords used in Video Titles");
plt.xlabel("Frequency", fontsize=14);
plt.yticks(fontsize=14);
plt.xticks(fontsize=14);

In [None]:
# Keywords that get highest views
# For each of the 100 most frequent keywords, compute the mean views of the
# titles that contain it, then plot the 20 keywords with the highest mean.

top100 = Counter(text).most_common(100)

# Tokenise every title the same way the keyword list was built (lower-case,
# punctuation removed, split on whitespace), so a keyword only counts when it
# appears as a whole word. The previous substring test matched e.g. "ai"
# inside "training".
strip_punctuation = str.maketrans("", "", string.punctuation)
title_tokens = data['title'].fillna("").astype(str).apply(
    lambda title: set(title.lower().translate(strip_punctuation).split())
)

# Compute the means from a boolean mask instead of writing one indicator column
# per keyword into `data`: a keyword equal to an existing column name (for
# example "views" or "title") would otherwise overwrite that column.
views_by_words = {}
for word, _count in top100:
    has_word = title_tokens.map(lambda tokens, word=word: word in tokens)
    if has_word.any():
        views_by_words[word] = data.loc[has_word, "views"].mean()

# Sort ascending by mean views, then reverse so the best keywords come first
views_by_words = dict(sorted(views_by_words.items(), key=lambda item: item[1]))
x = list(views_by_words.keys())[::-1]
y = list(views_by_words.values())[::-1]

top_x = x[:20]
top_y = y[:20]
# One colour per plotted keyword
palette = sns.cubehelix_palette(n_colors=len(top_x), start=.5, rot=-.75, reverse=False)

plt.figure(figsize=(20,10));
# Assign hue explicitly (with the legend disabled), matching the other keyword
# plot in this notebook; passing `palette` without `hue` is deprecated in
# recent seaborn releases.
ax = sns.barplot(x=top_y, y=top_x, hue=top_x, palette=palette, legend=False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Keywords linked with higher number of views");
plt.xlabel("Average Views", fontsize=14);
plt.yticks(fontsize=14);
plt.xticks(fontsize=14);
# Plain (non-scientific) tick labels on the views axis
plt.ticklabel_format(style='plain', axis='x',useOffset=False)