In [39]:
import pandas as pd
from langdetect import detect
from textblob import TextBlob
import string
import nltk
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv("../data/GB_youtube_trending_data_cleaned.csv", parse_dates = ["publishedAt", "trending_date"])

In [80]:
df.head()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,...,Teaser,Official,Trailer,Music,Prix,Grand,Love,Video,iPhone,100
0,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11 16:34:06+00:00,UCYzPXprvl5Y-Sf0g4vX-m6g,jacksepticeye,24,2020-08-12 00:00:00+00:00,"['jacksepticeye', 'funny', 'funny meme', 'meme...",2038853,353790,...,0,0,0,0,0,0,0,0,0,0
1,9nidKH8cM38,TAXI CAB SLAYER KILLS 'TO KNOW HOW IT FEELS',2020-08-11 20:00:45+00:00,UCFMbX7frWZfuWdjAML0babA,Eleanor Neale,27,2020-08-12 00:00:00+00:00,"['eleanor', 'neale', 'eleanor neale', 'eleanor...",236830,16423,...,0,0,0,0,0,0,0,0,0,0
2,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands The ...,2020-08-11 17:00:10+00:00,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,20,2020-08-12 00:00:00+00:00,"['Apex Legends', 'Apex Legends characters', 'n...",2381688,146739,...,0,0,0,0,0,0,0,0,0,0
3,kgUV1MaD_M8,Nines - Clout (Official Video),2020-08-10 18:30:28+00:00,UCvDkzrj8ZPlBqRd6fIxdhTw,Nines,24,2020-08-12 00:00:00+00:00,"['Nines', 'Trapper of the year', 'Crop Circle'...",613785,37567,...,0,1,0,0,0,0,0,1,0,0
4,49Z6Mv4_WCA,i don't know what im doing anymore,2020-08-11 20:24:34+00:00,UCtinbF-Q-fVthA0qrFQTgXQ,CaseyNeistat,22,2020-08-12 00:00:00+00:00,,940036,87113,...,0,0,0,0,0,0,0,0,0,0


# Title Length

### Number of Characters

In [6]:
def get_title_length_chars(title):
    if pd.notna(title):
        return len(title)
    else:
        return 0

df['title_length_chars'] = df['title'].apply(get_title_length_chars)

### Number of Words

In [7]:
def get_title_length_words(title):
    if pd.notna(title):
        return len(title.split())
    else:
        return 0

df['title_length_words'] = df['title'].apply(get_title_length_words)

### Average Word Length

In [27]:
def average_word_length(title):
    if pd.notna(title):
        words = title.split()
        if words:
            total_length = sum(len(word) for word in words)
            return total_length / len(words)
    return 0

df['title_avg_word_length'] = df['title'].apply(average_word_length)

### Longest Word Length

In [29]:
def longest_word_length(title):
    if pd.notna(title):
        words = title.split()
        if words:
            return max(len(word) for word in words)
    return 0

df['title_longest_word_length'] = df['title'].apply(longest_word_length)

# Case

### All Words Uppercase

In [8]:
def all_words_are_uppercase(text):
    if pd.isna(text):
        return False
    words = text.split()
    return all(word.isupper() for word in words)  

df['title_all_upcase'] = df['title'].apply(all_words_are_uppercase).astype(int)

### First Word Uppercase

In [15]:
def first_word_is_uppercase(text):
    if pd.notna(text):
        words = text.split()
        if words:
            first_word = words[0]
            return first_word.isupper()
    return False

df['title_first_upcase'] = df['title'].apply(first_word_is_uppercase).astype(int)

### Any Word Uppercase

In [None]:
def any_word_is_uppercase(title):
    if pd.notna(title):
        words = title.split()
        return any(word.isupper() for word in words)
    return False

df['title_any_upcase'] = df['title'].apply(any_word_is_uppercase).astype(int)

### Proportion Words Uppercase

In [32]:
def proportion_words_uppercase(title):
    if pd.notna(title):
        words = title.split()
        if words:
            uppercase_words = [word for word in words if word.isupper()]
            return len(uppercase_words) / len(words)
    return 0.0

df['title_prop_upcase'] = df['title'].apply(proportion_words_uppercase)

### All Words Lowercase

In [17]:
def all_words_are_lowercase(text):
    if pd.isna(text):
        return False
    words = text.split()
    return all(word.islower() for word in words)  

df['title_all_lowercase'] = df['title'].apply(all_words_are_lowercase).astype(int)

# Sentiment

In [20]:
def identify_sentiment(text):
    if pd.isna(text):
        return -9    
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity < 0:
        return 0
    else:
        return -1

df['title_sentiment'] = df['title'].apply(identify_sentiment)

# Keywords

In [70]:
def remove_punctuation_within_word(word):
    return ''.join(char for char in word if char not in string.punctuation)

word_counts = df['title'].str.split(expand=True).stack().apply(remove_punctuation_within_word).value_counts()
frequent_words = word_counts[word_counts >= 1000].index
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
frequent_words = [word for word in frequent_words if word.lower() not in stop_words and word not in punctuation]

In [71]:
result_df = pd.DataFrame(columns = frequent_words)

for word in frequent_words:
    df['title'].fillna('', inplace=True) 
    word_mask = df['title'].str.lower().str.contains(word.lower())
    avg_views = df.loc[word_mask, 'view_count'].mean()
    if not result_df.empty:
        result_df = pd.concat([result_df, pd.DataFrame({'Word': [word], 'Average_Views': [avg_views]})], ignore_index=True)
    else:
        result_df = pd.DataFrame({'Word': [word], 'Average_Views': [avg_views]})

           Word  Average_Views
0                 2.168352e+06
1      Official   4.105751e+06
2         Video   3.472904e+06
3            vs   2.030661e+06
4    Highlights   1.427102e+06
..          ...            ...
141        Tour   1.284690e+06
142        FULL   1.866224e+06
143    OFFICIAL   4.105751e+06
144        Fury   7.908330e+05
145        Back   1.351698e+06

[146 rows x 2 columns]


In [78]:
result_df = result_df.sort_values(by='Average_Views', ascending = False)
result_df.head(20)

Unnamed: 0,Word,Average_Views
36,MV,15288760.0
108,BTS,12446170.0
91,Shorts,8218931.0
19,shorts,8218931.0
42,Teaser,5180543.0
1,Official,4105751.0
143,OFFICIAL,4105751.0
5,Trailer,3834790.0
8,Music,3604624.0
113,Prix,3555647.0


In [79]:
keywords = ['How', 'MV', 'BTS', 'Shorts', 'Teaser', 'Official', 'Trailer', 'Music', 'Prix', 'Grand', 'Love', 'Video', 'iPhone', '100']

def create_keyword_flags(df, keywords):
    for keyword in keywords:
        df[keyword] = df['title'].fillna('').str.contains(keyword, case = False).astype(int)
    return df

df = create_keyword_flags(df, keywords)

# Language

In [None]:
def identify_language(title):
    try:
        return detect(title)
    except:
        return 'Unknown'

#df['title_language'] = df['title'].apply(identify_language)

# Numbers

### Contains Digit

In [24]:
def contains_digit(text):
    if text is not None and not pd.isna(text):
        for char in text:
            if char.isdigit():
                return True
    return False

df['title_contains_digit'] = df['title'].apply(contains_digit).astype(int)

### Starts with Digit

In [25]:
def starts_with_digit(text):
    if text is not None and not pd.isna(text):
        return text[0].isdigit()
    return False

df['title_starts_digit'] = df['title'].apply(starts_with_digit).astype(int)

# Punctuation

### Is Title Question?

In [20]:
def identify_question_in_title(title):
    if pd.notna(title):
        return '?' in title
    return False

df['title_contains_question'] = df['title'].apply(identify_question_in_title).astype(int)

### Count Exclamations

In [34]:
def count_exclamation_marks(title):
    if pd.notna(title):
        return title.count('!')
    return 0

df['title_exclamation_count'] = df['title'].apply(count_exclamation_marks)

### Number of Punctuations

In [37]:
def count_punctuation_marks(title):
    if pd.notna(title):
        return sum(title.count(punctuation) for punctuation in string.punctuation)
    return 0

df['title_punctuation_count'] = df['title'].apply(count_punctuation_marks)

# Stop Words

### Number Stop Words

In [40]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def count_stop_words(title):
    if pd.notna(title):
        words = title.split()
        return sum(1 for word in words if word.lower() in stop_words)
    return 0

df['title_stop_words_count'] = df['title'].apply(count_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dilan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


### Proportion Stop Words

In [42]:
def proportion_stop_words(title):
    if pd.notna(title):
        words = title.split()
        if words:
            stop_word_count = sum(1 for word in words if word.lower() in stop_words)
            return stop_word_count / len(words)
    return 0.0

df['title_stop_words_prop'] = df['title'].apply(proportion_stop_words)

# Quotes

### Contains Quote

In [46]:
def contains_quote(title):
    if pd.notna(title):
        return (title.count('"') == 2) or (title.count("'") == 2)
    return False

df['title_contains_quote'] = df['title'].apply(contains_quote).astype(int)