## Extracting data from the API

In [None]:
#import libraries
import numpy as np
import pandas as pd
import time
import requests
import json

In [None]:
# API key and the video whose comments are collected
api_key = 'YOUR_API_KEY'  # placeholder: substitute a real API key before running
video_link = 'https://www.youtube.com/watch?v=Ff4fRgnuFgQ'
video_id = 'Ff4fRgnuFgQ'  # same value as the `v` query parameter of video_link

In [None]:
# Request the first page of comment threads for video_id (maxResults=50,
# parts "snippet" and "replies") and parse the JSON body into `response`
url = "https://www.googleapis.com/youtube/v3/commentThreads?part=snippet,replies&videoId="+video_id+"&key="+api_key+"&maxResults=50"
response = requests.get(url).json()

In [None]:
response

In [None]:
response['items'][0]
# Of everything in this item, only the textDisplay field is needed to get the comment text

In [None]:
# Text of the top-level comment in the first returned thread
textDisplay = response['items'][0]['snippet']['topLevelComment']['snippet']['textDisplay']
textDisplay

In [None]:
# Other fields, such as the author name and the like count, can be retrieved
# from the same nested snippet of the first thread
authorDisplayName, textDisplay, likeCount = (
    response['items'][0]['snippet']['topLevelComment']['snippet'][field]
    for field in ('authorDisplayName', 'textDisplay', 'likeCount')
)
authorDisplayName, textDisplay, likeCount

In [None]:
# Start from an empty DataFrame with the three columns that get_comments fills in
df = pd.DataFrame(columns=["Author Name", "Comment", "Like Count"])

In [None]:
def get_comments(df, max_pages):
    """Fetch top-level comments for ``video_id`` and add them to ``df``.

    Requests up to ``max_pages`` pages (at most 50 comment threads per page)
    from the YouTube Data API ``commentThreads`` endpoint, following each
    response's ``nextPageToken`` until the API stops returning one.

    Args:
        df: DataFrame to extend. Rows are added under the columns
            "Author Name", "Comment" and "Like Count".
        max_pages: Maximum number of pages to request.

    Returns:
        A DataFrame holding the rows of ``df`` followed by the fetched
        comments, re-indexed from 0 (``df`` itself if nothing was fetched).

    Raises:
        KeyError: If a response has no ``items`` key or an item lacks one of
            the expected fields.
    """
    endpoint = "https://www.googleapis.com/youtube/v3/commentThreads"
    # Passing the query as a dict lets requests take care of URL encoding.
    params = {
        "part": "snippet,replies",
        "videoId": video_id,
        "key": api_key,
        "maxResults": 50,
    }

    # Rows are gathered in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for _ in range(max_pages):
        response = requests.get(endpoint, params=params).json()
        time.sleep(1)  # pause for a second after every request

        for comment in response['items']:
            snippet = comment['snippet']['topLevelComment']['snippet']
            rows.append({
                'Author Name': snippet['authorDisplayName'],
                'Comment': snippet['textDisplay'],
                'Like Count': snippet['likeCount'],
            })

        # A missing (or empty) token means the last page has been reached.
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
        params['pageToken'] = next_page_token

    if not rows:
        return df
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

In [None]:
df = get_comments(df, 20)  # collect up to 20 pages of at most 50 comments each (1000 comments maximum)

In [None]:
df.sample(10)

Unnamed: 0,Author Name,Comment,Like Count
691,Steven McDaniel,That was a disappointment for a podcast. I tho...,1
982,KITT,the weights were not meant to be released but ...,0
452,sirmolio,"<a href=""https://www.youtube.com/watch?v=Ff4fR...",1
317,Pillow Pants,&quot;We have mini jiu jitsu tournaments in my...,0
776,Daniel Nelson,I&#39;m going to critique this from a big pict...,1
337,Akiff Hssn,1. Does a rise in AI Computation required incr...,0
164,E.C.,Taylor swift is a terrible singer.,0
187,R G,zuck&#39;s garage is the size of the white hou...,0
702,Kevin Star Official,"Zuckerberg is so fake, especially when discuss...",1
217,Delta Lame,I wanna see his booster seat 🤔... 😅,0


## Pre-processing

#### Cleaning HTML & XHTML tags

In [None]:
from html.parser import HTMLParser

In [None]:
class MyHTMLParser(HTMLParser):
    """HTML parser that collects the text found outside of tags."""

    def __init__(self):
        super().__init__()
        # Text chunks in document order, one entry per handle_data call.
        self.result = []

    def handle_data(self, data):
        """Store one run of text encountered by the parser."""
        self.result.append(data)


def clean_html_tags(html):
    """Return the text content of ``html`` with all tags removed.

    The text chunks found between tags are joined with a single space.

    Args:
        html: String that may contain HTML/XHTML markup.

    Returns:
        The text of ``html`` without markup.
    """
    parser = MyHTMLParser()
    parser.feed(html)
    # feed() keeps trailing text buffered when it ends in something that
    # could be an unfinished character reference (e.g. "AT&T"); close()
    # forces that remaining text through handle_data so it is not lost.
    parser.close()
    return ' '.join(parser.result)

In [None]:
# Strip HTML markup from the comment text and the author names, keeping only the text
df['Comment'] = df['Comment'].apply(clean_html_tags)
df['Author Name'] = df['Author Name'].apply(clean_html_tags)

#### Decoding data

In [None]:
# Keep only ASCII characters: every character that cannot be encoded as ASCII is dropped.
# NOTE: this discards all non-ASCII text (accented letters, non-Latin scripts, emoji).
df['Comment'] = df['Comment'].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))

#### Removal of Expressions (Emojis)

In [None]:
import re

In [None]:
# Compiled once so the pattern is not rebuilt for every comment.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag pairs)
    "\u2600-\u27BF"          # miscellaneous symbols and dingbats
    "\uFE0F"                 # variation selector that follows some emoji
    "]+"
)


def remove_emojis(text):
    """Return ``text`` with emoji characters removed.

    Args:
        text: Input string.

    Returns:
        ``text`` without any character in the emoji ranges listed in
        ``_EMOJI_PATTERN``; all other characters are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# NOTE: the ASCII-only re-encoding applied to this column earlier has already dropped
# every non-ASCII character, so no emoji remain in the column by this point.
df['Comment'] = df['Comment'].apply(remove_emojis)

#### Apostrophe Lookup

In [None]:
APPOSTOPHES = {"'s": " is", "'m": " am", "'re": " are", "aren't": "are not",
               "can't": "cannot", "couldn't": "could not",
               "didn't": "did not", "doesn't": "does not",}
# A larger dictionary would give better coverage.

# Keys that start with an apostrophe are contraction suffixes ("'s", "'m",
# "'re"): they occur at the end of a word rather than as a word of their own.
_APOSTROPHE_SUFFIXES = tuple(
    (key, value) for key, value in APPOSTOPHES.items() if key.startswith("'")
)


def _expand_contraction(word):
    """Return the expanded form of one whitespace-delimited token."""
    lowered = word.lower()
    # Whole-token entries ("can't", "didn't", ...), matched case-insensitively.
    if lowered in APPOSTOPHES:
        return APPOSTOPHES[lowered]
    # Suffix entries: "I'm" -> "I am", "it's" -> "it is", "you're" -> "you are".
    for suffix, expansion in _APOSTROPHE_SUFFIXES:
        if lowered.endswith(suffix):
            return word[:-len(suffix)] + expansion
    return word


def apost(txt):
    """Expand the contractions listed in ``APPOSTOPHES`` inside ``txt``.

    The text is split on whitespace, each token is expanded if it matches a
    dictionary entry (as a whole token or by its apostrophe suffix), and the
    tokens are joined back with single spaces.

    Args:
        txt: Input string.

    Returns:
        The text with known contractions expanded. Tokens followed directly
        by punctuation (e.g. "can't,") are left as they are. A "'s" suffix is
        always expanded to " is", so possessives are rewritten as well.
    """
    return " ".join(_expand_contraction(word) for word in txt.split())

In [None]:
# Expand contractions in every comment
df['Comment'] = df['Comment'].apply(apost)

#### Removal of Stop-words

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# English stop-word set, loaded on first use and reused afterwards so the
# corpus is not re-read for every comment.
_ENGLISH_STOP_WORDS = None


def remove_stop_words(sentence):
    """Return ``sentence`` without English stop words.

    The sentence is tokenized with ``word_tokenize``; tokens whose lowercase
    form is in NLTK's English stop-word list are dropped and the remaining
    tokens are joined with single spaces.

    Args:
        sentence: Input string.

    Returns:
        The remaining tokens joined by spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    words = word_tokenize(sentence)
    filtered_words = [word for word in words if word.lower() not in _ENGLISH_STOP_WORDS]
    return ' '.join(filtered_words)

In [None]:
# Drop English stop words; the remaining tokens are re-joined with single spaces
df['Comment'] = df['Comment'].apply(remove_stop_words)

#### Split Attached Words

In [None]:
def clean_format_text(text):
    """Insert a space wherever a lowercase letter is directly followed by an uppercase one.

    Splits words that were written together, e.g. "helloWorld" becomes
    "hello World". Only the ASCII ranges a-z and A-Z are considered.
    """
    lower_to_upper_boundary = re.compile(r'(?<=[a-z])(?=[A-Z])')
    return lower_to_upper_boundary.sub(' ', text)

In [None]:
# Separate words that were written together (lowercase letter directly followed by uppercase)
df['Comment'] = df['Comment'].apply(clean_format_text)

#### Removing URLs

In [None]:
def remove_urls(text):
    """Return ``text`` with every http:// or https:// URL matched by the pattern removed.

    A match starts at "http://" or "https://" and extends over the following
    run of characters accepted by the pattern; it stops at the first character
    outside that set (for example whitespace or "#").
    """
    return re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        text,
        flags=re.MULTILINE,
    )

In [None]:
# NOTE(review): this runs after remove_stop_words has tokenized and re-joined the text;
# if the tokenizer splits URLs into several tokens, this pattern no longer matches them.
# Consider removing URLs before tokenizing -- TODO confirm.
df['Comment'] = df['Comment'].apply(remove_urls)

## Cleaned data

In [None]:
df

Unnamed: 0,Author Name,Comment,Like Count
0,Lex Fridman,timestamps . Please check sponsors support pod...,626
1,Michael Wojcicki,corp wants shutdown free speech GOV calls want...,0
2,Allen Han,"2000 # C main language . 2003 Scala Groovy , 2...",0
3,Charles Timmy Phillips jr,'m phone Timothy Allen cathy teeth,0
4,Alex Marcus,"Lex , relax outfit . going funeral ?",0
...,...,...,...
994,Bernios,'ve got hand Lex : manages talk influential pe...,39
995,George Heck,many umm try stop ! friendly advice past compe...,0
996,Scott T,first time 's ever seemed like actual human,0
997,Ellie Jo Bonney,666k views ... jus saying xxx,0


In [None]:
# Save the cleaned comments; the DataFrame index is written too (pandas default index=True)
df.to_csv('youtube_vid_comments.csv')