## Import libraries and data

In [None]:
pip install vaderSentiment

In [None]:
import numpy as np
import pandas as pd
import spacy
import string
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Load the spaCy English pipeline once; the preprocessing helpers below all use this shared `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Read the comments dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('youtube_vid_comments.csv')

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Author Name,Comment,Like Count
0,Lex Fridman,timestamps . Please check sponsors support pod...,626
1,Michael Wojcicki,corp wants shutdown free speech GOV calls want...,0
2,Allen Han,"2000 # C main language . 2003 Scala Groovy , 2...",0
3,Charles Timmy Phillips jr,m phone Timothy Allen cathy teeth,0
4,Alex Marcus,"Lex , relax outfit . going funeral ?",0


In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna(how='any')

## Pre-processing

The data was already cleaned to a certain extent after extracting it from the API.

#### Tokenization

In [None]:
def tokenize_text(text):
    """Split a piece of text into spaCy token strings.

    Only the tokenizer is run (``nlp.make_doc``): the token texts do not
    depend on the later pipeline stages, so skipping them avoids
    unnecessary work on every comment.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list with the ``text`` of each token, in document order.
    """
    return [token.text for token in nlp.make_doc(text)]

In [None]:
# Tokenize every comment; each entry of the new column is that comment's list of tokens.
df['tokens'] = df['Comment'].apply(tokenize_text)

#### Lowercasing

In [None]:
# Lowercase each token, keeping one list per comment.
df['lowercase_tokens'] = df['tokens'].apply(lambda tokens: [token.lower() for token in tokens])

#### Lemmatization

In [None]:
def lemmatize_tokens(tokens):
    """Return the lemma of every token spaCy finds in the re-joined tokens.

    The tokens are joined with single spaces and parsed again with ``nlp``,
    so the result follows spaCy's tokenization of that joined string.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A list of lemma strings, in document order.
    """
    joined_text = " ".join(tokens)
    lemmas = []
    for parsed_token in nlp(joined_text):
        lemmas.append(parsed_token.lemma_)
    return lemmas

In [None]:
# Lemmatize the lowercased tokens of every comment.
df['lemmatized_tokens'] = df['lowercase_tokens'].apply(lemmatize_tokens)

#### Stopword Removal

In [None]:
def remove_stopwords(tokens):
    """Drop spaCy stop words from a list of tokens.

    The tokens are joined with single spaces and re-tokenized. Only the
    tokenizer is run (``nlp.make_doc``): ``is_stop`` is a lexical attribute,
    so the later pipeline stages are not needed to evaluate it.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A list with the text of every re-tokenized token that is not a stop word.
    """
    doc = nlp.make_doc(" ".join(tokens))
    return [token.text for token in doc if not token.is_stop]

In [None]:
# Remove stop words from the lemmatized tokens of every comment.
df['filtered_tokens'] = df['lemmatized_tokens'].apply(remove_stopwords)

#### Punctuation Removal


In [None]:
def remove_punctuation(tokens):
    """Drop tokens that consist solely of ASCII punctuation characters.

    A token is removed when every one of its characters is in
    ``string.punctuation``, so multi-character marks such as "..." or "?!"
    are dropped as well as single marks. Testing ``token in string.punctuation``
    instead would be a substring check that lets those tokens through.
    Empty tokens are removed too. Tokens that mix punctuation with other
    characters (e.g. "don't") are kept.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A list of the tokens that contain at least one non-punctuation character.
    """
    punctuation_chars = set(string.punctuation)
    return [
        token for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

In [None]:
# Strip punctuation tokens from the stop-word-filtered tokens of every comment.
df['no_punct_tokens'] = df['filtered_tokens'].apply(remove_punctuation)

#### Word frequency count

In [None]:
# Flatten the per-comment token lists into one list and count each token.
all_tokens = [token for tokens_list in df['no_punct_tokens'] for token in tokens_list]
word_freq = Counter(all_tokens)
# Print only the most frequent tokens; printing the whole Counter floods the cell output.
# The full counts remain available in `word_freq`.
TOP_N_WORDS = 30
print(word_freq.most_common(TOP_N_WORDS))

### Sentiment Analysis

In [None]:
# One shared VADER analyzer, created once and reused for every call.
analyzer = SentimentIntensityAnalyzer()
def analyze_sentiment(text):
    """Score ``text`` with the shared VADER analyzer.

    Args:
        text: The text to score.

    Returns:
        The result of ``analyzer.polarity_scores(text)``, returned unchanged.
    """
    return analyzer.polarity_scores(text)

In [None]:
# Score the original comment text in `Comment`, not the preprocessed token columns.
df['sentiment_scores'] = df['Comment'].apply(analyze_sentiment)

In [None]:
# Remove the like count and the intermediate preprocessing columns from the frame.
columns_to_drop = ['Like Count', 'tokens', 'lowercase_tokens', 'lemmatized_tokens', 'filtered_tokens', 'no_punct_tokens']
df = df.drop(columns=columns_to_drop)

In [None]:
# Display the frame after the columns above were dropped.
df

Unnamed: 0,Author Name,Comment,sentiment_scores
0,Lex Fridman,timestamps . Please check sponsors support pod...,"{'neg': 0.033, 'neu': 0.809, 'pos': 0.158, 'co..."
1,Michael Wojcicki,corp wants shutdown free speech GOV calls want...,"{'neg': 0.0, 'neu': 0.82, 'pos': 0.18, 'compou..."
2,Allen Han,"2000 # C main language . 2003 Scala Groovy , 2...","{'neg': 0.025, 'neu': 0.906, 'pos': 0.07, 'com..."
3,Charles Timmy Phillips jr,m phone Timothy Allen cathy teeth,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,Alex Marcus,"Lex , relax outfit . going funeral ?","{'neg': 0.219, 'neu': 0.526, 'pos': 0.254, 'co..."
...,...,...,...
994,Bernios,ve got hand Lex : manages talk influential per...,"{'neg': 0.0, 'neu': 0.654, 'pos': 0.346, 'comp..."
995,George Heck,many umm try stop ! friendly advice past compe...,"{'neg': 0.073, 'neu': 0.629, 'pos': 0.298, 'co..."
996,Scott T,first time 's ever seemed like actual human,"{'neg': 0.0, 'neu': 0.737, 'pos': 0.263, 'comp..."
997,Ellie Jo Bonney,666k views ... jus saying xxx,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


## Classify sentiment category

In [None]:
def classify_sentiment(scores, threshold=0.05):
    """Map a VADER score dict to 'positive', 'negative' or 'neutral'.

    The label is taken from the normalized ``compound`` score using the
    thresholds recommended in the VADER documentation: ``>= threshold`` is
    positive, ``<= -threshold`` is negative, anything in between is neutral.
    Comparing ``pos``/``neg`` against ``neu`` instead labels almost every
    text neutral, because ``neu`` is usually the largest proportion.

    Args:
        scores: Mapping with 'neg', 'neu' and 'pos' keys, and normally 'compound'.
        threshold: Absolute compound score at or beyond which a text counts
            as positive or negative. Defaults to 0.05.

    Returns:
        One of the strings 'positive', 'negative' or 'neutral'.
    """
    compound = scores.get('compound')
    if compound is None:
        # No compound score available: fall back to comparing the raw proportions.
        if scores['pos'] > scores['neg'] and scores['pos'] > scores['neu']:
            return 'positive'
        if scores['neg'] > scores['pos'] and scores['neg'] > scores['neu']:
            return 'negative'
        return 'neutral'
    if compound >= threshold:
        return 'positive'
    if compound <= -threshold:
        return 'negative'
    return 'neutral'

In [None]:
# Label every comment from its score dict, then count how many comments fall in each category.
df['sentiment_category'] = df['sentiment_scores'].apply(classify_sentiment)
sentiment_counts = df['sentiment_category'].value_counts()
print(sentiment_counts)

neutral     853
positive     91
negative     41
Name: sentiment_category, dtype: int64
