# Importing Basic libraries

In [69]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [70]:
df = pd.read_csv('./YoutubeCommentsDataSet.csv')
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [71]:
df.isnull().sum()

Comment      44
Sentiment     0
dtype: int64

In [72]:
df.dropna(inplace=True)

In [73]:
print(f"Percent of duplication in the dataset: {round(df.duplicated().sum() / len(df), 3) * 100}%")

Percent of duplication in the dataset: 2.7%


Since the duplication is less than 3% of the original dataset, we can just drop it

In [74]:
df.drop_duplicates(inplace=True)

In [75]:
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [76]:
df['Sentiment'].value_counts()

Sentiment
positive    11054
neutral      4503
negative     2317
Name: count, dtype: int64

In [77]:
df['Sentiment'].replace({'positive': 1, 
                         'negative': -1,
                         'neutral': 0}, inplace=True)

df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sentiment'].replace({'positive': 1,
  df['Sentiment'].replace({'positive': 1,


Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,0
1,here in nz 50 of retailers don’t even have con...,-1
2,i will forever acknowledge this channel with t...,1
3,whenever i go to a place that doesn’t take app...,-1
4,apple pay is so convenient secure and easy to ...,1


## Data Cleaning

1. LowerCase all the text

In [78]:
df['Comment'] = df['Comment'].str.lower()

2. Tokenization

In [79]:
from nltk.tokenize import sent_tokenize
df['Comment'] = df['Comment'].apply(sent_tokenize)

3. Removing Punctuation

In [80]:
import re

df['Comment'] = df['Comment'].apply(lambda text: [re.sub(r'\W', ' ', i) for i in text]) 
# keeps words, numbers and spaces, removes punctuations

4. Removing basic english words

In [81]:
from nltk.corpus import stopwords

def remove_stopwords(texts): # input is list of sentences
    sentence = texts[0]
    new_sentence = []
    for word in sentence.split(' '):
        if not word in stopwords.words('english'):
            new_sentence.append(word)
    return new_sentence

In [82]:
df['Comment'] = df['Comment'].apply(remove_stopwords)

5. Stemming and Lemmanization

In [83]:
df['Comment'].iloc[55]

['face',
 'describing',
 'hops',
 'killed',
 '',
 'smell',
 'pretty',
 '',
 'dank',
 '',
 '']

In [86]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

porter = PorterStemmer()
lemma = WordNetLemmatizer()

def rooting(texts):
    # Tokenize the text into words
    words = word_tokenize(texts)
    rooted_words = []
    for word in words:
        stemmed = porter.stem(word)
        lemmatized = lemma.lemmatize(stemmed)
        rooted_words.append(lemmatized)
    return ' '.join(rooted_words)

df['Comment'] = df['Comment'].apply(rooting)

TypeError: expected string or bytes-like object, got 'list'