In [15]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Sentiment breakdown of the comments in covid_2021_1.csv using NLTK's VADER
# analyzer. Leaves `df` (with added 'tokens' and 'sentiment' columns), the
# per-class counts and the per-class percentages as top-level names.

# read the dataset
df = pd.read_csv('covid_2021_1.csv')

# remove null values and duplicates
# NOTE(review): dropna() without `subset` drops a row when ANY column is null,
# not only when 'comment_text' is missing -- confirm that this is intended.
df.dropna(inplace=True)
df.drop_duplicates(subset='comment_text', inplace=True)

# tokenize comments in words
# NOTE(review): some NLTK releases look for the 'punkt_tab' resource instead
# of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')
df['tokens'] = df['comment_text'].apply(nltk.word_tokenize)

# perform sentiment analysis: keep only the 'compound' entry of the score dict
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
df['sentiment'] = df['comment_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

# count positive (> 0), negative (< 0) and neutral (== 0) comments.
# Summing the boolean mask counts the True entries without materialising a
# filtered copy of the whole DataFrame for each class.
total_comments = len(df)
positive_comments = int((df['sentiment'] > 0).sum())
negative_comments = int((df['sentiment'] < 0).sum())
neutral_comments = int((df['sentiment'] == 0).sum())

# calculate percentage of positive, negative, and neutral comments.
# An empty dataset (nothing left after cleaning) reports 0.0 for every class
# instead of raising ZeroDivisionError.
if total_comments:
    positive_percentage = (positive_comments / total_comments) * 100
    negative_percentage = (negative_comments / total_comments) * 100
    neutral_percentage = (neutral_comments / total_comments) * 100
else:
    positive_percentage = negative_percentage = neutral_percentage = 0.0

# print the results
print('Total Comments:', total_comments)
print('Positive Comments:', positive_comments, '(', positive_percentage, '%)')
print('Negative Comments:', negative_comments, '(', negative_percentage, '%)')
print('Neutral Comments:', neutral_comments, '(', neutral_percentage, '%)')

[nltk_data] Downloading package punkt to /home/sujal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/sujal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Total Comments: 40092
Positive Comments: 16366 ( 40.821111443679534 %)
Negative Comments: 10077 ( 25.134690212511224 %)
Neutral Comments: 13649 ( 34.04419834380924 %)
