In [47]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import contractions
from collections import Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer

data = pd.read_csv("C:\\Users\\Marco\\youtoxic_english_1000.csv")
sentences = data['Text']
lemmatizer = WordNetLemmatizer()
sia = SentimentIntensityAnalyzer()

def expand_contractions(text):
    return contractions.fix(text)

nltk_results = []
for sentence in sentences:
    expanded_sentence = expand_contractions(sentence)
    tokens = word_tokenize(expanded_sentence) 
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    nltk_results.append((tokens, lemmas))

textblob_results = []
for sentence in sentences:
    expanded_sentence = expand_contractions(sentence)
    blob = TextBlob(expanded_sentence)
    tokens = blob.words
    lemmas = [word.lemmatize() for word in blob.words]
    textblob_results.append((tokens, lemmas))

nltk_token_count = Counter([token for tokens, _ in nltk_results for token in tokens])
nltk_lemma_count = Counter([lemma for _, lemmas in nltk_results for lemma in lemmas])
textblob_token_count = Counter([token for tokens, _ in textblob_results for token in tokens])
textblob_lemma_count = Counter([lemma for _, lemmas in textblob_results for lemma in lemmas])
avg_nltk_token_length = sum(len(token) for tokens, _ in nltk_results for token in tokens) / sum(len(tokens) for tokens, _ in nltk_results)
avg_nltk_lemma_length = sum(len(lemma) for _, lemmas in nltk_results for lemma in lemmas) / sum(len(lemmas) for _, lemmas in nltk_results)
avg_textblob_token_length = sum(len(token) for tokens, _ in textblob_results for token in tokens) / sum(len(tokens) for tokens, _ in textblob_results)
avg_textblob_lemma_length = sum(len(lemma) for _, lemmas in textblob_results for lemma in lemmas) / sum(len(lemmas) for _, lemmas in textblob_results)

textblob_sentiments = [TextBlob(sentence).sentiment for sentence in sentences]
nltk_sentiments = []
for sentence in sentences:
    expanded_sentence = expand_contractions(sentence)
    sentiment = sia.polarity_scores(expanded_sentence)
    nltk_sentiments.append(sentiment)
    
avg_textblob_polarity = sum(sentiment.polarity for sentiment in textblob_sentiments) / len(textblob_sentiments)
avg_textblob_subjectivity = sum(sentiment.subjectivity for sentiment in textblob_sentiments) / len(textblob_sentiments)

avg_nltk_compound = sum(sentiment['compound'] for sentiment in nltk_sentiments) / len(nltk_sentiments)


In [39]:
for i, sentence in enumerate(sentences):
    print(f"Sentence: {sentence}")
    print()

Sentence: If only people would just take a step back and not make this case about them, because it wasn't about anyone except the two people in that situation.  To lump yourself into this mess and take matters into your own hands makes these kinds of protests selfish and without rational thought and investigation.  The guy in this video is heavily emotional and hyped up and wants to be heard, and when he gets heard he just presses more and more.  He was never out to have a reasonable discussion.  Kudos to the Smerconish for keeping level the whole time and letting Masri make himself out to be a fool.  How dare he and those that tore that city down in protest make this about themselves and to dishonor the entire incident with their own hate.  By the way, since when did police brutality become an epidemic?  I wish everyone would just stop pretending like they were there and they knew EXACTLY what was going on, because there's no measurable amount of people that honestly witnessed this in

In [40]:
for i, sentence in enumerate(sentences):
    print(f"NLTK Tokens: {nltk_results[i][0]}")
    print(f"TextBlob Tokens: {textblob_results[i][0]}")
    print()

NLTK Tokens: ['If', 'only', 'people', 'would', 'just', 'take', 'a', 'step', 'back', 'and', 'not', 'make', 'this', 'case', 'about', 'them', ',', 'because', 'it', 'was', 'not', 'about', 'anyone', 'except', 'the', 'two', 'people', 'in', 'that', 'situation', '.', 'To', 'lump', 'yourself', 'into', 'this', 'mess', 'and', 'take', 'matters', 'into', 'your', 'own', 'hands', 'makes', 'these', 'kinds', 'of', 'protests', 'selfish', 'and', 'without', 'rational', 'thought', 'and', 'investigation', '.', 'The', 'guy', 'in', 'this', 'video', 'is', 'heavily', 'emotional', 'and', 'hyped', 'up', 'and', 'wants', 'to', 'be', 'heard', ',', 'and', 'when', 'he', 'gets', 'heard', 'he', 'just', 'presses', 'more', 'and', 'more', '.', 'He', 'was', 'never', 'out', 'to', 'have', 'a', 'reasonable', 'discussion', '.', 'Kudos', 'to', 'the', 'Smerconish', 'for', 'keeping', 'level', 'the', 'whole', 'time', 'and', 'letting', 'Masri', 'make', 'himself', 'out', 'to', 'be', 'a', 'fool', '.', 'How', 'dare', 'he', 'and', 'thos

In [41]:
for i, sentence in enumerate(sentences):
    print(f"NLTK Lemmas: {nltk_results[i][1]}")
    print(f"TextBlob Lemmas: {textblob_results[i][1]}")
    print()

NLTK Lemmas: ['If', 'only', 'people', 'would', 'just', 'take', 'a', 'step', 'back', 'and', 'not', 'make', 'this', 'case', 'about', 'them', ',', 'because', 'it', 'wa', 'not', 'about', 'anyone', 'except', 'the', 'two', 'people', 'in', 'that', 'situation', '.', 'To', 'lump', 'yourself', 'into', 'this', 'mess', 'and', 'take', 'matter', 'into', 'your', 'own', 'hand', 'make', 'these', 'kind', 'of', 'protest', 'selfish', 'and', 'without', 'rational', 'thought', 'and', 'investigation', '.', 'The', 'guy', 'in', 'this', 'video', 'is', 'heavily', 'emotional', 'and', 'hyped', 'up', 'and', 'want', 'to', 'be', 'heard', ',', 'and', 'when', 'he', 'get', 'heard', 'he', 'just', 'press', 'more', 'and', 'more', '.', 'He', 'wa', 'never', 'out', 'to', 'have', 'a', 'reasonable', 'discussion', '.', 'Kudos', 'to', 'the', 'Smerconish', 'for', 'keeping', 'level', 'the', 'whole', 'time', 'and', 'letting', 'Masri', 'make', 'himself', 'out', 'to', 'be', 'a', 'fool', '.', 'How', 'dare', 'he', 'and', 'those', 'that',

In [42]:
for i, sentence in enumerate(sentences):
    print(f"TextBlob Sentiment (Polarity, Subjectivity): {textblob_sentiments[i]}")
    print(f"NLTK Sentiment (Positive, Neutral, Negative, Compound): {nltk_sentiments[i]}")
    print()

TextBlob Sentiment (Polarity, Subjectivity): Sentiment(polarity=0.1322063492063492, subjectivity=0.556984126984127)
NLTK Sentiment (Positive, Neutral, Negative, Compound): {'neg': 0.105, 'neu': 0.801, 'pos': 0.094, 'compound': -0.6678}

TextBlob Sentiment (Polarity, Subjectivity): Sentiment(polarity=0.0, subjectivity=0.0)
NLTK Sentiment (Positive, Neutral, Negative, Compound): {'neg': 0.344, 'neu': 0.48, 'pos': 0.175, 'compound': -0.8529}

TextBlob Sentiment (Polarity, Subjectivity): Sentiment(polarity=0.12083333333333335, subjectivity=0.35833333333333334)
NLTK Sentiment (Positive, Neutral, Negative, Compound): {'neg': 0.162, 'neu': 0.771, 'pos': 0.067, 'compound': -0.8957}

TextBlob Sentiment (Polarity, Subjectivity): Sentiment(polarity=0.13738095238095238, subjectivity=0.36642857142857144)
NLTK Sentiment (Positive, Neutral, Negative, Compound): {'neg': 0.163, 'neu': 0.754, 'pos': 0.083, 'compound': -0.8335}

TextBlob Sentiment (Polarity, Subjectivity): Sentiment(polarity=0.1428571428

In [43]:
print(f"NLTK Token frequency: {nltk_token_count}")
print(f"NLTK Lemma frequency: {nltk_lemma_count}")



In [44]:
print(f"TextBlob Token frequency: {textblob_token_count}")
print(f"TextBlob Lemma frequency: {textblob_lemma_count}")



In [45]:
print(f"Average NLTK token length: {avg_nltk_token_length}")
print(f"Average NLTK lemma length: {avg_nltk_lemma_length}")
print(f"Average TextBlob token length: {avg_textblob_token_length}")
print(f"Average TextBlob lemma length: {avg_textblob_lemma_length}")

Average NLTK token length: 3.9581128747795415
Average NLTK lemma length: 3.8976812947401185
Average TextBlob token length: 4.286329261198371
Average TextBlob lemma length: 4.218528214077952


In [48]:
print(f"Average TextBlob Polarity: {avg_textblob_polarity}")
print(f"Average TextBlob Subjectivity: {avg_textblob_subjectivity}")
print(f"Average NLTK Compound Score: {avg_nltk_compound}")

Average TextBlob Polarity: 0.010115410935007075
Average TextBlob Subjectivity: 0.4145070637705122
Average NLTK Compound Score: -0.17929090000000014
