In [3]:
from textblob import TextBlob
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np


import nltk
import pandas as pd
from textblob import Word
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from keras.models import Sequential
from tokenizers import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split 

In [5]:
def preprocess_text(text):

    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", text)
    # Tokenize the text into words
    words = word_tokenize(text.lower())

    words = [word for word in words if word not in string.punctuation]

    # Remove stop words
    words = " ".join([word for word in words if word.isalpha()]).lower()    
        
    return words


In [31]:
url = "https://www.nbcnews.com/politics/2024-election/biden-rakes-25-million-new-york-fundraiser-obama-clinton-rcna145534"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

parsed_url = urlparse(url).netloc
base_url = parsed_url[4:]

text = []
# Find all <p> elements and print their text
for paragraph in soup.find_all('p'):
    text.append(preprocess_text(paragraph.get_text()))

In [32]:
text = [phrase for phrase in text if phrase.count(' ') > 2]
text

['there are no new alerts at this time',
 'president joe biden was joined thursday by two of his democratic predecessors for a star studded fundraiser at radio city music hall that his campaign said brought in more than million',
 'former presidents barack obama and bill clinton participated in the event in new york with more than supporters in attendance including several protesters who interrupted the program when the three presidents were speaking',
 'actor and comedian mindy kaling hosted the program which ended at around p m and late night host stephen colbert moderated a conversation with biden clinton and obama special guests include celebrities like queen latifah lizzo ben platt cynthia erivo and lea michele',
 'during the nearly hourlong moderated conversation colbert joked that the moment was historic because three presidents have come to new york and not one of them to appear in court taking a jab at former president donald trump s criminal indictments and civil trials',
 'c

### TextBlob: Determining Polarity

In [33]:
polarity_score = []

for phrase in text:
    polarity_score.append(TextBlob(phrase).sentiment.polarity)


#### polarity denotes the sentiment of text. values lie in [-1,1]. -1 denotes a highly negative sentiment and 1 denotes a hightly positive sentiment

In [34]:
polarity_min = min(polarity_score)
polarity_max = max(polarity_score)
polarity_std = np.std(polarity_score)
polarity_mean = np.mean(polarity_score)
polarity_range = polarity_max - polarity_min

In [35]:
print("Min: ",polarity_min)
print("Max: ",polarity_max)
print("Standard Deviation: ", polarity_std)
print("Mean: ", polarity_mean)
print("Range: ",polarity_range)

Min:  -0.16666666666666666
Max:  0.8500000000000001
Standard Deviation:  0.23332012715639822
Mean:  0.13822353638425067
Range:  1.0166666666666668


### TextBlob: Subjectivity analysis

In [36]:
subjectivity_score = []

for phrase in text:
    subjectivity_score.append(TextBlob(phrase).sentiment.subjectivity)

In [37]:
#subjectivity_score

In [38]:
subjectivity_min = min(subjectivity_score)
subjectivity_max = max(subjectivity_score)
subjectivity_std = np.std(subjectivity_score)
subjectivity_mean = np.mean(subjectivity_score)
subjectivity_range = subjectivity_max - subjectivity_min

In [39]:
print("Min: ",subjectivity_min)
print("Max: ",subjectivity_max)
print("Standard Deviation: ", subjectivity_std)
print("median: ", np.median(subjectivity_score))
print("Mean: ", subjectivity_mean)
print("Range: ",subjectivity_range)

Min:  0.0
Max:  1.0
Standard Deviation:  0.2465389450238324
median:  0.3333333333333333
Mean:  0.30681823335394764
Range:  1.0


### Sentiment Analysis with LSTM

In [124]:
df = pd.read_csv('./data/articles1.csv')


In [96]:
content_array = df['content'].values

In [97]:
polarity_score = []

for phrase in content_array:
    polarity_score.append(TextBlob(phrase).sentiment.polarity)

# polarity_score

In [98]:
tlow = np.percentile(polarity_score, 33)
thigh = np.percentile(polarity_score, 66)
print(tlow)
print(thigh)

0.03997114079477093
0.10316472712094597


In [99]:
# Function to analyze sentiment and return sentiment label
def get_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    
    if polarity > thigh:
        return 2
    elif polarity < tlow:
        return 0
    else:
        return 1

# Apply sentiment analysis function to the 'content' column and create a new 'sentiment' column
df['sentiment'] = df['content'].apply(get_sentiment)

In [100]:
df.groupby('sentiment')['sentiment'].value_counts()

sentiment
0    16500
1    16500
2    17000
Name: count, dtype: int64

In [101]:
def cleaning(df, stop_words):
    df['content'] = df['content'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
    # Replacing the digits/numbers
    df['content'] = df['content'].str.replace('d', '')
    # Removing stop words
    df['content'] = df['content'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))
    # Lemmatization
    #df['content'] = df['content'].apply(lambda x: ' '.join([Word(x).lemmatize() for x in x.split()]))
    return df

stop_words = stopwords.words('english')
data_cleaned = cleaning(df, stop_words)

In [102]:
data_cleaned

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content,sentiment
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,washington — congressional republicans new fea...,0
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"bullet shells get counte, bloo ries votive can...",0
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"walt isney’s “bambi” opene 1942, critics prais...",1
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"eath may great equalizer, isn’t necessarily ev...",2
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"seoul, south korea — north korea’s leaer, kim ...",2
...,...,...,...,...,...,...,...,...,...,...,...
49995,53287,73465,"Rex Tillerson Says Climate Change Is Real, but …",Atlantic,Robinson Meyer,2017-01-11,2017.0,1.0,,"chairman ceo exxonmobil, rex tillerson amitte ...",1
49996,53288,73466,The Biggest Intelligence Questions Raised by t...,Atlantic,Amy Zegart,2017-01-11,2017.0,1.0,,i’ve spent nearly 20 years looking intelligenc...,1
49997,53289,73467,Trump Announces Plan That Does Little to Resol...,Atlantic,Jeremy Venook,2017-01-11,2017.0,1.0,,onal trump taking necessary steps resolve conf...,1
49998,53290,73468,Dozens of For-Profit Colleges Could Soon Close,Atlantic,Emily DeRuy,2017-01-11,2017.0,1.0,,ozens colleges coul force close next several y...,2


In [115]:
data_cleaned.to_csv('./data/articles2.csv')

In [123]:
df1 = pd.read_csv("./data/articles2.csv")
df1 = cleaning(df, stop_words)

In [122]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


vectorizer = CountVectorizer(stop_words='english')
x=vectorizer.fit_transform(df1['content'])

y = df1['sentiment']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

mnb_classifier = MultinomialNB()
mnb_classifier.fit(X_train, y_train)

y_pred = mnb_classifier.predict(X_test)

print('Accuracy:', accuracy_score(y_pred, y_test))

KeyError: 'content'

In [75]:
vector_x = vectorizer.transform(text)
predictions = mnb_classifier.predict(vector_x)

sentiment_map = {0:'negative',1:'neutral',2:'positive'}
predicted_sentiments = [sentiment_map[pred] for pred in predictions]

print(predictions.mean())

1.4
