In [1]:
import warnings
warnings.filterwarnings("ignore")

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('wordnet')  # corpus required by WordNetLemmatizer

from textblob import TextBlob
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore

#from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Loricson\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Loricson\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Loricson\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Load cnbc_headlines.csv into a DataFrame; the path is relative, so the file
# must sit in the notebook's working directory.
cnbcNews = pd.read_csv('cnbc_headlines.csv')

## EDA

In [3]:
# Number of missing values in each column
cnbcNews.isna().sum()

Headlines      280
Time           280
Description    280
dtype: int64

In [4]:
# Drop every row that has a missing value in ANY column (not only Headlines)
cnbcNews = cnbcNews.dropna(axis=0, how='any')

In [5]:
# Re-count missing values per column after the drop
cnbcNews.isna().sum()

Headlines      0
Time           0
Description    0
dtype: int64

In [6]:
# How many rows are exact repeats of an earlier row
cnbcNews.duplicated(keep='first').sum()

0

In [7]:
# Remove repeated rows; the first occurrence of each is kept (pandas default)
cnbcNews = cnbcNews.drop_duplicates()

In [8]:
# (rows, columns) of the frame after the dropna / drop_duplicates steps
cnbcNews.shape

(2800, 3)

## Preprocessing text

In [9]:
# English stopword set, filled on first use so the corpus is read only once
# instead of on every call (matters when the function is used with .apply).
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text, stemming=False, lemmatizing=False):
    """Normalise a piece of text for downstream NLP.

    Steps, in order: lowercase, delete every character that is neither
    alphanumeric nor whitespace, remove English stopwords, then optionally
    stem and/or lemmatize each remaining word.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (pandas' missing-value marker) are
        treated as empty text; any other non-string value is converted with
        ``str`` before processing.
    stemming : bool, default False
        Apply ``PorterStemmer`` to every remaining word.
    lemmatizing : bool, default False
        Apply ``WordNetLemmatizer`` to every remaining word (after stemming
        when both flags are set). Needs the NLTK ``wordnet`` corpus.

    Returns
    -------
    str
        The remaining words joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation. Characters are deleted, not replaced by a space,
    # so "u.s." becomes "us" and "don't" becomes "dont".
    text = ''.join(char for char in text if char.isalnum() or char.isspace())

    # Remove stopwords
    stop_words = _english_stop_words()
    words = [word for word in text.split() if word not in stop_words]

    # Apply stemming if specified
    if stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    # Apply lemmatizing if specified
    if lemmatizing:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    # Join the preprocessed words back into a sentence
    return ' '.join(words)

## Sentiment Analysis

In [26]:
# Shared VADER analyzer, created on first use so it is not rebuilt per call.
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, building it on first call."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def get_sentiment_label(text, threshold=0.05):
    """Classify text as 'Positive', 'Negative' or 'Neutral' using VADER.

    Parameters
    ----------
    text : str
        Text to score.
    threshold : float, default 0.05
        Cut-off applied to the VADER compound score: a score at or above
        ``threshold`` is 'Positive', at or below ``-threshold`` is
        'Negative', anything in between is 'Neutral'.

    Returns
    -------
    str
        One of 'Positive', 'Negative', 'Neutral'.
    """
    sentiment_scores = _get_vader_analyzer().polarity_scores(text)

    # Decide sentiment label based on compound score
    compound_score = sentiment_scores['compound']
    if compound_score >= threshold:
        return 'Positive'
    if compound_score <= -threshold:
        return 'Negative'
    return 'Neutral'

# Example usage: label a single hand-written headline and print the result
headline = "Tim cook resigning"
sentiment_label = get_sentiment_label(headline)
print("Sentiment Label:", sentiment_label)

# Replace every Description with its cleaned form, using preprocess_text's
# defaults (no stemming, no lemmatizing).
# NOTE(review): this overwrites the raw Description column in place — confirm
# that nothing later in the notebook still needs the original text.
cnbcNews['Description'] = cnbcNews['Description'].apply(preprocess_text)

Sentiment Label: Negative
