In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.pipeline import make_pipeline


# Location of the labelled stock-sentiment CSV on the mounted Google Drive.
csv_path = '/content/drive/MyDrive/Colab Notebooks/NUS Fintech Society (ML) Project 2: Natural Language Processing classifier/stock_data.csv'

# Read the whole file into a DataFrame and display it as the cell output.
df = pd.read_csv(csv_path)

df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1
...,...,...
5786,Industry body CII said #discoms are likely to ...,-1
5787,"#Gold prices slip below Rs 46,000 as #investor...",-1
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1


In [81]:
# Dataset dimensions as (rows, columns). Printed explicitly because a notebook
# cell only auto-displays its last expression, so a bare `df.shape` here
# would be evaluated and silently discarded.
print(df.shape)

# Class balance of the sentiment labels (shown as the cell output).
df['Sentiment'].value_counts()

 1    3685
-1    2106
Name: Sentiment, dtype: int64

In [82]:
# Count the missing entries in each column, then show the counts as a
# one-column table.
missing_per_column = df.isna().sum()
missing_per_column.to_frame(name='# of missing values')

Unnamed: 0,# of missing values
Text,0
Sentiment,0


Lowercasing is carried out to standardize the text by converting all characters to lowercase. This ensures consistency in word representation. For example, "Term" and "term" are treated as the same, preventing the model from treating them as distinct features.

URLs, punctuation, and digits are also removed since they often do not contribute meaningful information for sentiment analysis and can be noise in the data.

Lastly, stopwords are removed. Stopwords are very common English words (e.g., "the," "and," "is") that occur frequently but rarely carry sentiment of their own, so they can be safely disregarded for sentiment analysis.

In [83]:
# Preprocessing steps
from nltk.corpus import stopwords

# Patterns are compiled once here instead of being looked up on every call.
# The URL pattern stops at the first whitespace character so that only the
# link itself is removed; a greedy `.*` would also delete every word that
# follows the link on the same line.
_URL_PATTERN = re.compile(r'https?://\S+')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
_DIGIT_PATTERN = re.compile(r'\d')

# Default English stop-word set; loaded on first use and reused afterwards so
# the NLTK corpus is not re-read for every row.
_ENGLISH_STOP_WORDS = None


def preprocess_text(text, stop_words=None):
    """Normalise one raw statement into a cleaned, space-separated string.

    Steps, in order: lowercase, remove URLs, remove punctuation, remove
    digits, remove stop words.

    Args:
        text: The raw statement as a string.
        stop_words: Optional collection of lowercase words to discard
            (a set is preferable for fast membership tests). When omitted,
            the English list from ``stopwords.words('english')`` is used.

    Returns:
        The surviving words joined by single spaces; an empty string if no
        word survives.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    # Lowercasing
    text = text.lower()
    # Remove URLs (replaced by a space so neighbouring words are not glued
    # together; the split() below collapses the extra whitespace)
    text = _URL_PATTERN.sub(' ', text)
    # Remove punctuation
    text = _PUNCTUATION_PATTERN.sub('', text)
    # Remove digits
    text = _DIGIT_PATTERN.sub('', text)
    # Remove stopwords (text is already lowercase at this point)
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)

# Clean every statement. Note that the result is written back into the same
# 'Text' column, so the raw text is replaced by its cleaned form.
cleaned_text = df['Text'].apply(preprocess_text)
df['Text'] = cleaned_text

df

Unnamed: 0,Text,Sentiment
0,kickers watchlist xide tit soq pnk cpw bpz aj ...,1
1,user aap movie return feageed indicator trades...,1
2,user id afraid short amzn looking like nearmon...,1
3,mnta,1
4,oi,1
...,...,...
5786,industry body cii said discoms likely suffer n...,-1
5787,gold prices slip rs investors book profits ami...,-1
5788,workers bajaj auto agreed wage cut period apri...,1
5789,sharemarket live sensex days high points nifty...,1


Tokenization here means splitting each string into a list of individual words (tokens), discarding the blanks and tabs between them. It is a fundamental step in NLP that turns raw text into units that can be represented and analyzed by machine learning models.

In [84]:
# Split each cleaned statement into a list of tokens; the 'Text' column holds
# lists of strings after this cell.
tokenizer = TweetTokenizer()
tokenized_text = df['Text'].apply(tokenizer.tokenize)
df['Text'] = tokenized_text

Lemmatization is used here rather than stemming. Lemmatization maps each word to a valid dictionary form (its lemma), whereas stemming strips suffixes by rule and can produce truncated non-words. The resulting features are therefore more interpretable and semantically meaningful. Lemmatization is computationally more expensive than stemming, but it is favored here for better accuracy.

In [85]:
# Lemmatize the terms in the news statements
lemmatizer = WordNetLemmatizer()


def lemmatize(tokens):
    """Lemmatize each token and join the results into one string.

    Args:
        tokens: A list of token strings, or an already-joined string of
            whitespace-separated tokens.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    if isinstance(tokens, str):
        # Iterating a plain string yields single characters, which would turn
        # 'gold price' into 'g o l d p r i c e'. That happens if this cell is
        # re-run after 'Text' has already been joined back into strings, so
        # split on whitespace first.
        tokens = tokens.split()
    # lemmatize() is called without a part-of-speech argument, so the
    # lemmatizer's default part of speech applies to every token.
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


df['Text'] = df['Text'].apply(lemmatize)

In [86]:
# Hold out 30% of the statements for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
features = df['Text']
labels = df['Sentiment']
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)

Since the task is sentiment analysis of short stock-related news statements, a Bag-of-Words (BoW) representation built with `CountVectorizer` is preferred here over TF-IDF weighting (`TfidfVectorizer`) for its simplicity and interpretability: each feature is simply the number of times a word appears in a statement.

In [87]:
# Bag-of-words features: the vocabulary is learned from the training
# statements only, so no information from the test split reaches the
# vectorizer.
bow_vectorizer = CountVectorizer().fit(X_train)

# Encode both splits with that same fitted vocabulary.
bow_X_train, bow_X_test = (
    bow_vectorizer.transform(split) for split in (X_train, X_test)
)


While more complex models exist, the Naive Bayes classifier is favored in this context because it handles sparse, high-dimensional data well, which is exactly the kind of data produced by a bag-of-words encoding of stock-related news statements.

In [88]:
# Multinomial Naive Bayes with default settings, trained on the bag-of-words
# counts of the training split.
model = MultinomialNB()
model.fit(X=bow_X_train, y=Y_train)

In [89]:
# Predict labels for both splits so training and test performance can be
# compared in the next cell.
Y_pred_train, Y_pred_test = (
    model.predict(matrix) for matrix in (bow_X_train, bow_X_test)
)

In [90]:
# Accuracy on the data the model was fitted on versus the held-out data; a
# large gap between the two numbers indicates overfitting.
train_accuracy = accuracy_score(y_true=Y_train, y_pred=Y_pred_train)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred_test)

# Report both scores rounded to two decimal places.
print(f"Train Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Train Accuracy: 0.90
Test Accuracy: 0.76
