In [41]:
import pandas as pd

df = pd.read_csv('./data/news_data.csv', encoding='ISO-8859-1', header=None, names=['label', 'title'])

In [42]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Preprocess the data
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['title'])
X = tokenizer.texts_to_sequences(df['title'])
X = pad_sequences(X, maxlen=100)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])
y = to_categorical(y)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model
embedding_dim = 128
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=embedding_dim))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))  # 3 output classes: Neutral, Positive, Negative

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics=['accuracy'])

# Train the model
epochs = 10
batch_size = 64
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, 
                    validation_data=(X_test, y_test), verbose=0)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Example of making predictions
predictions = model.predict(X_test)

# Save the model if needed
model.save('./data/sentiment_analysis_model.h5')


Test Accuracy: 74.85%
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step




In [43]:
# Example titles
titles = [
    "Amazon's Quarterly Earnings Beat Expectations, Stock Price Surges",
    "European Central Bank Announces Stimulus Package Amid Economic Uncertainty",
    "Tesla to Build New Gigafactory in Texas, Boosting Production Capacity",
    "Global Semiconductor Shortage Impacts Tech Industry Supply Chains",
    "Federal Government Proposes Infrastructure Spending Bill Worth $1 Trillion",
    "UK Economy Faces Slowdown as Brexit Transition Period Ends",
    "Facebook Faces Antitrust Scrutiny Over Advertising Practices",
    "Oil and Gas Prices Soar Amid Middle East Geopolitical Tensions",
    "Asian Markets Rally on Positive Economic Data from China",
    "Gold Prices Dip as Investors Turn to Riskier Assets"
]


# Assuming the same tokenizer used during training
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(titles)
sequences = tokenizer.texts_to_sequences(titles)
X = pad_sequences(sequences, maxlen=100)  # Adjust maxlen according to your training data

# Predict sentiments
predictions = model.predict(X)

# Mapping predictions to sentiment categories
sentiments = ['Negative', 'Neutral', 'Positive']
predicted_sentiments = [sentiments[pred.argmax()] for pred in predictions]

# Print results
for i, title in enumerate(titles):
    print(f"Title: '{title}' - Predicted Sentiment: {predicted_sentiments[i]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Title: 'Amazon's Quarterly Earnings Beat Expectations, Stock Price Surges' - Predicted Sentiment: Neutral
Title: 'European Central Bank Announces Stimulus Package Amid Economic Uncertainty' - Predicted Sentiment: Neutral
Title: 'Tesla to Build New Gigafactory in Texas, Boosting Production Capacity' - Predicted Sentiment: Neutral
Title: 'Global Semiconductor Shortage Impacts Tech Industry Supply Chains' - Predicted Sentiment: Neutral
Title: 'Federal Government Proposes Infrastructure Spending Bill Worth $1 Trillion' - Predicted Sentiment: Neutral
Title: 'UK Economy Faces Slowdown as Brexit Transition Period Ends' - Predicted Sentiment: Neutral
Title: 'Facebook Faces Antitrust Scrutiny Over Advertising Practices' - Predicted Sentiment: Neutral
Title: 'Oil and Gas Prices Soar Amid Middle East Geopolitical Tensions' - Predicted Sentiment: Neutral
Title: 'Asian Markets Rally on Positive Economic Data from China' - Predi

In [44]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
oversample = RandomOverSampler(sampling_strategy={'negative': 1500, 'positive': 1500})

# Define undersampling strategy for majority class (Neutral)
undersample = RandomUnderSampler(sampling_strategy={'neutral': 1500})

# Define the resampling pipeline
resample_pipeline = Pipeline([
    ('oversampling', oversample),
    ('undersampling', undersample)
])

df_resampled, labels_resampled = resample_pipeline.fit_resample(df[['title']], df['label'])

# Convert back to dataframe
df_resampled = pd.DataFrame(df_resampled, columns=['title'])
df_resampled['label'] = labels_resampled
df_resampled

Unnamed: 0,title,label
2,The international electronic industry company ...,negative
415,A tinyurl link takes users to a scamming site ...,negative
421,"Compared with the FTSE 100 index , which rose ...",negative
423,"Compared with the FTSE 100 index , which rose ...",negative
500,One of the challenges in the oil production in...,negative
...,...,...
5874,Cramo Group 's financial targets for 2010-2013...,positive
5875,"Thereby , the company will expand its offering...",positive
5876,Passenger volumes rose by 8.4 % in the account...,positive
5877,"Rory Fitzgerald , general manager , operations...",positive


In [45]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

df = df_resampled.copy()
# Preprocess the data
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['title'])
X = tokenizer.texts_to_sequences(df['title'])
X = pad_sequences(X, maxlen=100)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])
y = to_categorical(y)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model
embedding_dim = 128
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=embedding_dim))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))  # 3 output classes: Neutral, Positive, Negative

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics=['accuracy'])

# Train the model
epochs = 10
batch_size = 64
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, 
                    validation_data=(X_test, y_test), verbose=0)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Example of making predictions
predictions = model.predict(X_test)

# Save the model if needed
model.save('./data/sentiment_analysis_model.h5')


Test Accuracy: 80.78%
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step




In [46]:
# Example titles
titles = [
    "Amazon's Quarterly Earnings Beat Expectations, Stock Price Surges",
    "European Central Bank Announces Stimulus Package Amid Economic Uncertainty",
    "Tesla to Build New Gigafactory in Texas, Boosting Production Capacity",
    "Global Semiconductor Shortage Impacts Tech Industry Supply Chains",
    "Federal Government Proposes Infrastructure Spending Bill Worth $1 Trillion",
    "UK Economy Faces Slowdown as Brexit Transition Period Ends",
    "Facebook Faces Antitrust Scrutiny Over Advertising Practices",
    "Oil and Gas Prices Soar Amid Middle East Geopolitical Tensions",
    "Asian Markets Rally on Positive Economic Data from China",
    "Gold Prices Dip as Investors Turn to Riskier Assets"
]


# Assuming the same tokenizer used during training
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(titles)
sequences = tokenizer.texts_to_sequences(titles)
X = pad_sequences(sequences, maxlen=100)  # Adjust maxlen according to your training data

# Predict sentiments
predictions = model.predict(X)

# Mapping predictions to sentiment categories
sentiments = ['Negative', 'Neutral', 'Positive']
predicted_sentiments = [sentiments[pred.argmax()] for pred in predictions]

# Print results
for i, title in enumerate(titles):
    print(f"Title: '{title}' - Predicted Sentiment: {predicted_sentiments[i]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Title: 'Amazon's Quarterly Earnings Beat Expectations, Stock Price Surges' - Predicted Sentiment: Neutral
Title: 'European Central Bank Announces Stimulus Package Amid Economic Uncertainty' - Predicted Sentiment: Neutral
Title: 'Tesla to Build New Gigafactory in Texas, Boosting Production Capacity' - Predicted Sentiment: Neutral
Title: 'Global Semiconductor Shortage Impacts Tech Industry Supply Chains' - Predicted Sentiment: Neutral
Title: 'Federal Government Proposes Infrastructure Spending Bill Worth $1 Trillion' - Predicted Sentiment: Neutral
Title: 'UK Economy Faces Slowdown as Brexit Transition Period Ends' - Predicted Sentiment: Neutral
Title: 'Facebook Faces Antitrust Scrutiny Over Advertising Practices' - Predicted Sentiment: Negative
Title: 'Oil and Gas Prices Soar Amid Middle East Geopolitical Tensions' - Predicted Sentiment: Positive
Title: 'Asian Markets Rally on Positive Economic Data from China' - Pre