# **Connecting from Drive**

In [38]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Importing Dataset**

In [40]:
news=pd.read_csv("/content/drive/MyDrive/Stock-price-prediction-using-NLP-main/RedditNews.csv")
news.head()

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...


In [41]:
stock=pd.read_csv("/content/drive/MyDrive/Stock-price-prediction-using-NLP-main/stock.csv")
stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,07-01-2016,17924.24023,18002.38086,17916.91016,17949.36914,82160000,17949.36914
1,6/30/2016,17712.75977,17930.60938,17711.80078,17929.99023,133030000,17929.99023
2,6/29/2016,17456.01953,17704.50977,17456.01953,17694.67969,106380000,17694.67969
3,6/28/2016,17190.50977,17409.7207,17190.50977,17409.7207,112190000,17409.7207
4,6/27/2016,17355.21094,17355.21094,17063.08008,17140.24023,138740000,17140.24023


In [42]:
# Changing to Data column

stock['Date'] = pd.to_datetime(stock['Date'])
news['Date'] = pd.to_datetime(news['Date'])

In [43]:
min_val = news['Date'].min()
max_val = news['Date'].max()
print(f"Minimum value of column 'A': {min_val}")
print(f"Maximum value of column 'A': {max_val}")

Minimum value of column 'A': 2008-06-08 00:00:00
Maximum value of column 'A': 2016-07-01 00:00:00


In [44]:
min_val = stock['Date'].min()
max_val = stock['Date'].max()

print(f"Minimum value of column 'A': {min_val}")
print(f"Maximum value of column 'A': {max_val}")

Minimum value of column 'A': 2008-08-08 00:00:00
Maximum value of column 'A': 2016-07-01 00:00:00


In [45]:
merged_df = pd.merge(stock, news, on='Date')
merged_df = merged_df.sort_values(by='Date')
merged_df = merged_df.reset_index(drop=True)
merged_df


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,News
0,2008-08-08,11432.08984,11759.95996,11388.04004,11734.32031,212830000,11734.32031,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-08,11432.08984,11759.95996,11388.04004,11734.32031,212830000,11734.32031,"b""Georgia 'downs two Russian warplanes' as cou..."
2,2008-08-08,11432.08984,11759.95996,11388.04004,11734.32031,212830000,11734.32031,b'BREAKING: Musharraf to be impeached.'
3,2008-08-08,11432.08984,11759.95996,11388.04004,11734.32031,212830000,11734.32031,b'Russia Today: Columns of troops roll into So...
4,2008-08-08,11432.08984,11759.95996,11388.04004,11734.32031,212830000,11734.32031,b'Russian tanks are moving towards the capital...
...,...,...,...,...,...,...,...,...
49713,2016-07-01,17924.24023,18002.38086,17916.91016,17949.36914,82160000,17949.36914,"Venezuela, where anger over food shortages is ..."
49714,2016-07-01,17924.24023,18002.38086,17916.91016,17949.36914,82160000,17949.36914,A Hindu temple worker has been killed by three...
49715,2016-07-01,17924.24023,18002.38086,17916.91016,17949.36914,82160000,17949.36914,Ozone layer hole seems to be healing - US &amp...
49716,2016-07-01,17924.24023,18002.38086,17916.91016,17949.36914,82160000,17949.36914,Taiwanese warship accidentally fires missile t...


**# When cleaning text data, some common issues that you may want to address include:**

1. Removing unwanted characters such as special characters or punctuation marks.

2. Converting all text to lowercase or uppercase.

3. Removing stop words such as "the", "and", "a", etc.
4. Removing numbers or digits.

5. Correcting spelling mistakes or abbreviations.

6. Removing URLs or email addresses.

7. Removing HTML tags.

8. Removing leading or trailing whitespaces.

# **When creating a sentiment model, there are some processing steps that you may want to avoid or be cautious about, as they can potentially affect the accuracy and performance of your model.**

Here are a few processing steps you may want to avoid:

1. Stemming or lemmatization: While stemming or lemmatization can reduce the number of unique words in your text data, it can also change the meaning of some words, which can affect the accuracy of your sentiment model. For example, "happy" and "happier" would both be stemmed to "happi", which may not be ideal for sentiment analysis.

2. Removing negation words: Negation words like "not" or "never" can significantly change the sentiment of a sentence. Therefore, it may be better to keep them in your text data rather than removing them. However, you may want to take into account their effect on sentiment in your model by, for example, reversing the polarity of the sentiment of the words that follow them.

3. Removing stopwords: Stopwords are common words like "the", "and", "a", etc., which are often removed to reduce the dimensionality of the text data. However, some stopwords can carry sentiment, such as "not" or "very". Therefore, removing all stopwords may not be ideal for sentiment analysis.

4. Removing rare words: Removing rare words can reduce the complexity of your model, but it can also remove valuable sentiment-related words that may not occur frequently in your text data.

5. Over-sampling or under-sampling: Over-sampling or under-sampling can help balance the class distribution of your data, but it can also introduce bias into your model. Therefore, it's important to be cautious when using these techniques and to evaluate their impact on your model's performance.

6. Removing context: Sentiment analysis relies heavily on context, so removing too much context from your text data can negatively impact the accuracy of your model. Therefore, it's important to consider the context of each text sample when analyzing sentiment.

In [46]:
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
stopwords_to_not_remove = ['not', 'no', "don't", "can't", "won't", "shouldn't", "wouldn't", "mustn't", "haven't", "hasn't", "hadn't", "isn't", "weren't", "wasn't", "aren't", "can", "will", "should", "would", "could", "may", "might", "very", "too", "so"]


**Removing stop words**

In [48]:
merged_df['News'] = merged_df['News'].apply(lambda x: ' '.join([word for word in x.split() if (word not in stop_words) or (word in stopwords_to_not_remove) ]))

# **Special Cases of Punctuation and Symbols**

When performing sentiment analysis, it is generally recommended to keep certain punctuation marks and special characters that may convey important information about the sentiment of a sentence or text. Here are some common punctuation marks and special characters that you may want to keep:

1. Exclamation points (!): These are often used to express strong emotions such as excitement, happiness, or anger.

2. Question marks (?): These indicate a question, which may convey uncertainty or confusion.

3. Emoticons and emojis: These can provide additional context to the text, especially in social media or informal communication.

4. Capitalization: Words that are capitalized can indicate emphasis or importance.

5. Ellipses (...): These can indicate a trailing off or hesitation, which may convey uncertainty or a lack of confidence.

However, it is important to note that the specific set of punctuation marks and special characters to keep or remove may depend on the specific domain or context of your sentiment analysis task. For example, in a formal setting such as legal or academic writing, the use of exclamation points may be rare and the use of ellipses may be more common, and vice versa in a more informal setting such as social media.

Punctuation

In [50]:
punct_to_keep = "!?..."

merged_df['News'] = merged_df['News'].apply(lambda x: "".join([char if (char.isalnum() or char in punct_to_keep or char.isspace()) else "" for char in x]))

Removing the Character "b" occuring at the start of most of the sentences

In [51]:
# remove all rows where the text column starts with 'b'
merged_df = merged_df[~merged_df['News'].str.startswith('b')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,News
11920,2010-07-01,9773.269531,9795.480469,9621.889648,9732.530273,262820000,9732.530273,Palestinian leader gives interview Israel Isra...
11921,2010-07-01,9773.269531,9795.480469,9621.889648,9732.530273,262820000,9732.530273,UK government launches Your Freedom seeks pub...
11922,2010-07-01,9773.269531,9795.480469,9621.889648,9732.530273,262820000,9732.530273,Blair receive US peace medal not Onion
11923,2010-07-01,9773.269531,9795.480469,9621.889648,9732.530273,262820000,9732.530273,May I introduce Germanys new first lady! And y...
11924,2010-07-01,9773.269531,9795.480469,9621.889648,9732.530273,262820000,9732.530273,UK Iraq Whistleblower Was Too Weak To Have Kil...
...,...,...,...,...,...,...,...,...
49713,2016-07-01,17924.240230,18002.380860,17916.910160,17949.369140,82160000,17949.369140,Venezuela anger food shortages still mounting ...
49714,2016-07-01,17924.240230,18002.380860,17916.910160,17949.369140,82160000,17949.369140,A Hindu temple worker killed three men motorcy...
49715,2016-07-01,17924.240230,18002.380860,17916.910160,17949.369140,82160000,17949.369140,Ozone layer hole seems healing US amp UK team...
49716,2016-07-01,17924.240230,18002.380860,17916.910160,17949.369140,82160000,17949.369140,Taiwanese warship accidentally fires missile t...


In [52]:
# !pip install spellchecker

In [53]:
# from spellchecker import SpellChecker

# # create a SpellChecker instance
# spell = SpellChecker()

# # example list of words with spelling mistakes
# words = ['dictionery', 'mispeled', 'accidantally', 'relevant', 'categry', 'wich']

# # find the misspelled words
# misspelled = spell.unknown(words)

# print(misspelled)


**Removing URL and Emails if any**

In [54]:
url_regex = r'http\S+|www.\S+'
email_regex = r'\S+@\S+'

# remove URLs and email addresses from text column using regular expressions
merged_df['News'] = merged_df['News'].str.replace(url_regex, '', regex=True)
merged_df['News'] = merged_df['News'].str.replace(email_regex, '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['News'] = merged_df['News'].str.replace(url_regex, '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['News'] = merged_df['News'].str.replace(email_regex, '', regex=True)


**Removing spaces**

In [55]:
merged_df['News'] = merged_df['News'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['News'] = merged_df['News'].str.strip()


## **Sentiment Calculator**

In [57]:
from textblob import TextBlob

# define a function to calculate sentiment polarity
def calculate_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

# apply the function to a pandas text column
merged_df['sentiment_polarity'] = merged_df['News'].apply(calculate_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['sentiment_polarity'] = merged_df['News'].apply(calculate_sentiment)


In [58]:
# !pip install vaderSentiment

In [59]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# define a function to calculate sentiment scores
def calculate_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores['compound']

# apply the function to a pandas text column
merged_df['sentiment_polarity2'] = merged_df['News'].apply(calculate_sentiment)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['sentiment_polarity2'] = merged_df['News'].apply(calculate_sentiment)


## **Modelling**

There is a general belief that stock prices are influenced by historical prices and trends, as well as other factors such as news, events, and economic indicators. LSTM models, with their ability to capture sequential patterns and long-term dependencies, are well-suited for modeling time-series data like stock prices.

By training on historical stock price data and incorporating additional features such as sentiment analysis of news related to the company, an LSTM model may be able to learn patterns and relationships that can help predict future stock prices. However, it is important to note that stock price prediction is a complex and uncertain task, and even the best models may not always be accurate.

In [60]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


## **Model 1 : With Sentiment parameters**

In [61]:
df = merged_df[['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'sentiment_polarity']]
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_df = scaler.fit_transform(df)

# Split into training and testing data
train_size = int(len(scaled_df) * 0.7)
test_size = len(scaled_df) - train_size
train_data = scaled_df[0:train_size, :]
test_data = scaled_df[train_size:len(scaled_df), :]

# Define function to create input/output sequences for LSTM model
def create_dataset(dataset, lookback=1):
    X, Y = [], []
    for i in range(len(dataset) - lookback - 1):
        a = dataset[i:(i + lookback), :]
        X.append(a)
        Y.append(dataset[i + lookback, 0])
    return np.array(X), np.array(Y)

lookback = 60
X_train, Y_train = create_dataset(train_data, lookback)
X_test, Y_test = create_dataset(test_data, lookback)



# Define LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(lookback, 7)))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# Train LSTM model
model.fit(X_train, Y_train, epochs=10, batch_size=16, verbose=2)

# Test LSTM model
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

Y_train=Y_train.reshape(-1,1)
Y_test=Y_test.reshape(-1,1)


from sklearn.metrics import mean_squared_error

# Assuming Y_pred and Y_true are two columns with shape (26396, 1)
mse = mean_squared_error(Y_train, train_predict)
rms = np.sqrt(mse)

print("RMS value test:", rms)


from sklearn.metrics import mean_squared_error

# Assuming Y_pred and Y_true are two columns with shape (26396, 1)
mse = mean_squared_error(test_predict, Y_test)
rms = np.sqrt(mse)
print("RMS value test:", rms)

Epoch 1/10
1650/1650 - 258s - loss: 8.5479e-04 - 258s/epoch - 156ms/step
Epoch 2/10
1650/1650 - 245s - loss: 3.7243e-04 - 245s/epoch - 149ms/step
Epoch 3/10
1650/1650 - 255s - loss: 2.9010e-04 - 255s/epoch - 155ms/step
Epoch 4/10
1650/1650 - 250s - loss: 2.3658e-04 - 250s/epoch - 152ms/step
Epoch 5/10
1650/1650 - 258s - loss: 1.9529e-04 - 258s/epoch - 157ms/step
Epoch 6/10
1650/1650 - 245s - loss: 1.6404e-04 - 245s/epoch - 149ms/step
Epoch 7/10
1650/1650 - 242s - loss: 1.3959e-04 - 242s/epoch - 147ms/step
Epoch 8/10
1650/1650 - 242s - loss: 1.2901e-04 - 242s/epoch - 147ms/step
Epoch 9/10
1650/1650 - 243s - loss: 1.2204e-04 - 243s/epoch - 147ms/step
Epoch 10/10
1650/1650 - 246s - loss: 1.1119e-04 - 246s/epoch - 149ms/step
RMS value test: 0.010552372891594658
RMS value test: 0.01675762830833294


## **Model 2 : Without Sentiment parameters**

In [66]:
df2 = merged_df[['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']]
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_df = scaler.fit_transform(df2)

# Split into training and testing data
train_size = int(len(scaled_df) * 0.7)
test_size = len(scaled_df) - train_size
train_data = scaled_df[0:train_size, :]
test_data = scaled_df[train_size:len(scaled_df), :]

# Define function to create input/output sequences for LSTM model
def create_dataset(dataset, lookback=1):
    X, Y = [], []
    for i in range(len(dataset) - lookback - 1):
        a = dataset[i:(i + lookback), :]
        X.append(a)
        Y.append(dataset[i + lookback, 0])
    return np.array(X), np.array(Y)

lookback = 60
X_train, Y_train = create_dataset(train_data, lookback)
X_test, Y_test = create_dataset(test_data, lookback)



# Define LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(lookback, 6)))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')


In [67]:
# Train LSTM model
model.fit(X_train, Y_train, epochs=10, batch_size=16, verbose=2)

# Test LSTM model
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

Y_train=Y_train.reshape(-1,1)
Y_test=Y_test.reshape(-1,1)


from sklearn.metrics import mean_squared_error

# Assuming Y_pred and Y_true are two columns with shape (26396, 1)
mse = mean_squared_error(Y_train, train_predict)
rms = np.sqrt(mse)

print("RMS value test:", rms)


from sklearn.metrics import mean_squared_error

# Assuming Y_pred and Y_true are two columns with shape (26396, 1)
mse = mean_squared_error(test_predict, Y_test)
rms = np.sqrt(mse)
print("RMS value test:", rms)

Epoch 1/10
1650/1650 - 273s - loss: 0.0011 - 273s/epoch - 165ms/step
Epoch 2/10
1650/1650 - 261s - loss: 3.6965e-04 - 261s/epoch - 158ms/step
Epoch 3/10
1650/1650 - 261s - loss: 2.9464e-04 - 261s/epoch - 158ms/step
Epoch 4/10
1650/1650 - 262s - loss: 2.3141e-04 - 262s/epoch - 159ms/step
Epoch 5/10
1650/1650 - 263s - loss: 1.8801e-04 - 263s/epoch - 159ms/step
Epoch 6/10
1650/1650 - 260s - loss: 1.6222e-04 - 260s/epoch - 157ms/step
Epoch 7/10
1650/1650 - 261s - loss: 1.4394e-04 - 261s/epoch - 158ms/step
Epoch 8/10
1650/1650 - 252s - loss: 1.2468e-04 - 252s/epoch - 153ms/step
Epoch 9/10
1650/1650 - 260s - loss: 1.2120e-04 - 260s/epoch - 158ms/step
Epoch 10/10
1650/1650 - 265s - loss: 1.1364e-04 - 265s/epoch - 161ms/step
RMS value test: 0.0063197766947066265
RMS value test: 0.01579062033078758


In [68]:
# Invert scaling to get actual prices
# train_predict = scaler.inverse_transform(train_predict)
# Y_train = scaler.inverse_transform([Y_train])
# test_predict = scaler.inverse_transform(test_predict)
# Y_test = scaler.inverse_transform([Y_test])

# Evaluate LSTM model
# train_score = np.sqrt(mean_squared_error(Y_train[0], train_predict[:, 0]))
# test_score = np.sqrt(mean_squared_error(Y_test[0], test_predict[:, 0]))
# print('Train Score: %.2f RMSE' % train_score)
# print('Test Score: %.2f RMSE' % test_score)