## Author @ Mahmoud Saeed


# import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import datetime


# import numeric and textual Data

In [95]:
# stock data
num_data = pd.read_csv("TCS.NS_Stock.csv")
# headlines news data
txt_data = pd.read_csv("india-news-headlines.csv")
txt_data.rename(columns={'publish_date': 'Date'},inplace = True)
txt_data['Date'] = [datetime.datetime.strptime(str(txt_data['Date'][i]),
                                               '%Y%m%d') for i in range(txt_data.shape[0])]
num_data['Date'] = pd.to_datetime(num_data.Date,format='%d-%m-%Y')

# merge both data together
merge_data = pd.merge(num_data,txt_data, 
                   on='Date', 
                   how ='inner')

In [21]:
num_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,01-01-2003,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0
1,02-01-2003,61.525002,62.924999,57.912498,58.299999,43.443855,5325328.0
2,03-01-2003,60.0,61.049999,58.5,59.012501,43.974789,4198040.0
3,06-01-2003,59.924999,60.1875,56.875,57.262501,42.670746,4121520.0
4,07-01-2003,58.0,58.5,56.0625,56.599998,42.17704,2650800.0


In [6]:
txt_data.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [79]:
txt_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650970 entries, 0 to 3650969
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   Date               int64 
 1   headline_category  object
 2   headline_text      object
dtypes: int64(1), object(2)
memory usage: 83.6+ MB


In [80]:
num_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4476 entries, 0 to 4475
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       4476 non-null   object 
 1   Open       4460 non-null   float64
 2   High       4460 non-null   float64
 3   Low        4460 non-null   float64
 4   Close      4460 non-null   float64
 5   Adj Close  4460 non-null   float64
 6   Volume     4460 non-null   float64
dtypes: float64(6), object(1)
memory usage: 244.9+ KB


In [97]:
merge_data.shape

(2230896, 9)

In [22]:
import string
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.append("and")
stop_words

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [23]:
def clean_data(tweet):
    remove_punc = [t for t in tweet if t not in string.punctuation]
    remove_punc = ''.join(remove_punc)
#     return remove_punc
    remove_punc_stopWords = [t for t in remove_punc.split() if t.lower() not in stop_words]
    remove_punc_stopWords = ' '.join(remove_punc_stopWords)
    return remove_punc_stopWords

In [98]:
merge_data['headline_text'] = merge_data['headline_text'].apply(clean_data)

In [99]:
merge_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,headline_category,headline_text
0,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Chastened BCC members decide mend ways
1,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,New look slums govt start work
2,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Global spiritual meet begin Jan 16
3,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Parameshwara allays fears scrapping CET
4,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Amusement park opens today


In [13]:
merge_data.groupby(['publish_date'])['headline_text'].count()

publish_date
20010102     86
20010103     41
20010104    153
20010105    126
20010106     12
           ... 
20220327    500
20220328    500
20220329    500
20220330    500
20220331    498
Name: headline_text, Length: 7717, dtype: int64

In [4]:
def get_polarity(txt):
    res = TextBlob(txt)
    if round(res.sentiment.polarity , 2) > 0:
        return 'positive'
    elif round(res.sentiment.polarity , 2) < 0:
        return 'negative'
    else:
        return 'nuetral'

In [24]:
merge_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2230896 entries, 0 to 2230895
Data columns (total 15 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Date               object 
 1   Open               float64
 2   High               float64
 3   Low                float64
 4   Close              float64
 5   Adj Close          float64
 6   Volume             float64
 7   headline_category  object 
 8   headline_text      object 
 9   Subjectivity       float64
 10  Polarity           float64
 11  Compound           float64
 12  Negative           float64
 13  Neutral            float64
 14  Positive           float64
dtypes: float64(12), object(3)
memory usage: 255.3+ MB


In [38]:
def get_subjectivity(text):
  return TextBlob(text).sentiment.subjectivity

def get_polarity(text):
  return  TextBlob(text).sentiment.polarity

In [100]:
merge_data['Subjectivity'] =merge_data['headline_text'].apply(get_subjectivity)
merge_data['Polarity'] = merge_data['headline_text'].apply(get_polarity)
txt_data.head()

Unnamed: 0,Date,headline_category,headline_text
0,2001-01-02,unknown,Status quo will not be disturbed at Ayodhya; s...
1,2001-01-02,unknown,Fissures in Hurriyat over Pak visit
2,2001-01-02,unknown,America's unwanted heading for India?
3,2001-01-02,unknown,For bigwigs; it is destination Goa
4,2001-01-02,unknown,Extra buses to clear tourist traffic


In [101]:
sent_analysis = SentimentIntensityAnalyzer()

merge_data['Compound'] = [sent_analysis.polarity_scores(v)['compound'] for v in merge_data['headline_text']]
merge_data['Negative'] = [sent_analysis.polarity_scores(v)['neg'] for v in merge_data['headline_text']]
merge_data['Neutral'] = [sent_analysis.polarity_scores(v)['neu'] for v in merge_data['headline_text']]
merge_data['Positive'] = [sent_analysis.polarity_scores(v)['pos'] for v in merge_data['headline_text']]
merge_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,headline_category,headline_text,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Chastened BCC members decide mend ways,0.0,0.0,0.0,0.0,1.0,0.0
1,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,New look slums govt start work,0.454545,0.136364,0.0,0.0,1.0,0.0
2,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Global spiritual meet begin Jan 16,0.066667,0.0,0.0,0.0,1.0,0.0
3,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Parameshwara allays fears scrapping CET,0.0,0.0,-0.4215,0.412,0.588,0.0
4,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Amusement park opens today,0.0,0.0,0.3612,0.0,0.545,0.455


In [30]:
merge_data.dropna(inplace =True)
merge_data.drop_duplicates(inplace = True)  
    
merge_data['sentiment'] = [get_polarity(i) for i in merge_data['headline_text']]
merge_data['sentiment']

0           nuetral
1          positive
2           nuetral
3           nuetral
4           nuetral
             ...   
2230891    positive
2230892    positive
2230893    positive
2230894     nuetral
2230895     nuetral
Name: sentiment, Length: 2206381, dtype: object

In [31]:
# dummy sentiment data
dummy_data = pd.get_dummies(merge_data['sentiment'],drop_first=True)
merge_data = pd.concat([merge_data,dummy_data],axis=1)
merge_data.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,headline_category,headline_text,Subjectivity,Polarity,Compound,Negative,Neutral,Positive,sentiment,nuetral,positive
0,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Chastened BCC members decide mend ways,0.0,0.0,0.0,0.0,1.0,0.0,nuetral,1,0
1,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,New look slums govt start work,0.454545,0.136364,0.0,0.0,1.0,0.0,positive,0,1
2,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Global spiritual meet begin Jan 16,0.066667,0.0,0.0,0.0,1.0,0.0,nuetral,1,0
3,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Parameshwara allays fears scrapping CET,0.0,0.0,-0.4215,0.412,0.588,0.0,nuetral,1,0
4,2003-01-01,59.987499,61.974998,59.987499,61.087502,45.521038,6027072.0,city.bengaluru,Amusement park opens today,0.0,0.0,0.3612,0.0,0.545,0.455,nuetral,1,0


In [32]:
merge_data.groupby(['sentiment'])['sentiment'].count()

sentiment
negative     248576
nuetral     1580161
positive     377644
Name: sentiment, dtype: int64

In [50]:
col = [ 'Open',  'High', 'Low','Volume','Compound', 'Negative', 'Neutral' ,'Positive','positive']
new_merge_data = merge_data[col]

In [51]:
col = [ 'Open',  'High', 'Low','Volume','Compound', 'Negative', 'Neutral' ,'Positive','label']
new_merge_data.columns = col

# Train an Test model

In [52]:
X = new_merge_data.drop('label' , axis = 1)
y = new_merge_data['label']

In [53]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

In [54]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(x_train,y_train)
y_rf_pred = rf.predict(x_test)
accuracy_score(y_test,y_rf_pred)

0.8468987709902329

In [49]:
x_train.shape , x_test.shape

((1544466, 10), (661915, 10))