# Stock Sentiment Analysis using New Headlines

### Importing Libraries

In [1]:
import pandas as pd

### Reading and exploring the dataset

In [51]:
dataset = pd.read_csv('dataset.csv', encoding='ISO-8859-1')
dataset.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title
1,2000-01-04,0,Scorecard,The best lake scene,Leader: German sleaze inquiry,"Cheerio, boyo",The main recommendations,Has Cubie killed fees?,Has Cubie killed fees?,Has Cubie killed fees?,...,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man's extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn't know without the ...,Millennium bug fails to bite
2,2000-01-05,0,Coventry caught on counter by Flo,United's rivals on the road to Rio,Thatcher issues defence before trial by video,Police help Smith lay down the law at Everton,Tale of Trautmann bears two more retellings,England on the rack,Pakistan retaliate with call for video of Walsh,Cullinan continues his Cape monopoly,...,South Melbourne (Australia),Necaxa (Mexico),Real Madrid (Spain),Raja Casablanca (Morocco),Corinthians (Brazil),Tony's pet project,Al Nassr (Saudi Arabia),Ideal Holmes show,Pinochet leaves hospital after tests,Useful links


In [7]:
dataset.columns

Index(['Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7',
       'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15',
       'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23',
       'Top24', 'Top25'],
      dtype='object')

In [8]:
dataset.isnull().sum()

Date     0
Label    0
Top1     0
Top2     0
Top3     0
Top4     0
Top5     0
Top6     0
Top7     0
Top8     0
Top9     0
Top10    0
Top11    0
Top12    0
Top13    0
Top14    0
Top15    0
Top16    0
Top17    0
Top18    0
Top19    0
Top20    0
Top21    0
Top22    0
Top23    1
Top24    3
Top25    3
dtype: int64

In [9]:
dataset.shape

(4101, 27)

### Splitting train and test dataset

In [14]:
train = dataset[dataset['Date']<'20150101']
test = dataset[dataset['Date']>'20141231']
print("Training data shape: " + str(train.shape))
print("Testing data shape: " + str(test.shape))

Training data shape: (3975, 27)
Testing data shape: (378, 27)


### Feature Engineering 

In [20]:
# Removing puntuations which is not required for the sentimental analysis
data = train.iloc[:, 2:27]
data.replace("[^a-zA-Z]", " ", regex=True, inplace=True)

# Renaming column names for ease of access
thisList = [i for i in range(25)]
new_Index = [str(i) for i in thisList]
data.columns = new_Index
data.head(1)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,A hindrance to operations extracts from the...,Scorecard,Hughes instant hit buoys Blues,Jack gets his skates on at ice cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,Derby raise a glass to Strupar s debut double,Southgate strikes Leeds pay the penalty,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl s successor drawn into scandal,The difference between men and women,Sara Denver nurse turned solicitor,Diana s landmine crusade put Tories in a panic,Yeltsin s resignation caught opposition flat f...,Russian roulette,Sold out,Recovering a title


In [22]:
# Converting headlines to Lowercase
for index in new_Index:
    data[index] = data[index].str.lower()
data.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,a hindrance to operations extracts from the...,scorecard,hughes instant hit buoys blues,jack gets his skates on at ice cold alex,chaos as maracana builds up for united,depleted leicester prevail as elliott spoils e...,hungry spurs sense rich pickings,gunners so wide of an easy target,derby raise a glass to strupar s debut double,southgate strikes leeds pay the penalty,...,flintoff injury piles on woe for england,hunters threaten jospin with new battle of the...,kohl s successor drawn into scandal,the difference between men and women,sara denver nurse turned solicitor,diana s landmine crusade put tories in a panic,yeltsin s resignation caught opposition flat f...,russian roulette,sold out,recovering a title
1,scorecard,the best lake scene,leader german sleaze inquiry,cheerio boyo,the main recommendations,has cubie killed fees,has cubie killed fees,has cubie killed fees,hopkins furious at foster s lack of hannibal...,has cubie killed fees,...,on the critical list,the timing of their lives,dear doctor,irish court halts ira man s extradition to nor...,burundi peace initiative fades after rebels re...,pe points the way forward to the ecb,campaigners keep up pressure on nazi war crime...,jane ratcliffe,yet more things you wouldn t know without the ...,millennium bug fails to bite


In [36]:
# Joining all the sentences of a single paragarph for every record
headlines = []
for row in range(len(data.index)):
    headlines.append(" ".join(str(x) for x in data.iloc[row, 0:25]))

# checking
headlines[0]

'a  hindrance to operations   extracts from the leaked reports scorecard hughes  instant hit buoys blues jack gets his skates on at ice cold alex chaos as maracana builds up for united depleted leicester prevail as elliott spoils everton s party hungry spurs sense rich pickings gunners so wide of an easy target derby raise a glass to strupar s debut double southgate strikes  leeds pay the penalty hammers hand robson a youthful lesson saints party like it s      wear wolves have turned into lambs stump mike catches testy gough s taunt langer escapes to hit     flintoff injury piles on woe for england hunters threaten jospin with new battle of the somme kohl s successor drawn into scandal the difference between men and women sara denver  nurse turned solicitor diana s landmine crusade put tories in a panic yeltsin s resignation caught opposition flat footed russian roulette sold out recovering a title'

### Implement Bag Of Words (BoW)

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# ngram_range represnts the (minimum_number_of_words, maximum_number_of_words)
countVector = CountVectorizer(ngram_range=(2,2))

# making the training dataset after applying BoW
trainDataset = countVector.fit_transform(headlines)


### Implement RandomForestClassifier

In [54]:
# Performing the fit
randomClassifier = RandomForestClassifier(n_estimators=200, criterion='entropy')
randomClassifier.fit(trainDataset, train['Label'])


RandomForestClassifier(criterion='entropy', n_estimators=200)

### Predict for the test Dataset

In [55]:
test_transform = []
for row in range (0, len(test.index)):
    test_transform.append(" ".join(str(x) for x in test.iloc[row, 2:27]))
    
test_dataset = countVector.transform(test_transform)
predictions = randomClassifier.predict(test_dataset)


### Creating Confusion matrix for testing purposes

In [56]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

cm = confusion_matrix(test['Label'], predictions)
print("This is a confusion matrix: ")
print(cm)

acc_score = accuracy_score(test["Label"], predictions)
print("Accuracy : ", round((acc_score*100),2), "%")

report = classification_report(test["Label"], predictions)
print("This is the report: ")
print(report)

This is a confusion matrix: 
[[144  42]
 [  9 183]]
Accuracy :  86.51 %
This is the report: 
              precision    recall  f1-score   support

           0       0.94      0.77      0.85       186
           1       0.81      0.95      0.88       192

    accuracy                           0.87       378
   macro avg       0.88      0.86      0.86       378
weighted avg       0.88      0.87      0.86       378



In [50]:
# Label = 1, means stock price will increase
# Label = 2, means stock price will decrease
