In [1]:
#--predict stock market increase/decrease based on news headlines
#--data reading
#--data preprocessing within dataframe
#--prepare test and train
#--train model
#--evalute the trained model

In [2]:
#--import drive to mount the dataset

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
#--import required libraries

import pandas as pd
import re

In [4]:
#--download nltk libraries
import nltk

# nltk.download()

In [19]:
#--import nltk packages
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [5]:
#--read dataset
data_path="/content/gdrive/MyDrive/Learning_AI/NLP/dataset/stock_price_sentiment_data.csv"
data=pd.read_csv(data_path,encoding='ISO-8859-1')
data.head(2)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title
1,2000-01-04,0,Scorecard,The best lake scene,Leader: German sleaze inquiry,"Cheerio, boyo",The main recommendations,Has Cubie killed fees?,Has Cubie killed fees?,Has Cubie killed fees?,...,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man's extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn't know without the ...,Millennium bug fails to bite


In [6]:
#--first column is data
#--second column is class label
#--columns 3-27 are headlines

In [9]:
interest_col_list=list(data.columns[2:])
# interest_col_list

In [10]:
#--text preprocessing

#--for a given dataframe and list of columns, apply general text-preprocessing
#--general text preprocessing
#---remove unwanted words
#---lower the sentences
#---remove stop wards

def df_text_preprocessing(df,col_list):

  '''
  df: input dataframe
  col_list: list of columns on which preprocessing needed
  df: output dataframe with modified text
  '''
  df[col_list] = df[col_list].apply(lambda x: x.str.lower())

  for column in col_list:
    df[column].replace('[^a-zA-Z]', ' ', regex=True, inplace=True)

  return df

In [11]:
df=df_text_preprocessing(data,interest_col_list)
df.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,a hindrance to operations extracts from the...,scorecard,hughes instant hit buoys blues,jack gets his skates on at ice cold alex,chaos as maracana builds up for united,depleted leicester prevail as elliott spoils e...,hungry spurs sense rich pickings,gunners so wide of an easy target,...,flintoff injury piles on woe for england,hunters threaten jospin with new battle of the...,kohl s successor drawn into scandal,the difference between men and women,sara denver nurse turned solicitor,diana s landmine crusade put tories in a panic,yeltsin s resignation caught opposition flat f...,russian roulette,sold out,recovering a title
1,2000-01-04,0,scorecard,the best lake scene,leader german sleaze inquiry,cheerio boyo,the main recommendations,has cubie killed fees,has cubie killed fees,has cubie killed fees,...,on the critical list,the timing of their lives,dear doctor,irish court halts ira man s extradition to nor...,burundi peace initiative fades after rebels re...,pe points the way forward to the ecb,campaigners keep up pressure on nazi war crime...,jane ratcliffe,yet more things you wouldn t know without the ...,millennium bug fails to bite
2,2000-01-05,0,coventry caught on counter by flo,united s rivals on the road to rio,thatcher issues defence before trial by video,police help smith lay down the law at everton,tale of trautmann bears two more retellings,england on the rack,pakistan retaliate with call for video of walsh,cullinan continues his cape monopoly,...,south melbourne australia,necaxa mexico,real madrid spain,raja casablanca morocco,corinthians brazil,tony s pet project,al nassr saudi arabia,ideal holmes show,pinochet leaves hospital after tests,useful links
3,2000-01-06,1,pilgrim knows how to progress,thatcher facing ban,mcilroy calls for irish fighting spirit,leicester bin stadium blueprint,united braced for mexican wave,auntie back in fashion even if the dress look...,shoaib appeal goes to the top,hussain hurt by shambles but lays blame on e...,...,putin admits yeltsin quit to give him a head s...,bbc worst hit as digital tv begins to bite,how much can you pay for,christmas glitches,upending a table chopping a line and scoring ...,scientific evidence unreliable defence claims,fusco wins judicial review in extradition case,rebels thwart russian advance,blair orders shake up of failing nhs,lessons of law s hard heart
4,2000-01-07,1,hitches and horlocks,beckham off but united survive,breast cancer screening,alan parker,guardian readers are you all whingers,hollywood beyond,ashes and diamonds,whingers a formidable minority,...,most everywhere udis,most wanted chloe lunettes,return of the cane completely off the agenda,from sleepy hollow to greeneland,blunkett outlines vision for over s,embattled dobson attacks play now pay later ...,doom and the dome,what is the north south divide,aitken released from jail,gone aloft


In [13]:
#--split data into train and test
df["Date"] = pd.to_datetime(df["Date"], format="%Y/%m/%d")
train_df=df[df["Date"].dt.year<2011]
test_df=df[df["Date"].dt.year>2011]

In [17]:
#--prepare single text dataset for train dataset
train_headlines=[]

for row in range(0,len(train_df.index)):
  train_headlines.append(' '.join(str(x) for x in train_df.iloc[row,2:27]))
train_headlines[0]

'a  hindrance to operations   extracts from the leaked reports scorecard hughes  instant hit buoys blues jack gets his skates on at ice cold alex chaos as maracana builds up for united depleted leicester prevail as elliott spoils everton s party hungry spurs sense rich pickings gunners so wide of an easy target derby raise a glass to strupar s debut double southgate strikes  leeds pay the penalty hammers hand robson a youthful lesson saints party like it s      wear wolves have turned into lambs stump mike catches testy gough s taunt langer escapes to hit     flintoff injury piles on woe for england hunters threaten jospin with new battle of the somme kohl s successor drawn into scandal the difference between men and women sara denver  nurse turned solicitor diana s landmine crusade put tories in a panic yeltsin s resignation caught opposition flat footed russian roulette sold out recovering a title'

In [18]:
#--inference with testdata
test_headlines=[]

for row in range(0,len(test_df.index)):
  test_headlines.append(' '.join(str(x) for x in test_df.iloc[row,2:27]))
test_headlines[0]



In [20]:
#--initialize model
countvector=CountVectorizer(ngram_range=(2,2))

traindataset=countvector.fit_transform(train_headlines) #--train headlines vectorization
testdataset=countvector.transform(test_headlines) #--test headlines vectorization


In [21]:
#--random classifier
model=RandomForestClassifier(n_estimators=200,criterion='entropy')
model.fit(traindataset,train_df['Label'])

In [22]:
#--perform inference on test dataset
y_pred=model.predict(testdataset)

In [23]:
#--evaluate the model performance on test dataset
from sklearn.metrics import accuracy_score
acc=accuracy_score(test_df['Label'],y_pred)

In [24]:
print("accuracy :",acc)

accuracy : 0.5176678445229682
