### Import Libraries

In [1]:
import re
import numpy as np
import pandas as pd
import spacy
import string
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split



In [2]:
#lemmatization
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
replace /usr/share/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [3]:
df = pd.read_csv("/kaggle/input/financial-sentiment-analysis/data.csv")
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [4]:
df.shape

(5842, 2)

In [5]:
df.isnull().sum()

Sentence     0
Sentiment    0
dtype: int64

In [6]:
label_encoder=preprocessing.LabelEncoder()
df['Sentiment']=label_encoder.fit_transform(df["Sentiment"])
df[['Sentiment']]

Unnamed: 0,Sentiment
0,2
1,0
2,2
3,1
4,1
...,...
5837,0
5838,1
5839,1
5840,1


### Text Preprocessing

In [7]:
from string import punctuation, digits
import re
replacements = {"they're": "they are","she's":"she is","he's":"he is","doesn't":"does not","it's": "it is","that's": "that is","we're": "we are","you're":"you are","i'm":"i am","don't":"do not","what's":"what is"}
def preprocess(text): 
    text = re.compile(r'https?://\S+|www\.\S+').sub('', text)#links removal
    text = text.lower() #lowercase removal
    text = re.sub(r'\d','',text) #digit removal
    text = re.compile(r'\s+').sub(' ',text) #white space removal
#     text=re.compile(r'\b[a-zA-Z]+\b').sub('',text)
    for old, new in replacements.items():
        text = text.replace(old,new)
    text=re.compile(r'[^\w\s?.,]').sub('',text)#remove all punctuation except  . , ? 
    return text

In [8]:
df['Sentence'] = df['Sentence'].apply(preprocess) # calling the preprocess for cleaning the data

In [9]:
df['Sentence']

0       the geosolutions technology will leverage bene...
1          esi on lows, down . to . bk a real possibility
2       for the last quarter of , componenta s net sal...
3       according to the finnishrussian chamber of com...
4       the swedish buyout firm has sold its remaining...
                              ...                        
5837    rising costs have forced packaging producer hu...
5838    nordic walking was first used as a summer trai...
5839    according shipping company viking line , the e...
5840    in the building and home improvement trade , s...
5841    helsinki afx  kci konecranes said it has won a...
Name: Sentence, Length: 5842, dtype: object

#### TOKENIZATION
 

In [10]:
from nltk import sent_tokenize, word_tokenize

sentences = df['Sentence'].tolist()
list = [word_tokenize(sentence) for sentence in sentences]


# IF-IDF


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfModel = TfidfVectorizer()
tfidfModel.fit_transform(df['Sentence']).todense()
# tfidfModel.vocabulary_

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
tfidf_df = pd.DataFrame(tfidfModel.fit_transform(df['Sentence']).todense())
tfidf_df.columns = sorted(tfidfModel.vocabulary_)
tfidf_df

Unnamed: 0,aa,aal,aaland,aalto,aaltonen,aapl,aaron,aava,aazhang,ab,...,zurich,zxx,àkersberga,àland,àlandsbanken,àmñl,ál,áá,âm,äñnekoski
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Split the data

In [13]:
x=tfidf_df
y=df[['Sentiment']]

In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=20)

### Model train

In [15]:
from xgboost import XGBClassifier
model = XGBClassifier(learning_rate=0.01,max_depth=4,gamma=0.1)
model.fit(x_train, y_train)
model.score(x_test,y_test)*100 

62.35025670279521

In [16]:
pred = model.predict(x_test)
pred

array([1, 2, 1, ..., 1, 0, 1])

In [17]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
error = mean_absolute_error(pred,y_test)*100
error#error


41.985168282943526

# Word-Embedding

In [18]:
import pandas as pd
import gensim

In [19]:
modelw = gensim.models.Word2Vec(
    window=5, # it takes 5 words before and 5 words ahead for training
    min_count=3, # consider at least 3 words as sentence
    workers=4, # cpu threads can be used 
)

In [20]:
modelw.build_vocab(list,
                  progress_per=500) # Indicates how many words to process before showing/updating the progress

In [21]:
vocabulary_wm=modelw.wv.key_to_index
# vocabulary_wm

In [22]:
loveidx = modelw.wv.key_to_index["love"]
loveidx

1951

In [23]:
modelw.save("finacial_word_embedding.model")
modelw

<gensim.models.word2vec.Word2Vec at 0x7f52ae5fd150>

In [24]:
# define default vector function
def default_vector():
    return np.zeros(100)

In [25]:
vectors = [modelw.wv[word] if word in modelw.wv else default_vector() for word in df['Sentence']]


In [26]:
x=vectors
y=df[['Sentiment']]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=20)

In [27]:
from xgboost import XGBClassifier
modelwe = XGBClassifier(learning_rate=0.01,max_depth=4,gamma=0.1)
modelwe.fit(x_train, y_train)

In [28]:
modelwe.score(x_test,y_test)*100 

53.45122646891044

### Similar words

In [29]:
modelw.wv.most_similar("idea")

[('agreement', 0.33862677216529846),
 ('preparation', 0.32011640071868896),
 ('group', 0.31751748919487),
 ('ctrp', 0.3138306736946106),
 ('trucks', 0.31173962354660034),
 ('stage', 0.3072405159473419),
 ('amazon', 0.2849745452404022),
 ('introduced', 0.2807348370552063),
 ('highquality', 0.2792782485485077),
 ('usdm', 0.2785740792751312)]

In [30]:
modelw.wv.most_similar("loss")

[('md', 0.36235547065734863),
 ('neutral', 0.31857889890670776),
 ('attractive', 0.29849889874458313),
 ('combining', 0.2910948693752289),
 ('markets', 0.28621193766593933),
 ('safety', 0.2854600250720978),
 ('proposed', 0.28101083636283875),
 ('hollola', 0.27684640884399414),
 ('invite', 0.2721647620201111),
 ('yield', 0.27205002307891846)]

In [31]:
modelw.wv.most_similar("environment")

[('left', 0.3781757652759552),
 ('computer', 0.3010169267654419),
 ('traffic', 0.29818445444107056),
 ('iittala', 0.29473623633384705),
 ('streamlining', 0.29320284724235535),
 ('what', 0.2905164361000061),
 ('return', 0.28365930914878845),
 ('pori', 0.2814423143863678),
 ('lithuanian', 0.27999362349510193),
 ('mall', 0.27742883563041687)]

In [32]:
modelw.wv.most_similar("agreement")

[('usdm', 0.3656705617904663),
 ('idea', 0.3386267423629761),
 ('locations', 0.30230605602264404),
 ('upgrades', 0.2977310121059418),
 ('came', 0.29006099700927734),
 ('exposure', 0.2882782518863678),
 ('need', 0.28547927737236023),
 ('poor', 0.28400370478630066),
 ('present', 0.2824138402938843),
 ('next', 0.27831143140792847)]

In [33]:
modelw.wv.most_similar("comparable")

[('expert', 0.3922279477119446),
 ('frost', 0.37223923206329346),
 ('august', 0.34374096989631653),
 ('directed', 0.33894771337509155),
 ('residential', 0.33276766538619995),
 ('launch', 0.3118973672389984),
 ('effect', 0.3063986897468567),
 ('immediate', 0.30302029848098755),
 ('esl', 0.3015299141407013),
 ('any', 0.2965189218521118)]

# BAG-OF-WORDS

In [43]:
bow_model = CountVectorizer()

In [46]:
bow_df = pd.DataFrame(bow_model.fit_transform(sentences).todense())
bow_df.columns = sorted(bow_model.vocabulary_)
bow_df

Unnamed: 0,aa,aal,aaland,aalto,aaltonen,aapl,aaron,aava,aazhang,ab,...,zurich,zxx,àkersberga,àland,àlandsbanken,àmñl,ál,áá,âm,äñnekoski
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5838,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5839,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
df.columns

Index(['Sentence', 'Sentiment', 'predicted_labels'], dtype='object')

In [48]:
#split the data into x and y
x=bow_df
y=df[['Sentiment']]

In [49]:
from sklearn import tree
clf= tree.DecisionTreeClassifier()
clf= clf.fit(x,y)
predicted_labels = clf.predict(x)


In [50]:
clf.predict_proba(bow_df)[:,1]

array([0. , 0. , 0. , ..., 1. , 0.5, 0. ])

In [51]:
df['predicted_labels'] = predicted_labels

In [52]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(df['Sentiment'], df['predicted_labels'])*100
print("Accuracy:", accuracy)

Accuracy: 91.18452584731257


In [53]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
error = mean_absolute_error(df['Sentiment'],df['predicted_labels'])*100
error#error


8.832591578226635