In [54]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import string
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [19]:
data = pd.read_csv("financial sentiment analysis.csv")

# Data Understanding

In [20]:
data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [21]:
data.shape

(5842, 2)

In [22]:
data.sample(10)

Unnamed: 0,Sentence,Sentiment
259,Pretax loss totaled EUR 117mn compared to a lo...,neutral
3484,"Also , Technopolis plans to build a 100 millio...",neutral
4136,Severn Trent share price jumps as Canadian inv...,positive
5690,The outsourced Scan and Capture solutions tran...,neutral
2423,"The dividend will be paid on April 15 , 2008 t...",neutral
2910,The financial details of the transaction were ...,neutral
3281,Tampere Science Parks is a Finnish company tha...,neutral
3708,These savings will have full impact as of the ...,neutral
2150,Export accounts for about one tenth of the com...,neutral
5492,Operating profit fell to EUR 23.26 mn from EUR...,neutral


In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB


In [24]:
data.isnull().mean()*100

Sentence     0.0
Sentiment    0.0
dtype: float64

# Data Cleaning / Preprocessing

In [80]:
data["Sentence"][0]

"the geosolut technolog leverag benefon 's gp solut provid locat base search technolog commun platform locat relev multimedia content new power commerci model"

In [26]:
data["Sentiment"].value_counts()

neutral     3130
positive    1852
negative     860
Name: Sentiment, dtype: int64

In [28]:
#TOKENISATION AND REMOVING PUNCTUATIONS

def preprocessing1(text):
    tokens = word_tokenize(text)
    
    punctuations = string.punctuation
    txt = []
    for i in tokens:
        if i not in punctuations:
            txt.append(i)
    return txt

In [29]:
data["Sentence"] = data["Sentence"].apply(preprocessing1)

In [30]:
data["Sentence"][0]

['The',
 'GeoSolutions',
 'technology',
 'will',
 'leverage',
 'Benefon',
 "'s",
 'GPS',
 'solutions',
 'by',
 'providing',
 'Location',
 'Based',
 'Search',
 'Technology',
 'a',
 'Communities',
 'Platform',
 'location',
 'relevant',
 'multimedia',
 'content',
 'and',
 'a',
 'new',
 'and',
 'powerful',
 'commercial',
 'model']

In [31]:
#REMOVING STOPWORDS & CONVERTING TO LOWERCASE

def preprocessing2(text):
    stopword = stopwords.words("english")
    ntxt = []
    for j in text:
        if j not in stopword:
            ntxt.append(j.lower())
    return ntxt

In [32]:
data["Sentence"] = data["Sentence"].apply(preprocessing2)

In [33]:
data["Sentence"][0]

['the',
 'geosolutions',
 'technology',
 'leverage',
 'benefon',
 "'s",
 'gps',
 'solutions',
 'providing',
 'location',
 'based',
 'search',
 'technology',
 'communities',
 'platform',
 'location',
 'relevant',
 'multimedia',
 'content',
 'new',
 'powerful',
 'commercial',
 'model']

In [34]:
#STEMMING

def preprocessing3(text):
    stem = []
    Stemmer = PorterStemmer()
    for i in text:
        stem.append(Stemmer.stem(i))
    return " ".join(stem)

In [35]:
data["Sentence"] = data["Sentence"].apply(preprocessing3)

In [36]:
data["Sentence"][0]

"the geosolut technolog leverag benefon 's gp solut provid locat base search technolog commun platform locat relev multimedia content new power commerci model"

In [37]:
data.head(10)

Unnamed: 0,Sentence,Sentiment
0,the geosolut technolog leverag benefon 's gp s...,positive
1,esi low 1.50 2.50 bk real possibl,negative
2,for last quarter 2010 componenta 's net sale d...,positive
3,accord finnish-russian chamber commerc major c...,neutral
4,the swedish buyout firm sold remain 22.4 perce...,neutral
5,spi would n't surpris see green close,positive
6,shell 's 70 billion bg deal meet sharehold ske...,negative
7,ssh commun secur corp stock exchang releas oct...,negative
8,kone 's net sale rose 14 year-on-year first ni...,positive
9,the stockmann depart store total floor space 8...,neutral


In [38]:
data.isna().sum()

Sentence     0
Sentiment    0
dtype: int64

# MODEL BUILDING

In [39]:
x = data["Sentence"]
y = data["Sentiment"]

In [40]:
le = LabelEncoder()
y = le.fit_transform(y)

In [42]:
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(x).toarray()

In [43]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=None)

**Logistic Regression**

In [45]:
logr = LogisticRegression()
model = logr.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

69.88879384088965

In [50]:
score = []
for i in range(20):
    x = data["Sentence"]
    y = data["Sentiment"]
    x = tfidf.fit_transform(x).toarray()
    y = le.fit_transform(y)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    logr = LogisticRegression()
    logr.fit(x_train, y_train)
    pred = logr.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

i=  11
71.08639863130881


In [51]:
logr = LogisticRegression()
x = data["Sentence"]
y = data["Sentiment"]
x = tfidf.fit_transform(x).toarray()
y = le.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model1 = logr.fit(x_train,y_train)
pred = model1.predict(x_test)
accuracy_score(y_test, pred)*100

71.08639863130881

In [52]:
# Training Data Accuracy

pred = model1.predict(x_train)
accuracy_score(y_train, pred)*100     #Generalized

83.60796062486625

In [56]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod1 = cross_val_score(LogisticRegression(), x_train, y_train, cv=10)
print(mod1)
print(np.round(np.mean(mod1),2)*100)

[0.70512821 0.6965812  0.67307692 0.69164882 0.68522484 0.67880086
 0.70877944 0.68094218 0.67665953 0.70235546]
69.0


**Decision Tree Classifier**

In [57]:
dtc = DecisionTreeClassifier()
model = dtc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

59.53806672369547

In [58]:
score = []
for i in range(50):
    x = data["Sentence"]
    y = data["Sentiment"]
    x = tfidf.fit_transform(x).toarray()
    y = le.fit_transform(y)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    dtc = DecisionTreeClassifier()
    dtc.fit(x_train, y_train)
    pred = dtc.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  16
62.01881950384944


In [59]:
dtc = DecisionTreeClassifier()
x = data["Sentence"]
y = data["Sentiment"]
x = tfidf.fit_transform(x).toarray()
y = le.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model2 = dtc.fit(x_train,y_train)
pred = model2.predict(x_test)
accuracy_score(y_test, pred)*100

61.24893071000855

In [60]:
# Training Data Accuracy

pred = model2.predict(x_train)
accuracy_score(y_train, pred)*100     #Overfitting

93.06655253584421

In [61]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod2 = cross_val_score(DecisionTreeClassifier(), x_train, y_train, cv=5)
print(mod2)
print(np.round(np.mean(mod2),2)*100)

[0.56470588 0.59786096 0.60320856 0.57922912 0.62526767]
59.0


**RandomForest Classifier**

In [62]:
rfc = RandomForestClassifier()
model = rfc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

65.098374679213

In [63]:
score = []
for i in range(10):
    x = data["Sentence"]
    y = data["Sentiment"]
    x = tfidf.fit_transform(x).toarray()
    y = le.fit_transform(y)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    rfc = RandomForestClassifier()
    rfc.fit(x_train, y_train)
    pred = rfc.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  4
67.57912745936699


In [64]:
rfc = RandomForestClassifier()
x = data["Sentence"]
y = data["Sentiment"]
x = tfidf.fit_transform(x).toarray()
y = le.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model3 = rfc.fit(x_train,y_train)
pred = model3.predict(x_test)
accuracy_score(y_test, pred)*100

67.15141146278872

In [65]:
# Training Data Accuracy

pred = model3.predict(x_train)
accuracy_score(y_train, pred)*100     #Overfitting

92.72416006847848

In [66]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod3 = cross_val_score(RandomForestClassifier(), x_train, y_train, cv=4)
print(mod3)
print(np.round(np.mean(mod3),2)*100)

[0.64841745 0.63356164 0.65239726 0.63869863]
64.0


**Gausian NB Classifier**

In [67]:
gnb = GaussianNB()
model = gnb.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

48.75962360992301

In [68]:
score = []
for i in range(100):
    x = data["Sentence"]
    y = data["Sentiment"]
    x = tfidf.fit_transform(x).toarray()
    y = le.fit_transform(y)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    pred = gnb.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  63
49.700598802395206


In [69]:
gnb = GaussianNB()
x = data["Sentence"]
y = data["Sentiment"]
x = tfidf.fit_transform(x).toarray()
y = le.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model4 = gnb.fit(x_train,y_train)
pred = model4.predict(x_test)
accuracy_score(y_test, pred)*100

49.700598802395206

In [70]:
# Training Data Accuracy

pred = model4.predict(x_train)
accuracy_score(y_train, pred)*100     #Overfitting

76.93130751123475

In [72]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod4 = cross_val_score(GaussianNB(), x_train, y_train, cv=10)
print(mod4)
print(np.round(np.mean(mod4),2)*100)

[0.44871795 0.44871795 0.42735043 0.44753747 0.44325482 0.46252677
 0.44753747 0.46680942 0.47751606 0.46038544]
45.0


**Support Vector Classifier**

In [75]:
svc = SVC(kernel="linear",tol=0.1)
model = svc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

68.43455945252353

In [76]:
score = []
for i in range(5):
    x = data["Sentence"]
    y = data["Sentiment"]
    x = tfidf.fit_transform(x).toarray()
    y = le.fit_transform(y)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    svc = SVC(kernel="linear",tol=0.1)
    svc.fit(x_train, y_train)
    pred = svc.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  4
70.91531223267751


In [77]:
svc = SVC(kernel="linear",tol=0.1)
x = data["Sentence"]
y = data["Sentiment"]
x = tfidf.fit_transform(x).toarray()
y = le.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model5 = svc.fit(x_train,y_train)
pred = model5.predict(x_test)
accuracy_score(y_test, pred)*100

70.91531223267751

In [78]:
# Training Data Accuracy

pred = model5.predict(x_train)
accuracy_score(y_train, pred)*100     #Generalized

86.21870318852986

In [79]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod5 = cross_val_score(SVC(kernel="linear",tol=0.1), x_train, y_train, cv=4)
print(mod5)
print(np.round(np.mean(mod5),2)*100)

[0.68006843 0.6875     0.68921233 0.69777397]
69.0
