In [22]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("financial sentiment analysis.csv")
data.head(50)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
5,$SPY wouldn't be surprised to see a green close,positive
6,Shell's $70 Billion BG Deal Meets Shareholder ...,negative
7,SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANG...,negative
8,Kone 's net sales rose by some 14 % year-on-ye...,positive
9,The Stockmann department store will have a tot...,neutral


In [3]:
data["Sentence"][0]

"The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model ."

In [4]:
data["Sentence"][1]

'$ESI on lows, down $1.50 to $2.50 BK a real possibility'

In [5]:
data["Sentence"][3]

'According to the Finnish-Russian Chamber of Commerce , all the major construction companies of Finland are operating in Russia .'

In [6]:
data["Sentiment"].value_counts()

neutral     3130
positive    1852
negative     860
Name: Sentiment, dtype: int64

In [7]:
data.isnull().sum()

Sentence     0
Sentiment    0
dtype: int64

# DATA PREPROCESSING

In [8]:
import string
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [9]:
def preprocessing(text):
    #TOKENISATION
    tokens = word_tokenize(text)
    
    #REMOVING PUNCTUATIONS
    punctuations = string.punctuation
    txt = []
    for i in tokens:
        if i not in punctuations:
            txt.append(i)
            
    #REMOVING STOPWORDS & CONVERTING TO LOWERCASE
    stopword = stopwords.words("english")
    ntxt = []
    for j in txt:
        if j not in stopword:
            ntxt.append(j.lower())
    
    #STEMMING
    stem = []
    Stemmer = PorterStemmer()
    for i in ntxt:
        stem.append(Stemmer.stem(i))
    return " ".join(stem)

In [10]:
data["Sentence"] = data["Sentence"].apply(preprocessing)

In [11]:
data["Sentence"][55]

'the price log clearli improv 2009 also price pulpwood gone'

In [12]:
data.head(10)

Unnamed: 0,Sentence,Sentiment
0,the geosolut technolog leverag benefon 's gp s...,positive
1,esi low 1.50 2.50 bk real possibl,negative
2,for last quarter 2010 componenta 's net sale d...,positive
3,accord finnish-russian chamber commerc major c...,neutral
4,the swedish buyout firm sold remain 22.4 perce...,neutral
5,spi would n't surpris see green close,positive
6,shell 's 70 billion bg deal meet sharehold ske...,negative
7,ssh commun secur corp stock exchang releas oct...,negative
8,kone 's net sale rose 14 year-on-year first ni...,positive
9,the stockmann depart store total floor space 8...,neutral


In [13]:
data.isna().sum()

Sentence     0
Sentiment    0
dtype: int64

# MODEL BUILDING & TEXT VECTORIZATION

In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, f1_score

In [15]:
x = data["Sentence"]
y = data["Sentiment"]

In [16]:
le = LabelEncoder()
y = le.fit_transform(y)

In [17]:
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(x).toarray()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=None)
model = RandomForestClassifier()
model.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test,pred)*100

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=None)
model2 = GaussianNB()
model2.fit(x_train,y_train)
pred = model2.predict(x_test)
accuracy_score(y_test,pred)*100

In [23]:
score = []
for i in range(10):
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    svc = SVC(kernel="linear",tol=0.1)
    svc.fit(x_train,y_train)
    pred = svc.predict(x_test)
    score.append(accuracy_score(y_test,pred))
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  4
0.7091531223267751


In [24]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=4)
model3 = SVC(kernel="linear",tol=0.1)
model3.fit(x_train,y_train)
pred = model3.predict(x_test)
accuracy_score(y_test,pred)*100

70.91531223267751