In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection  import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk import tokenize
import re
from unidecode import unidecode
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm

In [22]:
STOP_WORDS = nltk.corpus.stopwords.words("english")
MAX_FEATURES = [50, None, 500, 10000]
SOLVERS = ['lbfgs','liblinear']

In [23]:
def load_df():
    df = pd.read_csv('./data/financial_sentiments.csv')
    return df

df = load_df()
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [24]:
def add_stop_words(*args):
    for arg in args:
        STOP_WORDS.append(arg)
    return STOP_WORDS
add_stop_words();

In [25]:
def normalize_sentence(sentence):
    norm_sentence = sentence.lower()
    norm_sentence = re.sub(r'[^\w\s]','', norm_sentence) 
    norm_sentence = norm_sentence.strip() 
    norm_sentence = unidecode(norm_sentence)
    norm_sentence = ' '.join(norm_sentence.split()) 
    return norm_sentence
white_space_tokenize = tokenize.WhitespaceTokenizer()  
phrase_pos = list()
col = 'Sentence'
new_col = 'treatment_1'
for text in tqdm(df[col]):
    new_phrase = list()
    phrase_text = white_space_tokenize.tokenize(
        normalize_sentence(text)
    )
    for word in phrase_text:
        if word not in STOP_WORDS:
            stemmer = nltk.RSLPStemmer()
            norm_word = stemmer.stem(word)
            new_phrase.append(norm_word)

    phrase_pos.append(' '.join(new_phrase))
df[new_col] = phrase_pos

100%|███████████████████████████████████████| 5842/5842 [01:03<00:00, 91.64it/s]


In [26]:
df.head()

Unnamed: 0,Sentence,Sentiment,treatment_1
0,The GeoSolutions technology will leverage Bene...,positive,geosolutiom technology leverag benefon gp solu...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,esi low 150 250 bk real possibility
2,"For the last quarter of 2010 , Componenta 's n...",positive,last quart 2010 component net sal doubled eur1...
3,According to the Finnish-Russian Chamber of Co...,neutral,according finnishrussian chamb commerc maj con...
4,The Swedish buyout firm has sold its remaining...,neutral,swedish buyout firm sold remaining 224 percent...


In [27]:
def str2int(df):    
    df['Sentiment'] = df['Sentiment'].replace(['negative','neutral','positive'],[-1,0,1])
    return df
df = str2int(df)
df.head()

Unnamed: 0,Sentence,Sentiment,treatment_1
0,The GeoSolutions technology will leverage Bene...,1,geosolutiom technology leverag benefon gp solu...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",-1,esi low 150 250 bk real possibility
2,"For the last quarter of 2010 , Componenta 's n...",1,last quart 2010 component net sal doubled eur1...
3,According to the Finnish-Russian Chamber of Co...,0,according finnishrussian chamb commerc maj con...
4,The Swedish buyout firm has sold its remaining...,0,swedish buyout firm sold remaining 224 percent...


In [59]:
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(df['Sentence'].to_list())

In [60]:
train, test, class_train, class_test =\
        train_test_split(bag_of_words, df['Sentiment'], test_size=0.2, random_state=42)

logistic_regression = LogisticRegression(solver='liblinear')
logistic_regression.fit(train, class_train)
logistic_regression.score(test, class_test)

0.69803250641574