In [73]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split

import re
import string
import nltk
from nltk.stem import WordNetLemmatizer

In [9]:
input_data = pd.read_csv(r'C:\Users\Patrick\Documents\GitHub\bootcamp_capstone\kaggle_dataset\sentiment_analysis_financial_news\all-data.csv'
                , encoding = "ISO-8859-1", header=None, names=['sentiment', 'text'])
                         
input_data.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [15]:
#install nltk package
import sys
!{sys.executable} -m pip install nltk



In [18]:
#download the necessary data 
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
#Data Cleaning
#first, remove stopwords
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()


In [22]:
#data cleaning preprocessing
pattern = r'[^a-zA-Z0-9\s\%]'
cleaned_buffer = []
for x in input_data['text']:
    temp = re.sub(pattern, " ", x)
    temp = temp.lower()
    temp = temp.split()
    temp = [lemmatizer.lemmatize(word) for word in temp if not word in set(stopwords)]
    temp = ' '.join(temp)
    cleaned_buffer.append(temp)



In [23]:
cleaned_buffer

['according gran company plan move production russia although company growing',
 'technopolis plan develop stage area le 100 000 square meter order host company working computer technology telecommunication statement said',
 'international electronic industry company elcoteq laid ten employee tallinn facility contrary earlier layoff company contracted rank office worker daily postimees reported',
 'new production plant company would increase capacity meet expected increase demand would improve use raw material therefore increase production profitability',
 'according company updated strategy year 2009 2012 basware target long term net sale growth range 20 % 40 % operating profit margin 10 % 20 % net sale',
 'financing aspocomp growth aspocomp aggressively pursuing growth strategy increasingly focusing technologically demanding hdi printed circuit board pcbs',
 'last quarter 2010 componenta net sale doubled eur131m eur76m period year earlier moved zero pre tax profit pre tax loss eur7m'

In [25]:
input_data['cleaned'] = cleaned_buffer
input_data.head()

Unnamed: 0,sentiment,text,cleaned
0,neutral,"According to Gran , the company has no plans t...",according gran company plan move production ru...
1,neutral,Technopolis plans to develop in stages an area...,technopolis plan develop stage area le 100 000...
2,negative,The international electronic industry company ...,international electronic industry company elco...
3,positive,With the new production plant the company woul...,new production plant company would increase ca...
4,positive,According to the company 's updated strategy f...,according company updated strategy year 2009 2...


In [68]:
#split into training and test data sets
xtrain, xtest, ytrain, ytest = train_test_split( input_data['cleaned'], input_data['sentiment'],
                                                               test_size=.4, random_state=10)
xtrain


40      january september 2010 fiskars net profit went...
3232    divested stake represented 2 7 share okmetic c...
2665                           payment date march 25 2010
298     increase capital stock registered finnish trad...
2748    finnish power company fortum report 89 % elect...
                              ...                        
1180    august 2008 glaston north asian sale service r...
3441    upm talking myllykoski creditor bank nordea st...
1344    swedish finnish danish listed company organize...
4623    12 59 pm omx helsinki 25 index 0 32 pct lower ...
1289    nasdaq listed yahoo inc introduced new service...
Name: cleaned, Length: 2907, dtype: object

In [79]:
tfidf = TfidfVectorizer()
xtrain_tf = tfidf.fit_transform(xtrain)
print("nsamples: %d, nfeatures: %d" % xtrain_tf.shape)

xtest_tf = tfidf.transform(xtest)
print("nsamples: %d, nfeatures: %d" % xtest_tf.shape)

nsamples: 2907, nfeatures: 6949
nsamples: 1939, nfeatures: 6949


In [80]:
print(xtest_tf)

  (0, 6909)	0.1914503052635252
  (0, 6078)	0.32100327615970425
  (0, 6039)	0.44827313714470796
  (0, 4925)	0.24998025301553933
  (0, 4476)	0.3668638764938488
  (0, 3686)	0.3668638764938488
  (0, 793)	0.44827313714470796
  (0, 313)	0.3560024321380717
  (1, 5935)	0.308757806198724
  (1, 5888)	0.42036785612514077
  (1, 4454)	0.24042969174795623
  (1, 4185)	0.42036785612514077
  (1, 4042)	0.4363032188408298
  (1, 2701)	0.3696124335122626
  (1, 148)	0.4080074285268764
  (2, 5583)	0.3434740615744535
  (2, 4380)	0.44449510988201746
  (2, 4129)	0.304581457607818
  (2, 3644)	0.37492404779295263
  (2, 1713)	0.3450543580845227
  (2, 1409)	0.37492404779295263
  (2, 1307)	0.4375856587435108
  (3, 6752)	0.44671654490896723
  (3, 6592)	0.169423619617309
  (3, 6076)	0.1685222983982268
  :	:
  (1936, 728)	0.1662556317871433
  (1936, 650)	0.3240813051240628
  (1936, 598)	0.24027425172268413
  (1937, 5653)	0.3912170409068306
  (1937, 4643)	0.4289663897159921
  (1937, 4411)	0.2936363898085054
  (1937, 398

In [81]:
#bayes classification
nb_classify = MultinomialNB()
nb_classify.fit(xtrain_tf, ytrain)

MultinomialNB()

In [82]:
predictions = nb_classify.predict(xtest_tf)

In [85]:
results = metrics.classification_report(ytest, predictions)
print(results)

              precision    recall  f1-score   support

    negative       1.00      0.02      0.03       248
     neutral       0.67      0.98      0.80      1148
    positive       0.62      0.29      0.39       543

    accuracy                           0.66      1939
   macro avg       0.76      0.43      0.41      1939
weighted avg       0.70      0.66      0.59      1939



In [86]:
print(metrics.confusion_matrix(ytest, predictions))

[[   4  168   76]
 [   0 1128   20]
 [   0  388  155]]
