# Bag-Of-Words Baseline Model

In this notebook, we will be using a Bag-of-Words model as a baseline sentiment model. 

In [20]:
import numpy as np
import pandas as pd
import os
import pickle
import string

In [84]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucaskrenn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
from string import punctuation
from os import listdir
from collections import Counter
#from keras.preprocessing.text import Tokenizer

In [49]:
with open('../Step1-Data/4-link_dict.pickle', 'rb') as f:
    parsed = pickle.load(f)

In [62]:
f = open("1-Sentences_75Agree.txt", 'r', encoding = "ISO-8859-1")
training_string = f.read()
training_string[:500]

"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .@neutral\nWith the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .@positive\nFor the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero "

In [70]:
training = training_string.split('\n')
training[:5]

['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .@neutral',
 'With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .@positive',
 "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .@positive",
 'In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .@positive',
 'Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing 7.7 % of net sales .@positive']

In [71]:
training = [x.split(' .@') for x in training]
training[0]

['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing',
 'neutral']

In [73]:
train_df = pd.DataFrame(training, columns = ['sentence', 'sentiment'])
train_df.head()

Unnamed: 0,sentence,sentiment
0,"According to Gran , the company has no plans t...",neutral
1,With the new production plant the company woul...,positive
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,"In the third quarter of 2010 , net sales incre...",positive
4,Operating profit rose to EUR 13.1 mn from EUR ...,positive


In [76]:
positive = train_df[train_df['sentiment'] == 'positive']
positive.head()

Unnamed: 0,sentence,sentiment
1,With the new production plant the company woul...,positive
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,"In the third quarter of 2010 , net sales incre...",positive
4,Operating profit rose to EUR 13.1 mn from EUR ...,positive
5,"Operating profit totalled EUR 21.1 mn , up fro...",positive


In [78]:
len(positive)

862

In [80]:
negative = train_df[train_df['sentiment'] == 'negative']
negative.head()

Unnamed: 0,sentence,sentiment
393,Jan. 6 -- Ford is struggling in the face of sl...,negative
396,Pharmaceuticals group Orion Corp reported a fa...,negative
397,"However , the growth margin slowed down due to...",negative
482,2009 3 February 2010 - Finland-based steel mak...,negative
484,Result before taxes decreased to nearly EUR 14...,negative


In [82]:
len(negative)

405

# Tokenize the Training Data

In this section, I will be borrowing heavily from the following bag-of-words model tutorial: 

https://machinelearningmastery.com/deep-learning-bag-of-words-model-sentiment-analysis/

In [114]:
tokens = ' '.join(positive['sentence'].values).split()
# remove punctuation from each token
table = str.maketrans('', '', punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

In [119]:
tokens[:10]

['With',
 'new',
 'production',
 'plant',
 'company',
 'would',
 'increase',
 'capacity',
 'meet',
 'expected']

In [116]:
# define vocab
vocab = Counter()

In [117]:
vocab.update(tokens)

In [118]:
# print the top words in the vocab
print(vocab.most_common(50))

[('EUR', 356), ('mn', 239), ('profit', 170), ('The', 165), ('net', 158), ('sales', 146), ('said', 144), ('Finnish', 140), ('period', 136), ('million', 131), ('company', 130), ('year', 123), ('mln', 115), ('quarter', 103), ('rose', 85), ('increased', 81), ('compared', 68), ('loss', 67), ('operating', 64), ('corresponding', 64), ('Oyj', 59), ('percent', 58), ('increase', 57), ('share', 56), ('euro', 53), ('first', 52), ('Operating', 50), ('eur', 50), ('per', 42), ('market', 42), ('In', 41), ('Finland', 41), ('also', 39), ('operations', 38), ('HEL', 37), ('contract', 37), ('group', 37), ('new', 36), ('business', 35), ('services', 35), ('earlier', 34), ('grew', 32), ('today', 31), ('order', 30), ('third', 29), ('Group', 29), ('second', 29), ('last', 27), ('agreement', 26), ('yearonyear', 26)]
