# Text Data Creation - Preparation for Sentiment Model Training

In [2]:
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
import string
import spacy
import re
import multiprocessing as mp
from sklearn.metrics import classification_report

import nltk
from nltk.stem import PorterStemmer

nlp = spacy.load('en_core_web_sm')
pd.set_option('display.max_colwidth',999)

### Load Data

In [3]:
## Load all paths

#importing financial phrase bank
financial_phrasebank_file_name = os.path.join("data",  "all-data.csv")

## different files from Sem Eval Dataset
semeval2_2017_train_file_name  = os.path.join('data','Headline_Trainingdata.json')
semeval2_2017_test_file_name   = os.path.join('data','Headlines_Testdata.json')
semeval2_2017_train_microblog_file_name   = os.path.join('data','Microblog_Trainingdata.json')
semeval2_2017_test_microblog_file_name   = os.path.join('data','Microblogs_Testdata.json')

# semeval2_2017_trial_file_name  = os.path.join('data','Project','Headline_Trialdata.json')

In [4]:
df1 = pd.read_csv(financial_phrasebank_file_name,header=None,names=['label','sentence'])
df1.rename(columns={'label':'sentiment_label'}, inplace=True)
print('Shape of financial phrase bank dataset ', df1.shape[0])
df1['source'] = 'financialphrasebank'
df1.head()

Shape of financial phrase bank dataset  4846


Unnamed: 0,sentiment_label,sentence,source
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",financialphrasebank
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",financialphrasebank
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .",financialphrasebank
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,financialphrasebank
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",financialphrasebank


In [5]:
df2_headline_train = pd.read_json(semeval2_2017_train_file_name)
df2_headline_test = pd.read_json(semeval2_2017_test_file_name)
df2_headline = pd.concat([df2_headline_train, df2_headline_test]).reset_index()
df2_headline.rename(columns={'title':'sentence', 'sentiment':'sentiment_score'}, inplace=True)
df2_headline = df2_headline[['sentence','sentiment_score']]
df2_headline['source'] = 'headline'
df2_headline['sentiment_label'] = df2_headline['sentiment_score'].apply(lambda x: 'positive' if x>0 else ('negative' if x<0 else 'neutral' ))
print('Shape of SemEval 2017 Headline bank dataset ', df2_headline.shape[0])
df2_headline.head()


Shape of SemEval 2017 Headline bank dataset  1633


Unnamed: 0,sentence,sentiment_score,source,sentiment_label
0,Morrisons book second consecutive quarter of sales growth,0.43,headline,positive
1,IMI posts drop in first-quarter organic revenue; warns on full year,-0.344,headline,negative
2,"Glencore to refinance its short-term debt early, shares rise",0.34,headline,positive
3,EasyJet attracts more passengers in June but still lags Ryanair,0.259,headline,positive
4,Barclays 'bad bank' chief to step down,-0.231,headline,negative


In [6]:
df2_headline.sentiment_label.value_counts()

positive    653
neutral     529
negative    451
Name: sentiment_label, dtype: int64

### Merging both dataframes to create final dataset

In [7]:
final_df = pd.concat([df1, df2_headline.drop('sentiment_score', axis=1)])
final_df.head()

Unnamed: 0,sentiment_label,sentence,source
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",financialphrasebank
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",financialphrasebank
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .",financialphrasebank
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,financialphrasebank
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",financialphrasebank


In [9]:
print('Data label counts in the final dataset')
final_df.sentiment_label.value_counts()

Data label counts in the final dataset


neutral     3408
positive    2016
negative    1055
Name: sentiment_label, dtype: int64

### Split the Data into Train and Test

In [19]:
from sklearn.model_selection import train_test_split
train_set, test_set= train_test_split(final_df,  test_size=0.2, random_state=42)

### Saving the dataframes

In [23]:
train_set.to_csv('data/train_data.csv', index=False)
test_set.to_csv('data/test_data.csv', index=False)