In [1]:
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
import string
import spacy
import re
import multiprocessing as mp

nlp = spacy.load('en_core_web_sm')
pd.set_option('display.max_colwidth',999)

In [2]:
#importing financial phrase bank
financial_phrasebank_file_name = os.path.join("data",  "all-data.csv")
semeval2_2017_train_file_name  = os.path.join('data','Headline_Trainingdata.json')
semeval2_2017_test_file_name   = os.path.join('data','Headlines_Testdata.json')
semeval2_2017_train_microblog_file_name   = os.path.join('data','Microblog_Trainingdata.json')
semeval2_2017_test_microblog_file_name   = os.path.join('data','Microblogs_Testdata.json')

# semeval2_2017_trial_file_name  = os.path.join('data','Project','Headline_Trialdata.json')

In [3]:
df1 = pd.read_csv(financial_phrasebank_file_name,header=None,names=['label','sentence'])
df1.rename(columns={'label':'sentiment_label'}, inplace=True)
print('Shape of financial phrase bank dataset ', df1.shape[0])
df1['source'] = 'financialphrasebank'
df1.head()

Shape of financial phrase bank dataset  4846


Unnamed: 0,sentiment_label,sentence,source
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",financialphrasebank
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",financialphrasebank
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .",financialphrasebank
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,financialphrasebank
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",financialphrasebank


In [5]:
df2_headline_train = pd.read_json(semeval2_2017_train_file_name)
df2_headline_test = pd.read_json(semeval2_2017_test_file_name)
df2_headline = pd.concat([df2_headline_train, df2_headline_test]).reset_index()
df2_headline.rename(columns={'title':'sentence', 'sentiment':'sentiment_score'}, inplace=True)
df2_headline = df2_headline[['sentence','sentiment_score']]
df2_headline['source'] = 'headline'
df2_headline['sentiment_label'] = df2_headline['sentiment_score'].apply(lambda x: 'positive' if x>0 else ('negative' if x<0 else 'neutral' ))
print('Shape of SemEval 2017 Headline bank dataset ', df2_headline.shape[0])
df2_headline.head()


Shape of SemEval 2017 Headline bank dataset  1633


Unnamed: 0,sentence,sentiment_score,source,sentiment_label
0,Morrisons book second consecutive quarter of sales growth,0.43,headline,positive
1,IMI posts drop in first-quarter organic revenue; warns on full year,-0.344,headline,negative
2,"Glencore to refinance its short-term debt early, shares rise",0.34,headline,positive
3,EasyJet attracts more passengers in June but still lags Ryanair,0.259,headline,positive
4,Barclays 'bad bank' chief to step down,-0.231,headline,negative


In [6]:
df2_headline.sentiment_label.value_counts()

positive    653
neutral     529
negative    451
Name: sentiment_label, dtype: int64

In [7]:
# df2_microblog_train = pd.read_json(semeval2_2017_train_microblog_file_name)
# df2_microblog_test = pd.read_json(semeval2_2017_test_microblog_file_name)
# df2_microblog = pd.concat([df2_microblog_train, df2_microblog_test]).reset_index(drop=True)
# df2_microblog.dropna(inplace=True)
# df2_microblog['spans'] = df2_microblog['spans'].apply(lambda x: x if type(x)==str else (x[0] if len(x)==1 else ' '.join(x)))
# df2_microblog.rename(columns={'spans':'sentence','sentiment score':'sentiment_score'}, inplace=True)
# df2_microblog = df2_microblog[['sentence','sentiment_score']]
# df2_microblog['source'] = 'microblog'
# # df2_microblog['spans'] = df2_microblog['spans'].apply(lambda x: if shape_list
# print(df2_microblog.shape)
# df2_microblog.head()

In [8]:
final_df = pd.concat([df1, df2_headline.drop('sentiment_score', axis=1)])
final_df.head()

Unnamed: 0,sentiment_label,sentence,source
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",financialphrasebank
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",financialphrasebank
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .",financialphrasebank
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,financialphrasebank
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",financialphrasebank


In [9]:
final_df.sentiment_label.value_counts()

neutral     3408
positive    2016
negative    1055
Name: sentiment_label, dtype: int64

In [10]:
final_df[final_df.sentiment_label=='negative']

Unnamed: 0,sentiment_label,sentence,source
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .",financialphrasebank
415,negative,A tinyurl link takes users to a scamming site promising that users can earn thousands of dollars by becoming a Google ( NASDAQ : GOOG ) Cash advertiser .,financialphrasebank
421,negative,"Compared with the FTSE 100 index , which rose 36.7 points ( or 0.6 % ) on the day , this was a relative price change of -0.2 % .",financialphrasebank
423,negative,"Compared with the FTSE 100 index , which rose 94.9 points ( or 1.6 % ) on the day , this was a relative price change of -0.4 % .",financialphrasebank
500,negative,One of the challenges in the oil production in the North Sea is scale formation that can plug pipelines and halt production .,financialphrasebank
...,...,...,...
1129,negative,"Oil majors like Royal Dutch Shell, Chevron, BP fail to find reserves to counter ...",headline
1130,negative,Four ex-Barclays bankers sentenced for roles in Libor rate-rigging scandal,headline
1133,negative,Tesco leads leap in FTSE 100; Marks & Spencer drops,headline
1139,negative,Kingfisher share price slides on cost to implement new strategy,headline


In [155]:
final_df.to_csv('final_df.csv', index=False)

In [167]:
## cleaning functions:
### 1. lower case
final_df['cleaned_sentence'] = final_df['sentence'].apply(lambda x: x.lower())

### 2. remove punctuations
final_df['cleaned_sentence'] = final_df['cleaned_sentence'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

### 3. remove numbers
final_df['cleaned_sentence'] = final_df['cleaned_sentence'].apply(lambda x: re.sub(" \d+", " ", x))

### 4. Clear whitespaces
final_df['cleaned_sentence'] = final_df['cleaned_sentence'].apply(lambda x: re.sub(' +', ' ', x.lstrip().rstrip()))

final_df.head()

Unnamed: 0,sentiment_label,sentence,source,cleaned_sentence
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",financialphrasebank,according to gran the company has no plans to move all production to russia although that is where the company is growing
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .",financialphrasebank,technopolis plans to develop in stages an area of no less than square meters in order to host companies working in computer technologies and telecommunications the statement said
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .",financialphrasebank,the international electronic industry company elcoteq has laid off tens of employees from its tallinn facility contrary to earlier layoffs the company contracted the ranks of its office workers the daily postimees reported
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,financialphrasebank,with the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",financialphrasebank,according to the company s updated strategy for the years basware targets a longterm net sales growth in the range of with an operating profit margin of of net sales


In [19]:
from sklearn.base import TransformerMixin, BaseEstimator

class TextPreprocessor(BaseEstimator, TransformerMixin):
    
    def __init__(self,n_jobs=-1):
        self.n_jobs = n_jobs
    
    def fit(self, X,y):
        return self
    
    def transform(self, X):
        lower_case_text       = X.apply(lambda x:x.lower())
        removed_punct_text    = lower_case_text.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
        removed_numbers_text  = removed_punct_text.apply(lambda x: re.sub(" \d+", " ", x))
        clear_whitespace_text = removed_numbers_text.apply(lambda x: re.sub(' +', ' ', x.lstrip().rstrip()))
        return clear_whitespace_text

In [22]:
#split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(final_df.sentence, final_df.sentiment_label, 
                                                    test_size=0.2, random_state=42)

In [46]:
##creating model pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


preprocessor = TextPreprocessor()

vectorizer = CountVectorizer(analyzer      = 'word',
                             token_pattern = r'\S+',
                             stop_words    = 'english',
                             ngram_range   = (1,3),
                             binary        = True)
func = f_classif
selector = SelectKBest(func, k=1000)

svc_classifier = SVC( kernel       = 'rbf',
                      C            =  100,
                      random_state = 42)

lr_classfier   = LogisticRegression(random_state=42, max_iter=400)

pipe_lr = Pipeline([('prep', preprocessor),
                    ('vec', vectorizer),
                    ('sel', selector),
                    ('clf',lr_classfier)
                   ])

pipe_svm = Pipeline([('prep', preprocessor),
                    ('vec', vectorizer),
                    ('sel', selector),
                    ('clf',svc_classifier)
                   ])


In [89]:
pwd

'C:\\Users\\musta\\Github\\Stanford_Assignments\\cs224u'

In [52]:
grid_params_lr = [{'clf__gamma': ['l1', 'l2'],
                   'clf__C': [1.0, 0.5, 0.1],
                   'clf__solver': ['liblinear'],
                  
                   'sel__k':[1000, 5000, 10000, 20000, 40000],
                  
                   'sel__score_func':[f_classif, chi2],
                   
                   'vec__ngram_range':[(1,3),(2,3),(1,2)],
                   'vec__binary':[True, False]}] 

grid_params_svm = [{'clf__gamma': ['scale', 'auto'],
                   'clf__C': [100, 10, 1.0, 0.1, 0.01],
                  
                   'sel__k':[1000, 5000, 10000, 20000, 40000],
                  
                   'sel__score_func':[chi2],
                   
                   'vec__ngram_range':[(1,3),(2,3),(1,2)],
                   'vec__binary':[True, False]}] 

In [53]:
LR_model = GridSearchCV(estimator  = pipe_lr,
                        param_grid = grid_params_lr,
                        scoring    = 'f1_macro',
                        cv         = 5,
                        n_jobs     = -1,
                        verbose    = 2) 

SVC_model = GridSearchCV(estimator  = pipe_svm,
                        param_grid = grid_params_svm,
                        scoring    = 'f1_macro',
                        cv         = 5,
                        n_jobs     = -1,
                        verbose    = 2) 

In [49]:
LR_model.fit(X_train, y_train)


Fitting 5 folds for each of 360 candidates, totalling 1800 fits


 0.58578321 0.44153614 0.59855013 0.58569956 0.43861484 0.59585927
 0.60964535 0.4451729  0.60678337 0.60465123 0.4357776  0.60208646
 0.61000478 0.4432798  0.60668406 0.6069492  0.44057268 0.60406539
 0.61136627 0.449364   0.61191195 0.60772723 0.44963585 0.61140749
 0.61203186 0.44945112 0.61152703 0.60696173 0.44660854 0.6112579
 0.6150648  0.451079   0.6127835  0.61646375 0.44901566 0.61451727
 0.6150648  0.451079   0.6127835  0.61698982 0.44918767 0.61413331
 0.61662925 0.45333529        nan 0.61649562 0.45271026        nan
 0.61662925 0.45333529        nan 0.61619203 0.45329114        nan
 0.58582774 0.44521264 0.596145   0.58475617 0.44256739 0.59770793
 0.58576012 0.44521264 0.59632863 0.58118782 0.44328966 0.59713546
 0.58018    0.43735026 0.57917342 0.57980033 0.43527965 0.57941138
 0.57985828 0.43735026 0.5793971  0.58279368 0.43718529 0.58299502
 0.55474068 0.44627321 0.58188606 0.55905101 0.44879613 0.58211668
 0.55566444 0.4463889  0.58188606 0.55755561 0.44841745 0.57748

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prep', TextPreprocessor()),
                                       ('vec',
                                        CountVectorizer(binary=True,
                                                        ngram_range=(1, 3),
                                                        stop_words='english',
                                                        token_pattern='\\S+')),
                                       ('sel', SelectKBest(k=1000)),
                                       ('clf',
                                        LogisticRegression(max_iter=400,
                                                           random_state=42))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 0.5, 0.1],
                          'clf__penalty': ['l1', 'l2'],
                          'clf__solver': ['liblinear'],
                          'sel__k': [1000, 5000, 10000, 20000, 40000],
                          'sel__s

In [50]:
print('Best params are : %s' % LR_model.best_params_)
# Best training data accuracy
print('Best training f1_macro: %.3f' % LR_model.best_score_)
# Predict on test data with best params
y_pred = LR_model.predict(X_test)

Best params are : {'clf__C': 1.0, 'clf__penalty': 'l1', 'clf__solver': 'liblinear', 'sel__k': 20000, 'sel__score_func': <function chi2 at 0x0000026C1394F4C0>, 'vec__binary': False, 'vec__ngram_range': (1, 3)}
Best training f1_macro: 0.617


In [64]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.65      0.44      0.53       205
     neutral       0.72      0.83      0.77       713
    positive       0.67      0.60      0.63       378

    accuracy                           0.70      1296
   macro avg       0.68      0.62      0.64      1296
weighted avg       0.70      0.70      0.69      1296



In [75]:
best_lr_vectorizer = CountVectorizer(analyzer      = 'word',
                                     token_pattern = r'\S+',
                                     stop_words    = 'english',
                                     ngram_range   = (1,3),
                                     binary        = False)
best_lr_func = chi2
best_lr_selector = SelectKBest(func, k=20000)
best_lr_model = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')

best_lr_pipe = Pipeline([('prep', preprocessor),
                    ('vec', best_lr_vectorizer),
                    ('sel', best_lr_selector),
                    ('clf',best_lr_model)
                   ])

In [76]:
best_lr_pipe.fit(X_train, y_train)

Pipeline(steps=[('prep', TextPreprocessor()),
                ('vec',
                 CountVectorizer(ngram_range=(1, 3), stop_words='english',
                                 token_pattern='\\S+')),
                ('sel', SelectKBest(k=20000)),
                ('clf', LogisticRegression(penalty='l1', solver='liblinear'))])

In [77]:
y_pred_lr = best_lr_pipe.predict(X_test)

In [79]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

    negative       0.64      0.44      0.52       205
     neutral       0.72      0.83      0.77       713
    positive       0.66      0.59      0.63       378

    accuracy                           0.70      1296
   macro avg       0.68      0.62      0.64      1296
weighted avg       0.69      0.70      0.69      1296



In [80]:
import pickle
pickle.dump(best_lr_pipe, open('LR_model.pkl', 'wb'))


In [81]:
%%time
SVC_model.fit(X_train, y_train)


Fitting 5 folds for each of 300 candidates, totalling 1500 fits


 0.54041575 0.44690291 0.53394963 0.52914295 0.44827025 0.51961383
 0.51628592 0.45140654 0.53298371 0.50680402 0.44491202 0.52723015
 0.48471948 0.43644467 0.4972764  0.47654359 0.43317267 0.49907072
 0.4492967  0.42343758        nan 0.45942158 0.42461159        nan
 0.56231385 0.39941061 0.57683236 0.5589471  0.39972502 0.56752086
 0.45796535 0.34481934 0.4621086  0.45848251 0.34643303 0.46175982
 0.40181448 0.32119917 0.40852124 0.4009158  0.32735322 0.4045213
 0.34469108 0.28837529 0.32332403 0.35501575 0.28826474 0.33674523
 0.29465893 0.25481427        nan 0.30339969 0.2689945         nan
 0.57133706 0.44976561 0.58470375 0.55796079 0.4520655  0.57005902
 0.55214939 0.4473803  0.54792401 0.53306771 0.44837658 0.52969582
 0.52015928 0.45039241 0.53114965 0.51082819 0.44482477 0.5235905
 0.48280719 0.43589771 0.49810209 0.47325643 0.43331733 0.49883693
 0.44965222 0.42044493        nan 0.45690823 0.422565          nan
 0.38785384 0.31690829 0.39090783 0.38513042 0.32193015 0.389126

Wall time: 12min 34s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prep', TextPreprocessor()),
                                       ('vec',
                                        CountVectorizer(binary=True,
                                                        ngram_range=(1, 3),
                                                        stop_words='english',
                                                        token_pattern='\\S+')),
                                       ('sel', SelectKBest(k=1000)),
                                       ('clf', SVC(C=100, random_state=42))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [100, 10, 1.0, 0.1, 0.01],
                          'clf__gamma': ['scale', 'auto'],
                          'sel__k': [1000, 5000, 10000, 20000, 40000],
                          'sel__score_func': [<function chi2 at 0x0000026C1394F4C0>],
                          'vec__binary': [True, False],
                          'vec__ngram_range': [(1, 3), 

In [82]:
print('Best params are : %s' % SVC_model.best_params_)
# Best training data accuracy
print('Best training f1_macro: %.3f' % SVC_model.best_score_)
# Predict on test data with best params
y_pred = SVC_model.predict(X_test)

Best params are : {'clf__C': 10, 'clf__gamma': 'scale', 'sel__k': 1000, 'sel__score_func': <function chi2 at 0x0000026C1394F4C0>, 'vec__binary': True, 'vec__ngram_range': (1, 2)}
Best training f1_macro: 0.585


In [85]:
best_svc_vectorizer = CountVectorizer(analyzer      = 'word',
                                     token_pattern = r'\S+',
                                     stop_words    = 'english',
                                     ngram_range   = (1,2),
                                     binary        = True)
best_svc_func = chi2
best_svc_selector = SelectKBest(func, k=1000)
best_svc_model = SVC( kernel       = 'rbf',
                      C            =  10,
                      gamma        = 'scale',
                      random_state = 42)

best_svc_pipe = Pipeline([('prep', preprocessor),
                    ('vec', best_svc_vectorizer),
                    ('sel', best_svc_selector),
                    ('clf',best_svc_model)
                   ])

best_svc_pipe.fit(X_train, y_train)

Pipeline(steps=[('prep', TextPreprocessor()),
                ('vec',
                 CountVectorizer(binary=True, ngram_range=(1, 2),
                                 stop_words='english', token_pattern='\\S+')),
                ('sel', SelectKBest(k=1000)),
                ('clf', SVC(C=10, random_state=42))])

In [86]:
y_pred_svc = best_svc_pipe.predict(X_test)
print(classification_report(y_test, y_pred_svc))

              precision    recall  f1-score   support

    negative       0.57      0.47      0.51       205
     neutral       0.72      0.80      0.76       713
    positive       0.61      0.54      0.57       378

    accuracy                           0.67      1296
   macro avg       0.63      0.60      0.61      1296
weighted avg       0.66      0.67      0.66      1296



In [87]:
pickle.dump(best_svc_pipe, open('SVC_model.pkl', 'wb'))
