In [1]:
# Sentiment Analysis Packages
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import re
import string
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')

import individual_query
import webscraper_confidence_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nelly.loh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Sentiment Analysis
def sentiment_model(test_query):
    
    if len(test_query) == 0:
        sys.exit("No Articles found.")
        
    else:

        # load json and create model
        from keras.models import model_from_json
        json_file = open('bi_lstm_model.json', 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        reconstructed_model_bi_lstm = model_from_json(loaded_model_json)

        # load weights into new model
        reconstructed_model_bi_lstm.load_weights("bi_lstm_model.h5")

        # load json and create model
        from keras.models import model_from_json
        json_file = open('lstm_model.json', 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        reconstructed_model_lstm = model_from_json(loaded_model_json)

        url = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)
        (?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([
          ^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''

        tokenizer = RegexpTokenizer(r'\w+')

        def clean_data(temp):
            temp = temp.map(lambda x:str(x).lower()) 
            # removing emails
            temp = temp.map(lambda x:re.sub(r"\b[^\s]+@[^\s]+[.][^\s]+\b", "", x)) 
            # removing url
            temp = temp.map(lambda x:re.sub(url, "", x)) 
            # removing numbers
            temp = temp.map(lambda x:re.sub(r'[^a-zA-z.,!?/:;\"\'\s]', "", x)) 
            # removing white space
            temp = temp.map(lambda x:re.sub(r'^\s*|\s\s*', ' ', x).strip()) 
            # removing punctuations
            temp = temp.map(lambda x:''.join([c for c in x if c not in string.punctuation])) 
            # removing special characters
            temp = temp.map(lambda x:re.sub(r'[^a-zA-z0-9.,!?/:;\"\'\s]', '', x)) 
            # unicode
            temp = temp.map(lambda x:unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')) 
            # tokenising text for cleaning
            temp = temp.map(lambda x:tokenizer.tokenize(x)) 
            # removing stop words
            temp = temp.map(lambda x:[i for i in x if i not in stopwords.words('english')]) 
            temp = temp.map(lambda x:' '.join(x))
            return temp

        test_query['body'] = test_query['text']
        test_query.text = clean_data(test_query.text)

        # Data Preprocessing for model ingestion
        maxlen = 50
        embedding_dim = 100

        X = test_query.text.values
        tokenizer = Tokenizer(num_words=5000)
        tokenizer.fit_on_texts(test_query.text.values)
        X = tokenizer.texts_to_sequences(X)
        vocab_size = len(tokenizer.word_index) + 1
        test_input = pad_sequences(X, padding='pre', maxlen=maxlen)

        # Predicting output

        # LSTM
        test_lstm = reconstructed_model_lstm.predict(test_input)
        test_classes_lstm = np.argmax(test_lstm,axis=1)
        test_query['prediction_lstm'] = test_classes_lstm

        # BI-LSTM
        test_bi_lstm = reconstructed_model_bi_lstm.predict(test_input)
        test_classes_bi_lstm = np.argmax(test_bi_lstm,axis=1)
        test_query['prediction_bi_lstm'] = test_classes_bi_lstm

        test_query.loc[test_query['prediction_lstm'] == 0, 'sentiment_lstm'] = 'Financial Crime'
        test_query.loc[test_query['prediction_lstm'] == 1, 'sentiment_lstm'] = 'Serious Crime'
        test_query.loc[test_query['prediction_lstm'] == 2, 'sentiment_lstm'] = 'General News (Positive)'
        test_query.loc[test_query['prediction_lstm'] == 3, 'sentiment_lstm'] = 'General News (Neutral)'

        test_query.loc[test_query['prediction_bi_lstm'] == 0, 'sentiment_bi_lstm'] = 'Financial Crime'
        test_query.loc[test_query['prediction_bi_lstm'] == 1, 'sentiment_bi_lstm'] = 'Serious Crime'
        test_query.loc[test_query['prediction_bi_lstm'] == 2, 'sentiment_bi_lstm'] = 'General News (Positive)'
        test_query.loc[test_query['prediction_bi_lstm'] == 3, 'sentiment_bi_lstm'] = 'General News (Neutral)'
        
        test_query = test_query[['title', 'time', 'year_of_birth', 'description', 'link', 'body',
                                       'names_list', 'confidence_score', 'sentiment_lstm', 'sentiment_bi_lstm']]
    
        return pd.DataFrame(test_query)


In [3]:
individual_dict = individual_query.preprocess_input_to_dict('Ng Yu Zhi', 'Singaporean', 'Male')

In [4]:
output = webscraper_confidence_score.search_articles_on_individual(individual_dict, no_of_articles=10, additional_keywords=None)

0.9440131622924022
0.9746937812320862
0.9746937812320862
0.9662583749761149
0.9493875624641723
0.9493875624641723
1.0
0.9313600529084453
0.9746937812320862
0.9187069435244883
0.9493875624641723
number of unsatisfactory rows is 6/11


In [5]:
sentiment_model(output)

Unnamed: 0,title,time,year_of_birth,description,link,body,names_list,confidence_score,sentiment_lstm,sentiment_bi_lstm
0,Trio charged over plans to help businessman Ng...,2 days ago,0,The alleged plans were foiled after the police...,https://www.straitstimes.com/singapore/courts-...,SINGAPORE - Three men appeared in a district c...,"{'Ng Yu Zhi': 2, 'Guan Wei': 1, 'Alvin Oey Wei...",0.944013,General News (Positive),Financial Crime
1,"Ng Yu Zhi back in court, faces 18 new charges ...",3 days ago,0,"Ng, 34, now faces a total of 69 charges, inclu...",https://www.straitstimes.com/business/invest/e...,"SINGAPORE - Businessman Ng Yu Zhi, who is embr...","{'Ng Yu Zhi': 1, 'khaki trousers': 1, 'Kevin Y...",0.974694,General News (Neutral),General News (Positive)
2,20 more cheating charges for businessman Ng Yu...,1 month ago,0,This takes the total number of charges against...,https://www.straitstimes.com/singapore/courts-...,SINGAPORE - Singaporean businessman Ng Yu Zhi ...,"{'Ng Yu Zhi': 1, 'Finian Tan': 1, 'Thio Shen Y...",0.974694,Financial Crime,Financial Crime
3,"S'pore businessman Ng Yu Zhi, linked to allege...",4 months ago,0,The alleged victims named in the new charges i...,https://www.straitstimes.com/singapore/courts-...,SINGAPORE - Several notable names in the legal...,"{'Ng Yu Zhi': 1, 'Sunil Sudheesan': 1, 'Ms Pek...",0.966258,General News (Positive),General News (Positive)
4,Singapore businessman Ng Yu Zhi linked to alle...,6 months ago,0,"Ng Yu Zhi, 33, had previously been charged wit...",https://www.straitstimes.com/singapore/courts-...,SINGAPORE - A businessman is facing more charg...,{'Ng Yu Zhi': 1},0.949388,Financial Crime,Financial Crime
5,Singapore businessman linked to alleged fraud ...,7 months ago,0,"Ng Yu Zhi, 33, is the director of two firms an...",https://www.straitstimes.com/singapore/courts-...,SINGAPORE - A businessman charged on Monday (M...,{'Ng Yu Zhi': 1},0.949388,Financial Crime,Financial Crime
6,"S'pore businessman Ng Yu Zhi, linked to allege...",6 months ago,0,"Ng Yu Zhi, 34, is accused of fabricating two e...",https://www.straitstimes.com/singapore/courts-...,SINGAPORE - A Singaporean businessman linked t...,{'Ng Yu Zhi': 1},1.0,Financial Crime,Financial Crime
7,Vickers Venture Partners ensnared by alleged b...,5 months ago,0,"... Ng Yu Zhi, has been charged with a range o...",https://www.straitstimes.com/business/companie...,SINGAPORE (BLOOMBERG) - Technology start-up in...,"{'Ng Yu Zhi': 1, 'Bloomberg': 1, 'Finian Tan':...",0.93136,Serious Crime,Financial Crime
8,Singapore businessman Ng Yu Zhi linked to alle...,6 months ago,0,"Ng Yu Zhi, 34, is accused of fabricating two e...",https://www.thestar.com.my/aseanplus/aseanplus...,SINGAPORE (The Straits Times/ANN): A Singapore...,{'Ng Yu Zhi': 1},0.974694,Serious Crime,Financial Crime
9,Exclusive: KPMG managers back liquidation of S...,5 months ago,0,Authorities in the city state have implicated ...,https://sg.news.yahoo.com/exclusive-kpmg-manag...,"Ng Yu Zhi, a director of Envy Global Trading, ...","{'Ng Yu Zhi': 2, 'Chen Lin': 1, 'Bob Yap': 1, ...",0.918707,Serious Crime,Financial Crime
