# Sentiment Scoring Model

### Loading the Previously Used Packages + Reading in the Fitted Model

In [1]:
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
import string
#import spacy
import re
import multiprocessing as mp

In [2]:
from sklearn.base import TransformerMixin, BaseEstimator

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless text-cleaning transformer.

    ``transform`` lower-cases every document, deletes the characters listed in
    ``string.punctuation``, replaces each run of digits that follows a space
    with a single space, trims both ends and collapses runs of spaces.

    NOTE(review): presumably the pickled model loaded later in this notebook
    was fitted with exactly this cleaning -- confirm before altering any of
    the patterns below.

    Parameters
    ----------
    n_jobs : int, default=-1
        Stored on the instance; nothing in this class reads it.
    """

    # Built once instead of once per document.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    # Only digit runs preceded by a space match, so a number at the very start
    # of a document is left in place.
    _SPACED_NUMBER = re.compile(r" \d+")
    _SPACE_RUN = re.compile(' +')

    def __init__(self, n_jobs=-1):
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn.

        ``y`` is optional so that ``fit(X)`` and ``fit_transform(X)`` work
        without labels.
        """
        return self

    def _clean(self, text):
        """Return the cleaned version of a single string."""
        text = text.lower().translate(self._PUNCTUATION_TABLE)
        text = self._SPACED_NUMBER.sub(" ", text)
        return self._SPACE_RUN.sub(' ', text.strip())

    def transform(self, X):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : object exposing ``.apply`` (used here as a pandas Series of str)
            The raw documents.

        Returns
        -------
        The result of ``X.apply`` with one cleaned string per input document.
        """
        return X.apply(self._clean)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
import pickle

In [5]:
# Path of the pickled, already-fitted model.
filename = 'LR_model.pkl'
# SECURITY: unpickling executes code embedded in the file -- only load a model
# file that you produced yourself or otherwise trust.
# The context manager closes the file handle once the model has been read.
with open(filename, 'rb') as model_file:
    lr_model = pickle.load(model_file)



### Creating Table for Sentiment Score

In [6]:
# Two parallel lists with one entry per calendar month from 1979 through 2020:
# year12 repeats each year twelve times, period12 cycles through months 1..12.
year12 = [year for year in range(1979, 2021) for _ in range(12)]
period12 = [month for _ in range(1979, 2021) for month in range(1, 13)]

In [7]:
# Monthly tally table, initialised to zero: one row per (year, month) pair and
# one column per label written by the scoring loop ('exception' included).
column_name = ['positive', 'neutral', 'negative', 'exception']
zero_counts = np.zeros((len(year12), len(column_name)))
sentiment_score = pd.DataFrame(
    zero_counts,
    index=[year12, period12],
    columns=column_name,
).rename_axis(index=['year', 'month'])

### Reading the List of files + Reading in the Text + Sentiment Scoring

In [8]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from bs4 import BeautifulSoup

import sys

In [None]:
# Seconds a single blocking network operation may take before it is abandoned.
# Without a timeout one unresponsive server stalls the whole run indefinitely.
URL_TIMEOUT_SECONDS = 60


def _extract_visible_text(html):
    """Return the text of an HTML document, one non-empty chunk per line."""
    soup = BeautifulSoup(html, features="html.parser")
    for element in soup(["script", "style"]):    # kill all script and style elements
        element.extract()
    raw_text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in raw_text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    return '\n'.join(chunk for chunk in chunks if chunk)


def _score_article(news_url):
    """Download one article and return the label predicted by ``lr_model``.

    Any failure while downloading, parsing or predicting yields the label
    'exception'. Only ``Exception`` is caught, so KeyboardInterrupt and
    SystemExit still stop the run instead of being counted as an article
    failure.
    """
    try:
        with urlopen(news_url, timeout=URL_TIMEOUT_SECONDS) as response:
            html = response.read()
        return lr_model.predict(pd.Series(_extract_visible_text(html)))[0]
    except Exception:
        return 'exception'


gdelt_list = pd.read_csv(r"Data\GDELT\gdelt.csv").set_index('filename')

for gdelt_list_counter in range(0, len(gdelt_list)):

    # Download the zip archive into memory and read its first member as a
    # tab-separated table; the handles are closed as soon as they are consumed.
    with urlopen(gdelt_list.iloc[gdelt_list_counter]['hyperlink'], timeout=URL_TIMEOUT_SECONDS) as zip_file_url:
        zip_bytes = BytesIO(zip_file_url.read())
    with ZipFile(zip_bytes) as zip_file:
        with zip_file.open(zip_file.namelist()[0]) as listing:
            document_list = pd.read_csv(listing, sep='\t', header=None).set_index(0)

    for doc_count in range(0, len(document_list)):

        # First remaining column, read as a string and sliced as YYYYMMDD.
        date_stamp = str(document_list.iloc[doc_count, 0])
        news_year = int(date_stamp[0:4])
        news_month = int(date_stamp[4:6])
        news_date = int(date_stamp[6:8])    # parsed but not used below

        # The last column holds the article URL.
        news_url = document_list.iloc[doc_count, -1]
        sentiment = _score_article(news_url)

        try:
            sentiment_score.loc[(news_year, news_month), sentiment] += 1
        except Exception:
            # The (year, month) row or the label column is not in the table.
            print('Error!')

        sys.stdout.write("\rgdelt_list_counter = %s / %s ; doc_count = %s / %s" % (gdelt_list_counter + 1, len(gdelt_list), doc_count + 1, len(document_list)))
        sys.stdout.flush()

    # Checkpoint the tallies every 30th archive (including the first).
    if gdelt_list_counter % 30 == 0:
        print("Save csv....\n")
        sentiment_score.to_csv(r"Data\sentiment_score.csv")

gdelt_list_counter = 1 / 2942 ; doc_count = 11 / 100226Save csv....

gdelt_list_counter = 2 / 2942 ; doc_count = 11 / 107131

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


gdelt_list_counter = 7 / 2942 ; doc_count = 7 / 607332

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


gdelt_list_counter = 10 / 2942 ; doc_count = 13 / 122642

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


gdelt_list_counter = 11 / 2942 ; doc_count = 12 / 115278

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


gdelt_list_counter = 13 / 2942 ; doc_count = 8 / 79812

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


gdelt_list_counter = 31 / 2942 ; doc_count = 14 / 130311Save csv....

gdelt_list_counter = 43 / 2942 ; doc_count = 6 / 1532040

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 43 / 2942 ; doc_count = 7 / 153204

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 43 / 2942 ; doc_count = 13 / 153204

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 43 / 2942 ; doc_count = 14 / 153204

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 43 / 2942 ; doc_count = 15 / 153204

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 47 / 2942 ; doc_count = 3 / 2308706

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


gdelt_list_counter = 49 / 2942 ; doc_count = 2 / 19998

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


gdelt_list_counter = 51 / 2942 ; doc_count = 3 / 72614

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 61 / 2942 ; doc_count = 5 / 460504Save csv....

gdelt_list_counter = 67 / 2942 ; doc_count = 1 / 8705963

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 67 / 2942 ; doc_count = 2 / 87059

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 67 / 2942 ; doc_count = 3 / 87059

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 67 / 2942 ; doc_count = 4 / 87059

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 67 / 2942 ; doc_count = 5 / 87059

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


gdelt_list_counter = 80 / 2942 ; doc_count = 11 / 104586

In [None]:
# Final write of the tally table. The scoring loop only checkpoints when
# gdelt_list_counter % 30 == 0, so counts gathered after the last checkpoint
# reach disk only through this call.
sentiment_score.to_csv(r"Data\sentiment_score.csv")