# Sentiment Scoring Model

### Loading Required Packages + Reading in the Previously Fitted Model

In [1]:
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
import string
#import spacy
import re
import multiprocessing as mp

In [2]:
from sklearn.base import TransformerMixin, BaseEstimator

from Shallow_ML_Models.DataPreprocessor import TextPreprocessor
from Shallow_ML_Models.DataPreprocessor import TextPreprocessor_withStem

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
import pickle

In [5]:
# Load the fitted logistic-regression pipeline (stemming variant) from disk.
# The context manager closes the file handle as soon as unpickling finishes.
# NOTE(security): pickle can execute arbitrary code while loading; only load
# model files that come from a trusted source.
with open(r'Shallow_ML_Models/LR_model_withStem.pkl', 'rb') as model_file:
    lr_model = pickle.load(model_file)
#lstm_model = pickle.load(open(r'LSTM_Model/LSTM_model.pkl', 'rb'))



### Creating Table for Sentiment Score

In [6]:
# Build two parallel lists with one entry per calendar month from
# April 2013 through September 2021, in chronological order:
# year12 holds the year and period12 the month number (1-12).
calendar_pairs = (
    [(2013, month) for month in range(4, 13)]
    + [(year, month) for year in range(2014, 2021) for month in range(1, 13)]
    + [(2021, month) for month in range(1, 10)]
)
year12 = [year for year, _ in calendar_pairs]
period12 = [month for _, month in calendar_pairs]

In [7]:
# One counter column per possible outcome; 'exception' is the label used when
# an article could not be fetched or scored.
column_name = ['positive', 'neutral', 'negative', 'exception']

# Zero-filled tally table keyed by (year, month), one row per entry of
# year12 / period12.
month_index = pd.MultiIndex.from_arrays([year12, period12], names=['year', 'month'])
sentiment_score_lr = pd.DataFrame(
    np.zeros((len(year12), len(column_name))),
    index=month_index,
    columns=column_name,
)
#sentiment_score_lstm = pd.DataFrame(np.zeros((len(year12), len(column_name))), index = [year12, period12], columns = column_name).reset_index().rename(columns={"level_0": "year", "level_1": "month"}).set_index(['year','month'])

### Reading the List of files + Reading in the Text + Sentiment Scoring

In [8]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from bs4 import BeautifulSoup
import sys, time

In [None]:
# Walk every GDELT archive listed in gdelt.csv, keep the US-related events,
# scrape every 500th event's article, classify it with lr_model and tally the
# predicted label per (year, month) in sentiment_score_lr. The tally is written
# to disk after each archive so partial progress survives an interrupted run.
gdelt_list = pd.read_csv(r"Data/GDELT/gdelt.csv", encoding= 'unicode_escape').set_index('filename')
# Column names for the header-less archive files, taken from the first row of the header file.
gdelt_colName = pd.read_csv(r"Data/GDELT/CSV.header.dailyupdates.txt", sep='\t', header=None).iloc[0].to_list()
start_time = time.time()

for gdelt_list_counter in range(0, len(gdelt_list)):

    # Defined up front so the per-archive summary below still works (and is
    # not stale) when an archive contains no US rows.
    current_time = time.time()

    # Download the archive into memory and read its first member; both the
    # HTTP response and the zip handle are closed once the table is loaded.
    with urlopen(gdelt_list.iloc[gdelt_list_counter]['hyperlink']) as zip_file_url:
        zip_bytes = BytesIO(zip_file_url.read())
    with ZipFile(zip_bytes) as zip_file:
        with zip_file.open(zip_file.namelist()[0]) as zip_member:
            document_list = pd.read_csv(zip_member, sep ='\t', header=None, names=gdelt_colName).set_index(['GLOBALEVENTID'])

    # Keep events whose three geo ADM1 codes all equal "US" and where at least
    # one of the two actors is coded "USA".
    US_document_list = document_list[(document_list['Actor1Geo_ADM1Code'] == "US") & (document_list['Actor2Geo_ADM1Code'] == "US") & (document_list['ActionGeo_ADM1Code'] == "US") & ((document_list['Actor1Code'] == "USA") | (document_list['Actor2Code'] == "USA"))]

    for doc_count in range(0,len(US_document_list)):

        # Only every 500th filtered event is fetched and scored.
        if (doc_count % 500 == 0):
            # The first data column is read as a YYYYMMDD value
            # (presumably GDELT's SQLDATE -- confirm against the header file).
            event_day = str(US_document_list.iloc[doc_count, 0])
            news_year = int(event_day[0:4])
            news_month = int(event_day[4:6])
            news_date = int(event_day[6:8])

            # The URL must come from the same filtered frame that doc_count
            # indexes; reading it from the unfiltered document_list would pick
            # an unrelated event's article. The last column is presumably
            # SOURCEURL -- confirm against the header file.
            news_url = US_document_list.iloc[doc_count, -1]
            try:
                with urlopen(news_url) as news_response:
                    soup = BeautifulSoup(news_response.read(), features="html.parser")
                for script in soup(["script", "style"]):    # kill all script and style elements
                    script.decompose()
                text = soup.get_text()    # get text
                lines = (line.strip() for line in text.splitlines())    # break into lines and remove leading and trailing space on each
                chunks = (phrase.strip() for line in lines for phrase in line.split("  "))    # break multi-headlines into a line each
                text = '\n'.join(chunk for chunk in chunks if chunk)    # drop blank lines
                sentiment_lr = lr_model.predict(pd.Series(text))[0]
                #sentiment_lstm = lstm_model.predict(pd.Series(text))[0]
            except Exception:
                # Best effort: any fetch/parse/predict failure is counted in the
                # 'exception' column. Exception (not a bare except) lets
                # KeyboardInterrupt and SystemExit stop the run.
                sentiment_lr = 'exception'
                #sentiment_lstm = 'exception'

            try:
                sentiment_score_lr.loc[(news_year, news_month), sentiment_lr] += 1
                #sentiment_score_lstm.loc[(news_year, news_month), sentiment_lstm] += 1
            except Exception:
                # e.g. a (year, month) or label that is not present in the tally table.
                print('Error!')

        current_time = time.time()
        sys.stdout.write("\rgdelt_list_counter = %s / %s ; doc_count = %s / %s ; time elasped: %s minutes." % (gdelt_list_counter + 1, len(gdelt_list), doc_count + 1, len(US_document_list), round((current_time-start_time)/60,2)))
        sys.stdout.flush()

    print("\rGEDLT progress: %s / %s ; Time Elasped: %s minutes." % (gdelt_list_counter + 1, len(gdelt_list), round((current_time-start_time)/60,2)))
    # Checkpoint the running tally after every archive.
    sentiment_score_lr.to_csv(r"Data/sentiment_score_lr.csv")
    #sentiment_score_lstm.to_csv(r"Data/sentiment_score_lstm.csv")

In [9]:
# Write the sentiment tally table to its CSV file.
sentiment_score_lr.to_csv(path_or_buf=r"Data/sentiment_score_lr.csv")
#sentiment_score_lstm.to_csv(r"Data/sentiment_score_lstm.csv")