# Import Relevant Libraries

In [7]:
import pandas as pd 
import numpy as np  
from   datetime import datetime
import glob
import pysentiment2 as ps
import nltk
from   nltk.sentiment.vader import SentimentIntensityAnalyzer
from   IPython.core.display import HTML
from   os import path

# Fetch the lexicon used by NLTK's VADER analyzer; this runs before the
# SentimentIntensityAnalyzer below is constructed.
nltk.download('vader_lexicon')
# Module-level analyzers shared by the sentimentAnalysis* helper functions.
lm    = ps.LM()    # pysentiment2 LM dictionary analyzer
hiv4  = ps.HIV4()  # pysentiment2 HIV4 dictionary analyzer
vader = SentimentIntensityAnalyzer()  # NLTK VADER analyzer

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and sequence-classification model for the
# "ProsusAI/finbert" checkpoint and wrap both in a sentiment-analysis
# pipeline, which sentimentAnalysisFinBert calls once per text.
finbert_tokenizer  = AutoTokenizer.from_pretrained("ProsusAI/finbert")
finbert_model      = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
finbert_classifier = pipeline("sentiment-analysis", model = finbert_model, tokenizer = finbert_tokenizer)

# Build an HTML snippet carrying a CSS rule for elements of class
# `.output_png` (table-cell display, centered horizontally and vertically).
# NOTE(review): presumably meant to center image output in the notebook — confirm.
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
""")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/maya/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Sentiment Analysis

In [8]:
def sentimentAnalysisLMD(text):
    """Score ``text`` with the module-level ``lm`` analyzer.

    The text is first reduced by ``lm.tokenize`` to the words the LM
    dictionary knows about, and those tokens are handed to ``lm.get_score``.

    Returns:
        The score dict produced by ``lm.get_score``, returned unchanged
        (it separates into negative, positive, polarity and subjectivity
        entries).
    """
    return lm.get_score(lm.tokenize(text))

def sentimentAnalysisHIV4(text):
    """Score ``text`` with the module-level ``hiv4`` analyzer.

    The text is tokenized with ``hiv4.tokenize`` and the resulting tokens
    are passed to ``hiv4.get_score``.

    Returns:
        The score dict produced by ``hiv4.get_score``, returned unchanged.
    """
    return hiv4.get_score(hiv4.tokenize(text))

def sentimentAnalysisVader(df):
    """Return the VADER compound score for every row's ``Text``.

    Args:
        df: Data with a ``Text`` column of strings (anything accepted by
            ``pd.DataFrame(df, columns=['Date', 'Text'])``).

    Returns:
        A float ``pd.Series`` named ``'vader_compound'`` that carries the
        same index as ``df``, so assigning it to a column of ``df`` lines up
        row by row.
    """
    texts = pd.DataFrame(df, columns=['Date', 'Text'])['Text']
    compound = [vader.polarity_scores(text)['compound'] for text in texts]
    # Build the result directly on the input's index. Joining a freshly
    # created 0..n-1 frame onto `df` only lines up when `df` itself has a
    # default RangeIndex, and fails with a KeyError when `df` is empty.
    return pd.Series(compound, index=texts.index, name='vader_compound', dtype=float)

def sentimentAnalysisFinBert(df):
    """Return a signed FinBERT score for every row's ``Text``.

    Each text is classified by the module-level ``finbert_classifier``
    (with truncation enabled) and the top result is mapped to a number:
    the classifier's score for a "positive" label, the negated score for
    a "negative" label, and 0 for any other label.

    Args:
        df: Data with a ``Text`` column of strings (anything accepted by
            ``pd.DataFrame(df, columns=['Date', 'Text'])``).

    Returns:
        A list of numbers, one per row, in row order.
    """
    texts = pd.DataFrame(df, columns=['Date', 'Text'])['Text']
    score = []
    # Iterate over the values themselves: `texts[i]` is a *label* lookup and
    # breaks (or picks the wrong row) when the index is not 0..n-1.
    for text in texts:
        classified = finbert_classifier(text, truncation=True)[0]
        label = classified['label']
        if label == "negative":
            score.append(classified['score'] * (-1))
        elif label == "positive":
            score.append(classified['score'])
        else:
            score.append(0)
    return score

In [9]:
# Per-article polarity columns that are averaged per calendar day.
POLARITY_COLUMNS = ['LMD_Polarity', 'HIV4_Polarity', 'Vader_Polarity', 'FinBert_Polarity']
# Destination of each company's daily sentiment file ({0} = company name).
OUTPUT_TEMPLATE = r'/Users/maya/OneDrive - Imperial College London/EE4/FYP/Final-Year-Project-main/TMF Sentiment/{0}.csv'

lst_companies = []

Path = "*.csv"
count = 0

# Every CSV in the working directory is treated as one company's articles.
lst_files = sorted(glob.glob(Path))

for file in lst_files:
    # The company name is the file name without its extension and without
    # surrounding whitespace (e.g. "Deutsche Telekom AG .csv").
    company_name = path.splitext(file)[0].strip()
    output_file = OUTPUT_TEMPLATE.format(company_name)

    # Skip companies that already have an output file. The check must use the
    # exact path written below; checking the raw input file name misses files
    # whose stripped company name differs from it, so they were recomputed on
    # every run.
    if path.isfile(output_file):
        continue

    print(file)
    df = pd.read_csv(file)
    df['Text'] = df['Text'].astype(str)
    df = df.drop_duplicates(['Date', 'Headline'], keep='last')
    df = df.drop(['Unnamed: 0', 'Headline'], axis=1)
    df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.date
    df.reset_index(drop=True, inplace=True)

    # Nothing to score for this company.
    if len(df) == 0:
        continue

    # LMD + HIV4: only the 'Polarity' entry of each score dict is kept.
    df['LMD_Polarity'] = df['Text'].apply(lambda text: sentimentAnalysisLMD(text)['Polarity'])
    df['HIV4_Polarity'] = df['Text'].apply(lambda text: sentimentAnalysisHIV4(text)['Polarity'])

    # VADER
    df['Vader_Polarity'] = sentimentAnalysisVader(df)

    # FINBERT
    df['FinBert_Polarity'] = sentimentAnalysisFinBert(df)

    # Find the average polarity for each dictionary, per given date. Only the
    # polarity columns are averaged: the string 'Text' column cannot be
    # averaged and makes a plain .mean() raise on pandas >= 2.0.
    df = df.groupby('Date')[POLARITY_COLUMNS].mean().reset_index()
    df = df.sort_values(by=['Date'], ascending=True)

    # Expand to one row per calendar day of the study window; days without
    # any article get NaN polarities.
    idx = pd.date_range('2015-02-03', '2021-12-31')
    df = df.set_index(['Date']).reindex(idx, fill_value=np.nan).rename_axis('Date').reset_index()

    lst_companies.append(company_name)
    df['Company'] = company_name
    df = df[['Date', 'Company'] + POLARITY_COLUMNS]

    count = count + 1
    print(count)
    df.to_csv(output_file)

Ameren Corporation.csv
American Assets Trust.csv
Aptiv Plc.csv
Archer Daniels Midland.csv
Barry Callebaut AG R.csv
Boral Limited.csv
Danone SA.csv
Deutsche Telekom AG .csv
nVent Electric Plc.csv
