# Model - Dictionary

In [1]:
NAME = '03-01_model_dictionaries'
PROJECT = 'conference-calls-sentiment'
PYTHON_VERSION = '3.7.0'

### Imports  

In [2]:
import os
import re
import pickle
import numpy as np
import pandas as pd

### Settings

In [3]:
workdir = re.sub("(?<={})[\w\W]*".format(PROJECT), "", os.getcwd())
os.chdir(workdir)

pipeline = os.path.join('2_pipeline', NAME)
if not os.path.exists(pipeline):
    os.makedirs(pipeline)
    for folder in ['out', 'store', 'tmp']:
        os.makedirs(os.path.join(pipeline, folder))

---
# Main Code
## IV-4 Harvard

In [14]:
# Read general inquirer spreadsheet
gi = pd.read_csv('http://www.wjh.harvard.edu/~inquirer/inqtabs.txt', sep='\t', usecols=['Entry', 'Source', 'Positiv', 'Negativ'])

iv4 = (gi.copy()
         .loc[gi['Source'].isin(['H4Lvd', 'H4'])]
         .assign(Entry=lambda x: x['Entry'].str.split('#').str[0].str.lower()
         .drop_duplicates()))

iv4_positive = list(iv4.loc[iv4['Positiv'] == 'Positiv', 'Entry'].dropna().unique())
iv4_negative = list(iv4.loc[iv4['Negativ'] == 'Negativ', 'Entry'].dropna().unique())

print(f"# Positive: {len(iv4_positive)}\n# Negative: {len(iv4_negative)}")

# Positive: 1563
# Negative: 1892


## Loughran & McDonald (2011)

In [15]:
lm__dictionary = pd.read_csv(os.path.join('0_data', 'lm_dictionary', 'LoughranMcDonald_SentimentWordLists_2018.csv'))
lm_positive = list(lm__dictionary['Positive'].str.lower().dropna().unique())
lm_negative = list(lm__dictionary['Negative'].str.lower().dropna().unique())

print(f"# Positive: {len(lm_positive)}\n# Negative: {len(lm_negative)}")

# Positive: 354
# Negative: 2355


## Save

In [16]:
sentiment_lists = [iv4_positive, iv4_negative, lm_positive, lm_negative]
file_names = ['iv4_positive', 'iv4_negative', 'lm_positive', 'lm_negative']

for sentiment_list, file_name in zip(sentiment_lists, file_names):
    with open(os.path.join(pipeline, 'out', f'{file_name}.pickle'), 'wb') as f:
        pickle.dump(sentiment_list, f)