In [54]:
from collections import Counter
import csv
from datetime import datetime
import glob
import re
from pprint import pprint

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
import requests
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np

In [55]:
import os

# Alpha Vantage API key. Prefer the AA_APIKEY environment variable so the credential
# does not have to live in the notebook; the literal is kept only as a fallback so
# existing runs keep working.
# NOTE(review): the fallback key is committed in plain text - rotate it and drop the default.
AA_APIKEY = os.environ.get('AA_APIKEY', 'KTTAYRUUXWIVVFBM')

In [56]:
# Load the CMU Pronouncing Dictionary into a plain dict; nsyl() looks words up in it
# by their lower-cased form.
from nltk.corpus import cmudict
cmud = cmudict.dict()
# Syllable counter: https://stackoverflow.com/questions/405161/detecting-syllables-in-a-word
def nsyl(word):
    """Return one syllable count per pronunciation that `cmud` lists for `word`.

    The lookup is case-insensitive. Within a pronunciation, every phoneme whose
    last character is a digit counts as one syllable. Words that are not in
    `cmud` yield `[0]`.
    """
    key = word.lower()
    if key not in cmud:
        return [0]
    counts = []
    for pronunciation in cmud[key]:
        counts.append(sum(1 for phoneme in pronunciation if phoneme[-1].isdigit()))
    return counts

In [57]:
# Load the Loughran-McDonald financial master dictionary.
# Keys are the lower-cased 'Word' column; each value is the full CSV row.
lm_masterdic = {}
# newline='' is what the csv module asks for when it is handed a file object, so that
# line breaks inside quoted fields are parsed by the reader and not by the file layer.
with open('corpus/LoughranMcDonald_MasterDictionary_2014.csv', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        lm_masterdic[row['Word'].lower()] = row

In [58]:
# Display one entry to show which columns every dictionary row carries.
lm_masterdic['challenges']

OrderedDict([('Word', 'CHALLENGES'),
             ('Sequence Number', '11487'),
             ('Word Count', '310301'),
             ('Word Proportion', '2.18E-05'),
             ('Average Proportion', '1.54E-05'),
             ('Std Dev', '5.59E-05'),
             ('Doc Count', '141958'),
             ('Negative', '2009'),
             ('Positive', '0'),
             ('Uncertainty', '0'),
             ('Litigious', '0'),
             ('Constraining', '0'),
             ('Superfluous', '0'),
             ('Interesting', '0'),
             ('Modal', '0'),
             ('Irr_Verb', '0'),
             ('Harvard_IV', '2'),
             ('Syllables', '3'),
             ('Source', '12of12inf')])

In [59]:
# Collect the paths of all earnings-call transcripts (the .txt files under data/).
ecalls = glob.glob('data/*.txt')

In [60]:
# Regular-expression tokenizer that keeps runs of word characters and drops punctuation.
# Note that \w also matches digits and underscores, so numbers are kept as tokens.
tokenizer = RegexpTokenizer(r'\w+')

In [61]:
# Score every earnings-call transcript: share of negative words, a positive/negative
# sentiment ratio and a Gunning fog readability index. One dict per transcript is
# appended to ecalls_data; the matched positive/negative words are also accumulated
# as space-separated text across all transcripts.
ecalls_data = []
negative_words_text = ''
positive_words_text = ''
# Build the stop-word set once; evaluating stopwords.words('english') inside the
# comprehension repeats the call and a linear list search for every single token.
english_stopwords = set(stopwords.words('english'))
for ec in ecalls:
    # The file name must contain the ticker in parentheses and a quarter such as "Q1 2017";
    # a name that does not match raises AttributeError here.
    ticker = re.search(r'\((?P<ticker>\w+)\)', ec).group('ticker')
    quarter = re.search(r'Q(?P<quarter>\d{1}\s\d{4})', ec).group('quarter').replace(' ','')
    p_fname = ticker + '-' + quarter
    # Read through a context manager so the file handle is always closed.
    with open(ec) as ec_file:
        ec_text = ec_file.read()
    # Drop the Question-and-Answer section. split() leaves the text untouched when the
    # marker is missing, whereas slicing with find()'s -1 would cut the last character.
    ec_text = ec_text.split('Question-and-Answer Session', 1)[0]
    ecall_no_punc = tokenizer.tokenize(ec_text)
    # Every token counts towards the total number of words.
    total_words = ecall_no_punc
    # Remove common (stop) words; title-casing merges case variants of the same word.
    cleaned_ecall = [w.title() for w in ecall_no_punc if w.lower() not in english_stopwords]
    total_tokens = len(cleaned_ecall)
    word_count = Counter(cleaned_ecall)
    word_sentiment = {}
    positive_words = 0
    negative_words = 0
    complex_words = 0
    # NOTE(review): this loop walks the distinct words of the transcript, so
    # positive_words, negative_words and complex_words count each word once, while the
    # denominators used below count every occurrence - confirm this is intended.
    for word in word_count:
        word_proportion = word_count[word] / float(total_tokens)
        wl = word.lower()
        if wl in lm_masterdic:
            # 'questions' and '0' are explicitly excluded from the negative words.
            if int(lm_masterdic[wl]['Negative']) > 0 and wl not in ['questions', '0']:
                word_sentiment[wl] = word_proportion
                negative_words += 1
                negative_words_text += ' ' + wl

            elif int(lm_masterdic[wl]['Positive']) > 0:
                positive_words += 1
                positive_words_text += ' ' + wl

        # A word is "complex" when its first listed pronunciation has 3+ syllables.
        if nsyl(word)[0] >= 3:
            complex_words += 1

    total_count = sum(word_count.values())
    # Text sentiment: the higher the number, the better.
    if negative_words:
        sentiment = (positive_words / float(negative_words)) / total_count
    elif positive_words:
        # No negative words at all: rank as the best possible sentiment instead of
        # raising ZeroDivisionError.
        sentiment = float('inf')
    else:
        sentiment = 0.0
    # Proportion of negative words: the higher the number, the worse.
    negative_proportion = float(negative_words) / total_count if total_count else 0.0

    # Gunning fog index -> https://en.wikipedia.org/wiki/Gunning_fog_index
    sentence_tokenize_list = sent_tokenize(ec_text)
    sentence_count = len(sentence_tokenize_list)
    if sentence_count and total_words:
        avg_sentence_length = len(total_words) / sentence_count
        pct_complex_words = complex_words / float(len(total_words)) * 100
    else:
        # Empty transcript: avoid dividing by zero and report a neutral score.
        avg_sentence_length = 0.0
        pct_complex_words = 0.0
    # Readability metric: the higher, the worse.
    gunning_fog = (avg_sentence_length + pct_complex_words) * 0.4

    data = {
        'fname': p_fname,
        'text': ec_text,
        'negative_proportion': negative_proportion,
        'sentiment': sentiment,
        'gunning_fog': gunning_fog
    }

    ecalls_data.append(data)

In [62]:
# Transcripts ordered from the highest to the lowest negative_proportion.
neg_rank = sorted(ecalls_data, key=lambda x: x['negative_proportion'], reverse=True)

In [63]:
# Transcripts ordered by ascending sentiment score (lowest first).
sentiment_rank = sorted(ecalls_data, key=lambda x: x['sentiment'])

In [64]:
# Transcripts ordered by ascending Gunning fog index (lowest first).
gunning_fog_rank = sorted(ecalls_data, key=lambda x: x['gunning_fog'])

In [65]:
def get_quarter(date_str):
    """Map a 'YYYY-MM' (or 'YYYY-MM-DD') string to a 'YYYY-Qn' quarter label.

    Only quarter-end months are recognised: 03 -> Q1, 06 -> Q2, 09 -> Q3, 12 -> Q4.
    Any other month, or a string without a '-' separated month field, returns None.
    """
    parts = date_str.split('-')
    if len(parts) < 2:
        return None
    # Compare the month field itself instead of searching the whole string, so that a
    # day such as the '-03' in '2017-06-03' cannot be mistaken for the month.
    quarter = {'03': 'Q1', '06': 'Q2', '09': 'Q3', '12': 'Q4'}.get(parts[1])
    if quarter is None:
        return None
    return parts[0] + '-' + quarter

In [73]:
# For each ticker, print the transcript rankings next to the quarter-over-quarter
# price changes ranked from worst to best.

# Years whose quarterly price changes are compared with the transcripts (2014 and 2018
# are ignored). Filtering by year keeps the window fixed no matter when the notebook is
# re-run; positional slicing would drift as the API returns newer months.
ANALYSIS_YEARS = ('2015', '2016', '2017')

for ticker in ["AMZN","EBAY","MELI"]:

    # fname is '<ticker>-<quarter>': compare the part before the dash so the filter
    # also works for tickers that are not exactly four characters long.
    t_neg_rank = [nr['fname'] + '->' + str(nr['negative_proportion']) for nr in neg_rank if nr['fname'].split('-')[0] == ticker]
    t_sent_rank = [nr['fname'] + '->' + str(nr['sentiment']) for nr in sentiment_rank if nr['fname'].split('-')[0] == ticker]
    t_gunning_fog_index = [nr['fname'] + '->' + str(nr['gunning_fog']) for nr in gunning_fog_rank if nr['fname'].split('-')[0] == ticker]

    # Let requests build the query string, bound the wait with a timeout and fail
    # loudly on HTTP errors.
    response = requests.get(
        'https://www.alphavantage.co/query',
        params={'function': 'TIME_SERIES_MONTHLY', 'symbol': ticker, 'apikey': AA_APIKEY},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()
    if 'Monthly Time Series' not in payload:
        # The response carried no monthly series; show what came back instead of
        # failing with a bare KeyError.
        raise RuntimeError('Alpha Vantage returned no monthly series for %s: %s' % (ticker, payload))
    monthly_data = payload['Monthly Time Series']

    # Keep the closing price of quarter-end months (March, June, September, December)
    # from 2014 onwards, labelled 'YYYY-Qn' and sorted chronologically.
    q_performance = []
    for k, v in monthly_data.items():
        month_end = datetime.strptime(k, "%Y-%m-%d")
        if month_end.year >= 2014 and month_end.month % 3 == 0:
            q_performance.append((get_quarter(k[:-3]), float(v['4. close'])))
    q_performance.sort()

    # Percentage change of each quarter's close versus the previous quarter's close;
    # the first quarter has no predecessor and is therefore skipped.
    ticker_perf = []
    for (_, prev_close), (label, close) in zip(q_performance, q_performance[1:]):
        perc_diff = ((close - prev_close) / prev_close) * 100
        ticker_perf.append([label, perc_diff])

    # Ignore 2014 and 2018, then rank the remaining quarters from worst to best.
    perf_sorted = sorted(
        (perf for perf in ticker_perf if perf[0][:4] in ANALYSIS_YEARS),
        key=lambda x: x[1],
    )

    # zip() pairs the i-th ranked transcript with the i-th ranked quarter and stops
    # at the shorter of the two lists.
    for title, ranked in (
        ('Sentiment Rank', t_sent_rank),
        ('Negative Rank', t_neg_rank),
        ('Gunning Fog Index', t_gunning_fog_index),
    ):
        print('**** ' + title + ' ' + ticker + '**** \n')
        pprint(list(zip(ranked, perf_sorted)))

**** Sentiment RankAMZN**** 

[('AMZN-42016->0.0008291873963515754', ['2016-Q1', -12.169139948808239]),
 ('AMZN-22017->0.000992063492063492', ['2016-Q4', -10.442966165458426]),
 ('AMZN-32016->0.001001001001001001', ['2017-Q3', -0.6869834710743778]),
 ('AMZN-32017->0.001019367991845056', ['2017-Q2', 9.188530692354552]),
 ('AMZN-22015->0.0010498687664041995', ['2015-Q2', 16.65950013437247]),
 ('AMZN-32015->0.0010830324909747292', ['2016-Q3', 17.004834968279244]),
 ('AMZN-22016->0.001085383502170767', ['2015-Q3', 17.92255062314267]),
 ('AMZN-12015->0.0010903426791277258', ['2017-Q1', 18.225825809807027]),
 ('AMZN-12017->0.0011086474501108647', ['2015-Q1', 19.896890607378765]),
 ('AMZN-12016->0.001147227533460803', ['2016-Q2', 20.54780675156661]),
 ('AMZN-42015->0.0012919896640826874', ['2017-Q4', 21.648723149737346]),
 ('AMZN-42017->0.00186219739292365', ['2015-Q4', 32.03813319267812])]
**** Negative RankAMZN**** 

[('AMZN-32017->0.009174311926605505', ['2016-Q1', -12.169139948808239]),
 