### Import

In [1]:
import pandas as pd
import numpy as np
import pickle

In [25]:
def clean_text(text):
    """
    Applies some pre-processing to clean text data.
    
    In particular:
    - lowers the string
    - removes the character [']
    - replaces punctuation characters with spaces

    """
    
    text = text.lower()

    text = re.sub(r"\'", "", text)  # remove the character [']

    # removing the punctuation
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    split = " "

    if isinstance(text, str):
        translate_map = dict((ord(c), str(split)) for c in filters)
        text = text.translate(translate_map)
    elif len(split) == 1:
        translate_map = maketrans(filters, split * len(filters))
        text = text.translate(translate_map)
    else:
        for c in filters:
            text = text.replace(c, split)
    return text

### Load the json data

In [3]:
list_df = []
for i in range (10):
    list_df.append(pd.read_json(f'../cluster_data/0{i}.json', lines=True))
for i in (np.arange(10, 60, 1)):
    list_df.append(pd.read_json(f'../cluster_data/{i}.json', lines=True))

df = pd.concat(list_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  import sys


### Create a dictionary containing each tweet with the tag 

In [4]:
list_hashtags = []
dict_tot = dict()
for i in range(df.text.index.max()):
    dict_tmp = dict()
    if (type(df.text[i])==str):
        if (len(df.entities[i]['hashtags'])!=0):
            dict_tmp['hashtags'] = df.entities[i]['hashtags'][0]['text']
            dict_tmp['text'] = df.text[i]
        else :
            dict_tmp['hashtags'] = df.entities[i]['hashtags']
            dict_tmp['text'] = df.text[i]
        dict_tot[(i,-1)]=dict_tmp
    elif type(df.text[i])==pd.core.series.Series: 
        for j in range(len(df.text[i])): 
            dict_tmp = dict()
            if (type(df.text[i].reset_index().text[j])==str):
                if len(df.entities[i].reset_index().entities[j]['hashtags'])!=0:
                    dict_tmp['hashtags'] = df.entities[i].reset_index().entities[j]['hashtags'][0]['text']
                    dict_tmp['text'] = df.text[i].reset_index()['text'][j]
                else : 
                    dict_tmp['hashtags'] = df.entities[i].reset_index().entities[j]['hashtags']
                    dict_tmp['text'] = df.text[i].reset_index()['text'][j]
                dict_tot[(i,j)]=dict_tmp
pickle.dump(dict_tot, open('dictionary_with_all_tweets_and_hashtags.p', 'wb'))

In [5]:
dict_tot = pickle.load(open('dictionary_with_all_tweets_and_hashtags.p', 'rb'))

### Load the list of investor and compagny we are working on 

In [14]:
compagny = pickle.load(open('compagny.p', 'rb'))
investor = pickle.load(open('investor.p', 'rb'))
# Clean the name : 
for i in range(len(compagny)):
    compagny[i] = clean_text(compagny[i]).split()[0]
for i in range(len(investor)):
    investor[i] = clean_text(investor[i]).split()[0]

In [15]:
print(len(compagny))
print(len(investor))

2506
239


In [7]:
# Hard code data and try on a small subset
compagny = compagny[0:5]
investor = investor[0:5]
dict_tot[(0,0)]['text'] = 'ALTABA INC is a really nice compagny'
dict_tot[(0,1)]['text'] = 'I am never disappointed with ALTABA INC'
dict_tot[(0,2)]['text'] = 'ALTABA INC have nice product'
dict_tot[(0,3)]['text'] = 'ALTABA INC <3 :)'
dict_tot[(0,4)]['text'] = 'good bro !! ALTABA INC '
dict_tot[(0,5)]['text'] = 'CATERPILLAR INC DEL will kill the world'
dict_tot[(0,6)]['text'] = 'CATERPILLAR INC DEL is really bad'
dict_tot[(0,7)]['text'] = 'I am really angry against CATERPILLAR INC DEL '
dict_tot[(0,8)]['text'] = 'Shit !!! CATERPILLAR INC DEL '
dict_tot[(0,9)]['text'] = 'CATERPILLAR INC DEL :( :( :('
dict_tot[(0,10)]['text'] = 'SPDR S&P 500 ETF TR sell some vegetables'
dict_tot[(0,11)]['text'] = 'SPDR S&P 500 ETF TR play football'
dict_tot[(0,12)]['text'] = 'SPDR S&P 500 ETF TR is located in lausanne'
dict_tot[(0,13)]['text'] = 'SPDR S&P 500 ETF TR is in another country'
dict_tot[(0,14)]['text'] = '__ __ SPDR S&P 500 ETF TR'

### Create a dictionary using the compagny or investor as key grouping all the tweets about it
We consider that the tweet concerned a compagny/investor is there is his name in the tweet or the hashtag

In [16]:
def tweet_per_category(list_of_names, dict_tot):
    dict_per_category = dict()

    for name in list_of_names :
        list_tweet = []
        for key in list(dict_tot.keys()) :
            if (str(dict_tot[key]['hashtags']).lower().find(name.lower()) != -1) | (dict_tot[key]['text'].lower().find(name.lower()) != -1):
                list_tweet.append(dict_tot[key]['text'])
        dict_per_category[name] = list_tweet
    return dict_per_category

In [17]:
dict_per_compagny = tweet_per_category(compagny, dict_tot)
pickle.dump(dict_per_compagny, open('dictionary_per_compagny_tweet.p', 'wb'))

In [18]:
dict_per_investor = tweet_per_category(investor, dict_tot)
pickle.dump(dict_per_investor, open('dictionary_per_investor_tweet.p', 'wb'))

In [27]:
dict_tot = pickle.load(open('dictionary_per_compagny_tweet.p', 'rb'))

In [28]:
dict_tot = pickle.load(open('dictionary_per_investor_tweet.p', 'rb'))

### Make sentiment analysis for each tweet for each compagny/investors :

#### Vader
This library takes into account :
- positive sentence example
- punctuation emphasis handled correctly (sentiment intensity adjusted)
- booster words handled correctly (sentiment intensity adjusted)
- emphasis for ALLCAPS handled
- combination of signals - VADER appropriately adjusts intensity
-  booster words & punctuation make this close to ceiling for score
- negation sentence example
- positive sentence
- negated negative sentence with contraction
- qualified positive sentence is handled correctly (intensity adjusted)
- mixed negation sentence
- negative slang with capitalization emphasis
- mixed sentiment example with slang and constrastive conjunction "but"
- emoticons handled
- emojis handled
- Capitalized negation

In [19]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [20]:
def vader_analysis(dict_per_):
    analyzer = SentimentIntensityAnalyzer()
    dict_score = dict()
    for key in list(dict_per_.keys()):
        neg, pos, neu, compound, tmp_dict = [], [], [], [], dict()
        for sentence in dict_per_[key]:
            vs = analyzer.polarity_scores(sentence)
            neg.append(vs['neg'])
            neu.append(vs['neu'])
            pos.append(vs['pos'])
            compound.append(vs['compound'])
        tmp_dict['neg'] = np.mean(neg)
        tmp_dict['pos'] = np.mean(pos)
        tmp_dict['neu'] = np.mean(neu)
        tmp_dict['compound'] = np.mean(compound)
        dict_score[key] = tmp_dict 
        
    return dict_score

In [21]:
output = vader_analysis(dict_per_compagny)
pickle.dump(output, open('dictionary_per_compagny_score_vader.p', 'wb'))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [31]:
df = pickle.load(open('dictionary_per_compagny_score_vader.p','rb'))
df

{'altaba': {'neg': 0.0,
  'pos': 0.39083333333333337,
  'neu': 0.6091666666666667,
  'compound': 0.41979999999999995},
 'spdr': {'neg': 0.0,
  'pos': 0.057199999999999994,
  'neu': 0.9428000000000001,
  'compound': 0.068},
 'caterpillar': {'neg': 0.41950000000000004,
  'pos': 0.0,
  'neu': 0.5804999999999999,
  'compound': -0.5545},
 'dowdupont': {'neg': nan, 'pos': nan, 'neu': nan, 'compound': nan},
 'invesco': {'neg': nan, 'pos': nan, 'neu': nan, 'compound': nan},
 'schulman': {'neg': nan, 'pos': nan, 'neu': nan, 'compound': nan},
 'oasis': {'neg': 0.05925,
  'pos': 0.0375,
  'neu': 0.90325,
  'compound': -0.0388375},
 'heron': {'neg': 0.0,
  'pos': 0.03266666666666667,
  'neu': 0.9673333333333333,
  'compound': 0.025366666666666666},
 'devon': {'neg': 0.006363636363636364,
  'pos': 0.13236363636363635,
  'neu': 0.8612727272727273,
  'compound': 0.2666454545454546},
 'flexion': {'neg': 0.0, 'pos': 0.0, 'neu': 1.0, 'compound': 0.0},
 'select': {'neg': 0.0,
  'pos': 0.13899999999999998

In [22]:
output = vader_analysis(dict_per_investor)
pickle.dump(output, open('dictionary_per_investor_score_vader.p', 'wb'))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


#### FastText

In [23]:
import fastText
import re
import string
import os

In [24]:
MODEL_DIR_PATH = "../../../Downloads/"

model = fastText.load_model(os.path.join(MODEL_DIR_PATH, "amazon_review_full.bin"))

maketrans = str.maketrans

In [26]:
predict_sentiment = lambda s: model.predict(clean_text(s))

In [27]:
def fastext_prediction(dict_per_):
    dict_score = dict()
    for key in list(dict_per_.keys()):
        label, confidence, tmp_dict = [], [], dict()
        for sentence in dict_per_[key]:
            res = predict_sentiment(sentence)
            label.append(int(res[0][0][9]))
            confidence.append(res[1][0])
        tmp_dict['label'] = np.mean(label)
        tmp_dict['confidence'] = np.mean(confidence)
        dict_score[key] = tmp_dict 

    return dict_score

In [28]:
output = fastext_prediction(dict_per_compagny)
pickle.dump(output, open('dictionary_per_compagny_score_fastext.p', 'wb'))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [29]:
output = fastext_prediction(dict_per_investor)
pickle.dump(output, open('dictionary_per_investor_score_fastext.p', 'wb'))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [30]:
output

{'kingdon': {'label': nan, 'confidence': nan},
 'royal': {'label': 3.5945945945945947, 'confidence': 0.6167057704281163},
 'girard': {'label': nan, 'confidence': nan},
 'sandler': {'label': 1.0, 'confidence': 0.5504875779151917},
 'brookfield': {'label': nan, 'confidence': nan},
 'jayhawk': {'label': nan, 'confidence': nan},
 'compass': {'label': 3.1538461538461537, 'confidence': 0.5320910834349118},
 'bell': {'label': 3.122754491017964, 'confidence': 0.5994574547170879},
 'shelton': {'label': 4.0, 'confidence': 0.3571247458457947},
 'shine': {'label': 3.3009708737864076, 'confidence': 0.5968792966557938},
 'fortaleza': {'label': 3.6666666666666665, 'confidence': 0.47730281949043274},
 'new': {'label': 3.195890730796171, 'confidence': 0.5645335080667881},
 'virginia': {'label': 1.0, 'confidence': 0.46404390533765155},
 'honkamp': {'label': nan, 'confidence': nan},
 'osborne': {'label': 3.0, 'confidence': 0.7404687404632568},
 'tompkins': {'label': nan, 'confidence': nan},
 'hardman': {