In [12]:
import json
import pandas as pd
from textblob import TextBlob
from textblob_fr import PatternTagger, PatternAnalyzer
from vaderSentiment_fr.vaderSentiment import SentimentIntensityAnalyzer

In [13]:
# Module-level analyzer from vaderSentiment_fr, built once here and reused by
# get_labels_rule_based for every tweet it scores.
SIA = SentimentIntensityAnalyzer()

In [14]:
def load_json_data(dataset_name):
    """Load and parse ``../data/<dataset_name>.json``.

    Parameters
    ----------
    dataset_name : str
        File name without the ``.json`` extension; it is resolved relative
        to the ``../data/`` directory of the current working directory.

    Returns
    -------
    object
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Use a context manager so the handle is closed as soon as parsing is
    # done instead of lingering until garbage collection. The encoding is
    # explicit because JSON interchange is UTF-8 and the platform default
    # (locale-dependent) may not be.
    with open('../data/' + dataset_name + '.json', encoding='utf-8') as json_file:
        return json.load(json_file)

In [15]:
def preprocess_tweet(tweet_text):
    """Return the text of a tweet prepared for sentiment scoring.

    No transformation is applied at present: the argument is handed back
    unchanged, so this function is a placeholder hook for future cleaning.
    """
    processed_text = tweet_text
    return processed_text

In [16]:
def extract_tweets(dataset_name):
    """Map every tweet key of a dataset to the text used for labelling.

    The dataset is read with ``load_json_data(dataset_name)`` and is expected
    to be a mapping from tweet key to a record. For each record:

    * if ``record['tweet_data']`` is truthy, the text is
      ``preprocess_tweet(record['tweet_data']['text'])``;
    * otherwise the stored ``record['previous_processed_text']`` is used
      as-is (it is not passed through ``preprocess_tweet``).

    Returns
    -------
    dict
        Tweet key -> text, in the iteration order of the loaded dataset.
    """
    records = load_json_data(dataset_name)
    return {
        tweet_key: (
            preprocess_tweet(record['tweet_data']['text'])
            if record['tweet_data']
            else record['previous_processed_text']
        )
        for tweet_key, record in records.items()
    }

In [17]:
def get_labels_rule_based(tweet_text):
    """Label one text with both rule-based sentiment tools.

    The text is scored with the shared ``SIA`` analyzer and with a
    ``TextBlob`` built with ``PatternTagger`` / ``PatternAnalyzer``. Each
    score is then bucketed with a symmetric, strict cutoff:

    * VADER: the ``'compound'`` entry of ``polarity_scores``, cutoff 0.05;
    * TextBlob: element 0 of ``.sentiment`` (presumably the polarity --
      confirm against textblob_fr), cutoff 0.1.

    Returns
    -------
    tuple of (str, str)
        ``(vader_label, textblob_label)``, each one of ``'pos'``, ``'neg'``
        or ``'neu'``.
    """
    def bucket(score, cutoff):
        # Strictly above +cutoff is positive, strictly below -cutoff is
        # negative; everything in between (bounds included) is neutral.
        if score > cutoff:
            return 'pos'
        if score < -cutoff:
            return 'neg'
        return 'neu'

    vader_result = SIA.polarity_scores(tweet_text)
    blob_sentiment = TextBlob(
        tweet_text,
        pos_tagger=PatternTagger(),
        analyzer=PatternAnalyzer(),
    ).sentiment

    return (bucket(vader_result['compound'], 0.05), bucket(blob_sentiment[0], 0.1))

In [45]:
def training_sentiment():
    """Collect the stored sentiment label of every 'training' tweet.

    Returns
    -------
    pandas.DataFrame
        One column, ``'Training Labels'``, holding
        ``record['sentiment_label']`` for each record of the training JSON,
        in the iteration order of the loaded file, with the default integer
        index.

    Raises
    ------
    KeyError
        If a record has no ``'sentiment_label'`` entry.
    """
    # Read and parse the file a single time. The labels come straight from
    # the raw records, so there is no need to also build the text mapping
    # with extract_tweets. Row order is unchanged: extract_tweets walks the
    # same file in the same key order, so these rows still line up with
    # rule_based_sentiment('training').
    training = load_json_data('training')
    training_labels = [record['sentiment_label'] for record in training.values()]
    return pd.DataFrame({'Training Labels': training_labels})

In [46]:
def rule_based_sentiment(dataset_name):
    """Label every tweet of a dataset with both rule-based tools.

    Texts come from ``extract_tweets(dataset_name)`` and each one is passed
    to ``get_labels_rule_based``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'VADER Labels'`` and ``'TextBlob Labels'``, one row
        per tweet in the order yielded by ``extract_tweets``, with the
        default integer index. Tweet keys and texts are not included.
    """
    texts_by_tweet = extract_tweets(dataset_name)
    label_pairs = [get_labels_rule_based(text) for text in texts_by_tweet.values()]
    return pd.DataFrame({
        'VADER Labels': [pair[0] for pair in label_pairs],
        'TextBlob Labels': [pair[1] for pair in label_pairs],
    })

In [47]:
# Labels stored in the 'training' dataset (one 'Training Labels' column).
real_training = training_sentiment()
# VADER / TextBlob labels for the same 'training' dataset, so they can be
# compared row by row with the stored labels.
rule_based_training = rule_based_sentiment('training')
# VADER / TextBlob labels for the two prediction datasets.
rule_based_prediction_macron = rule_based_sentiment('prediction_macron')
rule_based_prediction_lepen = rule_based_sentiment('prediction_lepen')

In [50]:
# Place the stored training labels next to the rule-based labels (aligned on
# the row index), then keep only the first occurrence of each column name.
joined = (
    pd.concat([real_training, rule_based_training], axis=1)
    .pipe(lambda frame: frame.loc[:, ~frame.columns.duplicated()])
)

In [51]:
# Rows on which every label column holds the same value. Each column is
# compared with the first one in a single vectorized pass, which avoids
# calling Python's min()/max() once per row through DataFrame.apply.
joined[joined.eq(joined.iloc[:, 0], axis=0).all(axis=1)]

Unnamed: 0,Training Labels,VADER Labels,TextBlob Labels
6,neg,neg,neg
15,neu,neu,neu
25,neg,neg,neg
28,pos,pos,pos
29,neu,neu,neu
...,...,...,...
8568,neu,neu,neu
8582,neg,neg,neg
8584,pos,pos,pos
8586,pos,pos,pos
