In [None]:
# Import libraries
import re
import json
import emoji
import numpy as np
import pandas as pd
from datetime import datetime as dt
from nltk.tokenize.casual import TweetTokenizer

In [None]:
# Extract json into dictionary
def load_json_data(dataset_name):
    """Load ``../data/<dataset_name>.json`` and return the parsed content.

    Args:
        dataset_name: File name without the ``.json`` extension. The file is
            looked up in ``../data/`` relative to the current working
            directory.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle as soon as parsing is done, and
    # the explicit UTF-8 keeps non-ASCII text (accents, emojis) decoding the
    # same way regardless of the platform's default encoding.
    with open('../data/' + dataset_name + '.json', encoding='utf-8') as json_file:
        return json.load(json_file)

In [None]:
# Function to get sentiment labels of tweets by ID (training dataset only)
def sentiment_labels():
    """Return the sentiment label of every tweet in the 'training' dataset.

    Returns:
        A DataFrame with one row per dataset entry and the columns
        'Tweet ID' (the entry's key) and 'Sentiment Label' (the entry's
        'sentiment_label' value).
    """
    training_set = load_json_data('training')
    ids = [tweet_id for tweet_id, _ in training_set.items()]
    sentiments = [info['sentiment_label'] for _, info in training_set.items()]
    return pd.DataFrame({'Tweet ID': ids, 'Sentiment Label': sentiments})

In [None]:
# Function to collect all tags (mentions & hashtags) of tweets in dataset
def get_tags(dataset_name, tag_type):
    """Collect the tags of one kind attached to each tweet of a dataset.

    Args:
        dataset_name: Name of the dataset passed to ``load_json_data``.
        tag_type: Key to read under ``tweet_data['entities']``. For
            'hashtags' the 'tag' field of each entity is collected; for any
            other value the 'username' field is collected.

    Returns:
        A DataFrame with the columns 'Tweet ID' and ``tag_type.capitalize()``.
        A tweet contributes one row per tag. A tweet without any tag of that
        kind contributes exactly one row whose tag is None, so every tweet of
        the dataset appears at least once.
    """
    dataset = load_json_data(dataset_name)
    tag_identifier = 'tag' if tag_type == 'hashtags' else 'username'
    tweet_ids = []
    tweet_tags = []

    for tweet, details in dataset.items():
        tweet_data = details['tweet_data'] or {}
        entities = tweet_data.get('entities') or {}
        tags = entities.get(tag_type) or []
        if tags:
            for tag in tags:
                tweet_ids.append(tweet)
                tweet_tags.append(tag[tag_identifier])
        else:
            # Also reached when the entity list exists but is empty: without
            # the placeholder row such a tweet would vanish from the result.
            tweet_ids.append(tweet)
            tweet_tags.append(None)
    return pd.DataFrame({'Tweet ID': tweet_ids, tag_type.capitalize(): tweet_tags})

In [None]:
# Function to collect sources (devices) of tweets in dataset
def get_sources(dataset_name):
    """Return the 'source' field of every tweet in a dataset.

    Args:
        dataset_name: Name of the dataset passed to ``load_json_data``.

    Returns:
        A DataFrame with the columns 'Tweet ID' and 'Tweet Source'. The
        source is None for entries whose 'tweet_data' is empty or missing.
    """
    records = load_json_data(dataset_name)
    ids, sources = [], []

    for tweet_id, info in records.items():
        ids.append(tweet_id)
        payload = info['tweet_data']
        if payload:
            sources.append(payload['source'])
        else:
            sources.append(None)

    return pd.DataFrame({'Tweet ID': ids, 'Tweet Source': sources})

In [None]:
# Function to see which users are verified or not
def get_verifications(dataset_name):
    """Return the 'verified' flag of the author of every tweet in a dataset.

    Args:
        dataset_name: Name of the dataset passed to ``load_json_data``.

    Returns:
        A DataFrame with the columns 'Tweet ID' and 'User Verified'. The flag
        is None for entries whose 'user_data' is empty or missing.
    """
    records = load_json_data(dataset_name)
    ids = []
    verified_flags = []

    for tweet_id, info in records.items():
        ids.append(tweet_id)
        author = info['user_data']
        verified_flags.append(author['verified'] if author else None)

    return pd.DataFrame({'Tweet ID': ids, 'User Verified': verified_flags})

In [None]:
# Function to calculate time difference between date of tweet and creation of the user account
def calculate_time_difference(dataset_name):
    """Compute, per tweet, the days elapsed since the author's account creation.

    Args:
        dataset_name: Name of the dataset passed to ``load_json_data``.

    Returns:
        A DataFrame with the columns 'Tweet ID' and 'Time Difference'. The
        difference is the ``days`` component of
        ``tweet created_at - user created_at``. It is None when either the
        'tweet_data' or the 'user_data' of the entry is empty or missing.

    Raises:
        ValueError: If a 'created_at' value does not match the expected
            ``%Y-%m-%dT%H:%M:%S.%fZ`` layout.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    dataset = load_json_data(dataset_name)
    tweet_ids = []
    time_difference = []

    for tweet, details in dataset.items():
        tweet_ids.append(tweet)
        # Both payloads are needed: guarding only 'tweet_data' would fail on
        # entries whose 'user_data' is empty.
        if details['tweet_data'] and details['user_data']:
            user_created_at = dt.strptime(details['user_data']['created_at'], timestamp_format)
            tweet_created_at = dt.strptime(details['tweet_data']['created_at'], timestamp_format)
            time_difference.append((tweet_created_at - user_created_at).days)
        else:
            time_difference.append(None)

    return pd.DataFrame({'Tweet ID': tweet_ids, 'Time Difference': time_difference})

In [None]:
def tweet_text(dataset_name):
    """Return the raw text of every tweet in a dataset.

    Args:
        dataset_name: Name of the dataset passed to ``load_json_data``.

    Returns:
        A DataFrame with the columns 'Tweet ID' and 'Tweet Text'. The text is
        None for entries whose 'tweet_data' is empty or missing.
    """
    records = load_json_data(dataset_name)
    ids = []
    texts = []

    for tweet_id, info in records.items():
        ids.append(tweet_id)
        payload = info['tweet_data']
        if not payload:
            texts.append(None)
            continue
        texts.append(payload['text'])

    return pd.DataFrame({'Tweet ID': ids, 'Tweet Text': texts})

In [None]:
def character_count(dataset_name):
    """Count the characters of every tweet in a dataset, links excluded.

    Args:
        dataset_name: Name of the dataset passed to ``load_json_data``.

    Returns:
        A DataFrame with the columns 'Tweet ID' and 'Character Count'. The
        count is the length of the text after every ``http...`` token has
        been removed, or NaN for entries whose 'tweet_data' is empty or
        missing.
    """
    records = load_json_data(dataset_name)
    ids = []
    lengths = []

    for tweet_id, info in records.items():
        ids.append(tweet_id)
        payload = info['tweet_data']
        if not payload:
            lengths.append(np.nan)
            continue
        # Strip links before measuring so URLs do not inflate the length.
        without_links = re.sub(r'http\S+', '', payload['text'])
        lengths.append(len(without_links))

    return pd.DataFrame({'Tweet ID': ids, 'Character Count': lengths})

In [None]:
def get_emojis(dataset_name):
    """Collect the emoji characters found in the text of each tweet.

    Args:
        dataset_name: Name of the dataset passed to ``load_json_data``.

    Returns:
        A DataFrame with the columns 'Tweet ID' and 'Emoji', holding one row
        per emoji occurrence. Tweets without emojis, and entries whose
        'tweet_data' is empty or missing, contribute no rows.
    """
    dataset = load_json_data(dataset_name)
    # The per-language UNICODE_EMOJI tables are not available in emoji >= 2.0
    # (replaced by the single EMOJI_DATA mapping); keep using the French
    # table where it exists and fall back to EMOJI_DATA otherwise.
    legacy_tables = getattr(emoji, 'UNICODE_EMOJI', None)
    known_emojis = legacy_tables['fr'] if legacy_tables is not None else emoji.EMOJI_DATA
    tweet_ids = []
    emojis = []

    for tweet, details in dataset.items():
        if details['tweet_data']:
            # NOTE: the text is scanned one code point at a time, so a
            # multi-code-point sequence is not reported as a single emoji.
            for char in details['tweet_data']['text']:
                if char in known_emojis:
                    tweet_ids.append(tweet)
                    emojis.append(char)

    return pd.DataFrame({'Tweet ID': tweet_ids, 'Emoji': emojis})

In [None]:
# Build one feature table per aspect of the 'training' dataset.
# Every table carries a 'Tweet ID' column.

# Target variable
labels = sentiment_labels()

# Features derived from the tweet itself
text = tweet_text('training')
char_count = character_count('training')
emojis = get_emojis('training')
hashtags = get_tags('training', 'hashtags')
mentions = get_tags('training', 'mentions')
sources = get_sources('training')

# Features derived from the author's account
verifications = get_verifications('training')
time_differences = calculate_time_difference('training')

In [None]:
# Class balance: number of training tweets per sentiment label.
labels.loc[:, 'Sentiment Label'].value_counts()

In [None]:
# Number of tweets with at least one hashtag / mention (True) versus none (False).
# count() only counts non-null tags, so a tweet with no tag ends up with 0.
print(hashtags.groupby('Tweet ID').count().ge(1).value_counts())
print(mentions.groupby('Tweet ID').count().ge(1).value_counts())

In [None]:
# Tweet counts per (sentiment label, source) pair, ordered by label and then
# by count, both descending. After count() the 'Tweet ID' column holds the
# number of tweets; only pairs with more than 100 tweets are shown.
(
    labels.merge(sources, how='inner')
    .groupby(['Sentiment Label', 'Tweet Source'])
    .count()
    .sort_values(['Sentiment Label', 'Tweet ID'], ascending=False)
    .loc[lambda counts: counts['Tweet ID'] > 100]
)

In [None]:
# Tweet counts per (sentiment label, verified flag) pair, ordered by label and
# then by count, both descending. After count() the 'Tweet ID' column holds
# the number of tweets.
(
    labels.merge(verifications, how='inner')
    .groupby(['Sentiment Label', 'User Verified'])
    .count()
    .sort_values(['Sentiment Label', 'Tweet ID'], ascending=False)
)

In [None]:
# Emoji occurrences per (sentiment label, emoji) pair, ordered by label and
# then by count, both descending. After count() the 'Tweet ID' column holds
# the number of occurrences; only pairs seen more than 25 times are shown.
(
    labels.merge(emojis, how='inner')
    .groupby(['Sentiment Label', 'Emoji'])
    .count()
    .sort_values(['Sentiment Label', 'Tweet ID'], ascending=False)
    .loc[lambda counts: counts['Tweet ID'] > 25]
)

In [None]:
# Average 'Time Difference' (days between account creation and tweet) per
# sentiment label. The numeric column is selected explicitly because the
# merged frame also carries the string 'Tweet ID' column, which mean() cannot
# average (pandas >= 2.0 raises a TypeError instead of dropping it).
pd.merge(labels, time_differences, how='inner').groupby(['Sentiment Label'])[['Time Difference']].mean()

In [None]:
# Average 'Character Count' (links removed) per sentiment label. The numeric
# column is selected explicitly because the merged frame also carries the
# string 'Tweet ID' column, which mean() cannot average (pandas >= 2.0 raises
# a TypeError instead of dropping it).
pd.merge(labels, char_count, how='inner').groupby(['Sentiment Label'])[['Character Count']].mean()