### **This notebook gets the original tweets of posters and parses them**

In [121]:
import pandas as pd
import numpy as np
import datetime
import warnings
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from tqdm import tqdm
import sys
import os

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as vz
import helper.helper as hp
import helper.file_helper as file_hp
import config.config as config
import helper.pandas_helper as pd_hp

##### **Load configuration and file paths**

In [90]:
# Keep the loaded settings under their own name: rebinding `config` would
# shadow the imported module, and later cells reload that module and call
# `config.config()` again, which only works while `config` is the module.
settings = config.config()
path = settings['PATHS']
conversation_ids_5 = path['conversation_ids_5']

poster_path = settings['POSTER_PATH']
poster_original_tweets_file = poster_path['poster_original_tweets_file']

# Display the resolved path of the raw hydrated tweets file
poster_original_tweets_file

'/N/slate/potem/data/derived/poster_original_tweets/poster_original_tweets.jsonl'

##### **Get all poster original tweets**

In [28]:
def get_tweets(id_file, output_file):
    '''
    Get tweets from tweet id file by running `twarc2 hydrate`

    The command is started without a shell and both paths are passed as
    separate arguments, so paths containing spaces or shell metacharacters
    reach twarc2 unchanged (no shell expansion is performed on them).
    A failed run is reported through a warning instead of being ignored.

    :param id_file: file which contain tweet ids
    :param output_file: file which will save json file
    '''
    import subprocess
    import warnings

    command = ['twarc2', 'hydrate', str(id_file), str(output_file)]

    try:
        completed = subprocess.run(command, check=False)
    except FileNotFoundError:
        # A missing executable used to surface only as an ignored exit status;
        # keep it non-fatal but make it visible.
        warnings.warn('twarc2 executable was not found; no tweets were hydrated')
        return

    if completed.returncode != 0:
        warnings.warn(
            f'twarc2 hydrate exited with status {completed.returncode}'
        )
    
    
# get_tweets(conversation_ids_5,
#            poster_original_tweets_file)

##### **Parse all original tweets**

In [122]:
import json
import imp

# Use importlib (imported at the top of the notebook): the `imp` module is
# deprecated and no longer available from Python 3.12 on.
importlib.reload(config)

<module 'config.config' from '/geode2/home/u070/potem/Quartz/project/infoOps-strategy/package/config/config.py'>

In [123]:
# Read the settings without rebinding `config`, so the name keeps referring
# to the module and this cell gives the same result when it is run again.
poster_path = config.config()['POSTER_PATH']
poster_original_tweets_file = poster_path['poster_original_tweets_file']

In [124]:
# Output path for the parsed tweets; it is passed to parse_tweets() below,
# which pickles the resulting DataFrame there.
parsed_poster_org_tweets = poster_path['parsed_poster_org_tweets']

In [125]:
def get_empty_tweet_dict():
    '''
    Build a blank tweet record

    Each call returns a new dictionary that contains every field of the
    flat tweet record, all initialised to None.

    :return dictionary
    '''

    field_names = (
        'text',
        'conversation_id',
        'lang',
        'entities',
        'possibly_sensitive',
        'reply_settings',
        'created_at',
        'edit_history_tweet_ids',
        'tweetid',
        'author_id',
        'retweet_count',
        'reply_count',
        'like_count',
        'quote_count',
        'impression_count',
        'expanded_url',
        'display_url',
        'in_reply_to_user_id',
        'referenced_tweets',
        'context_annotations',
        'entity_annotations',
        'cashtags',
        'hashtags',
        'mentions',
    )

    # dict.fromkeys keeps the order above and defaults every value to None
    return dict.fromkeys(field_names)


def add_entity_values(tweet, values):
    '''
    Copy the entity information of an API tweet into the flat tweet record

    Fills expanded_url/display_url, entity_annotations, cashtags, hashtags
    and mentions, each only when the matching key exists under
    values['entities']. The record is updated in place and also returned;
    it is returned untouched when values has no 'entities' key.

    :param tweet: New tweet object which hold values
    :param values: Return object from twitter API

    :return dictionary
    '''

    if 'entities' not in values:
        return tweet

    entities = values['entities']

    if 'urls' in entities:
        # read both fields of each url together, then split into two lists
        url_pairs = [(item['expanded_url'], item['display_url'])
                     for item in entities['urls']]
        tweet['expanded_url'] = [expanded for expanded, _ in url_pairs]
        tweet['display_url'] = [display for _, display in url_pairs]

    if 'annotations' in entities:
        # only the type and probability of each annotation are kept
        tweet['entity_annotations'] = [
            {'type': item['type'], 'probability': item['probability']}
            for item in entities['annotations']
        ]

    # cashtags and hashtags are both reduced to their 'tag' text
    for tag_field in ('cashtags', 'hashtags'):
        if tag_field in entities:
            tweet[tag_field] = [item['tag'] for item in entities[tag_field]]

    if 'mentions' in entities:
        # each mention becomes a [username, id] pair
        tweet['mentions'] = [[item['username'], item['id']]
                             for item in entities['mentions']]

    return tweet


def add_public_metric(tweet, values):
    '''
    Copy the public metric counters of an API tweet into the tweet record

    Only the counters present under values['public_metrics'] are copied;
    the record is updated in place and also returned. It is returned
    untouched when values has no 'public_metrics' key.

    :param tweet: New tweet object
    :param values: Return object from Twitter API

    :return dictionary
    '''

    if 'public_metrics' not in values:
        return tweet

    metrics = values['public_metrics']
    counter_names = (
        'retweet_count',
        'reply_count',
        'like_count',
        'quote_count',
        'impression_count',
    )

    for counter in counter_names:
        if counter in metrics:
            tweet[counter] = metrics[counter]

    return tweet
   
    
def set_tweet_values(values):
    '''
    Build the flat tweet record for one tweet object of the API response

    The core fields are read unconditionally (a missing one raises
    KeyError); reply/reference fields, entities, public metrics and
    context annotations are copied only when present.

    :param values: Return object from Twitter API for a single tweet

    :return dictionary
    '''

    record = get_empty_tweet_dict()

    # (record field, API field) pairs that every tweet is expected to carry
    required_fields = (
        ('text', 'text'),
        ('conversation_id', 'conversation_id'),
        ('lang', 'lang'),
        ('possibly_sensitive', 'possibly_sensitive'),
        ('reply_settings', 'reply_settings'),
        ('created_at', 'created_at'),
        ('edit_history_tweet_ids', 'edit_history_tweet_ids'),
        ('tweetid', 'id'),
        ('author_id', 'author_id'),
    )
    for target, source in required_fields:
        record[target] = values[source]

    # fields that are copied only when the API returned them
    for optional in ('in_reply_to_user_id', 'referenced_tweets'):
        if optional in values:
            record[optional] = values[optional]

    record = add_entity_values(record, values)
    record = add_public_metric(record, values)

    if 'context_annotations' in values:
        record['context_annotations'] = values['context_annotations']

    return record

def set_values_for_tweet_with_error(values):
    '''
    Build a placeholder tweet record from one entry of the API 'errors' list

    Entries whose resource_type is not 'tweet', or whose detail text
    contains 'referenced_tweets', are skipped. Otherwise the record stores
    the error 'value' as tweetid, the 'detail' as text and the 'title' as
    created_at, with author_id left as None.

    :param values: return object of twitter API

    :return dictionary, or None when the entry is skipped
    '''

    is_other_resource = values['resource_type'] != 'tweet'
    if is_other_resource or 'referenced_tweets' in values['detail']:
        return None

    record = get_empty_tweet_dict()
    record.update(
        tweetid=values['value'],
        text=values['detail'],
        author_id=None,
        created_at=values['title'],
    )

    return record
    

def parse_tweets(tweet_file, output_file=None):
    '''
    Parse a JSON-lines file of API responses into a DataFrame

    Every non-blank line is decoded as one JSON object. Entries under its
    'errors' key are converted with set_values_for_tweet_with_error
    (entries for which it returns None are dropped) and tweets under its
    'data' key are converted with set_tweet_values.

    :param tweet_file: path of the JSON-lines file to read
    :param output_file: optional path; when given, the DataFrame is pickled there

    :return pandas DataFrame built from all collected records
    '''
    all_tweets = []

    # Read as UTF-8 explicitly instead of relying on the platform default
    with open(tweet_file, 'r', encoding='utf-8') as json_file:
        for row in json_file:
            if not row.strip():
                # tolerate blank lines instead of failing in json.loads
                continue

            one_row = json.loads(row)

            if 'errors' in one_row:
                for values in one_row['errors']:
                    tweet = set_values_for_tweet_with_error(values)
                    if tweet is not None:
                        all_tweets.append(tweet)

            if 'data' not in one_row:
                continue

            for tweet in one_row['data']:
                all_tweets.append(set_tweet_values(tweet))

    df = pd.DataFrame.from_records(data=all_tweets)

    if output_file is not None:
        df.to_pickle(output_file)

    return df
            
# Parse the hydrated responses and store the resulting DataFrame as a pickle
df = parse_tweets(tweet_file=poster_original_tweets_file,
                  output_file=parsed_poster_org_tweets)

96041
96041


In [128]:
# Reload the stored result and compare its row count with the number of
# distinct ids that were requested.
df = pd.read_pickle(parsed_poster_org_tweets)
ids = file_hp.read_file(conversation_ids_5)

unique_id_count = len(set(ids))
parsed_tweet_count = len(df)

print('Total original tweets :', unique_id_count)
print('Total parsed tweets :', parsed_tweet_count)

Total original tweets : 96041
Total parsed tweets : 96041


#####  **More info**
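`author_id` is `None` for tweets that could not be found; for those rows `text` holds the API error detail and `created_at` holds the error title.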

