# 1. Importing Libraries and Defining Constants

In [264]:
import json
import pandas as pd
from datetime import datetime

In [225]:
# Encoding passed to open() when reading the archive files and writing JSON output.
# The tweets are in Arabic and English and contain some emojis.
ENCODING = 'utf-8'

In [243]:
'''
Module-level containers, one list per column. Parsed values are appended to these lists
and the dictionaries are later turned into dataframes.

Engagement statistics (tweets_data, retweets_data, circle_tweets_data):
    {
        'likes': [],
        'retweets': [],
        'date': [],
        'hour': [],
        'language': [],
    }

Tweet text and entities (tweet_content):
    {
        'content': [],
        'language': [],
        'mentions': [],
        'hashtags': [],
        'tweet/retweet': [],  # 0 for tweet, 1 for retweet
        'is_quote': [],       # 0 for not quote, 1 for quote
        'date': [],
        'hour': [],
    }

circle_tweet_content has the same columns as tweet_content except 'tweet/retweet'.
'''
# Column order matters: it becomes the column order of the resulting dataframes.
_STATS_COLUMNS = ('likes', 'retweets', 'date', 'hour', 'language')
_TEXT_COLUMNS = ('content', 'language', 'mentions', 'hashtags')
_QUOTE_AND_TIME_COLUMNS = ('is_quote', 'date', 'hour')

# Every dictionary gets its own fresh lists (a comprehension, never dict.fromkeys,
# which would share a single list between all keys).
tweets_data = {column: [] for column in _STATS_COLUMNS}

retweets_data = {column: [] for column in _STATS_COLUMNS}

tweet_content = {
    column: []
    for column in _TEXT_COLUMNS + ('tweet/retweet',) + _QUOTE_AND_TIME_COLUMNS
}

circle_tweets_data = {column: [] for column in _STATS_COLUMNS}

circle_tweet_content = {
    column: [] for column in _TEXT_COLUMNS + _QUOTE_AND_TIME_COLUMNS
}

# Human-readable labels for the values stored under 'tweet/retweet'.
tweet_or_retweet = {0: 'tweet', 1: 'retweet'}

In [265]:
# Container for the personalization export; every field starts out as an empty list
# and is overwritten when the personalization data is read.
personalization_data = {
    field: []
    for field in (
        'languages',
        'gender_info',
        'interests',
        'inferred_age_info',
        'location_history',
        'shows',
    )
}

In [302]:
# Container for the ad data. Both entries are lists of tuples:
#   'targeting type/targeting value' -> (targeting type, targeting value, date)
#   'engagements'                    -> (engagement, date)
ad_engagements_data = {}
ad_engagements_data['targeting type/targeting value'] = []
ad_engagements_data['engagements'] = []

# 2. Importing Data

Data will be read and processed in this section.

### Defining functions

In [267]:
def read_file(filename):
    '''
    This function reads an archive file using ENCODING, discards everything up to and
    including the first "=" sign, and parses the remainder as JSON.
    ---
    Parameters:
        filename: str
    ---
    Returns:
        data: the parsed JSON value
    '''
    with open(filename, 'r', encoding=ENCODING) as archive:
        raw_text = archive.read()

    # Only the first "=" is treated as the separator; any later "=" belongs to the payload.
    _, _, payload = raw_text.partition("=")
    return json.loads(payload)

In [268]:
def convert_to_datetime(timestamp_str):
    '''
    This function parses a timestamp string of the form
    "Wed Oct 10 20:19:24 +0000 2018" and returns its date and hour.
    ---
    Parameters:
        timestamp_str: str
    ---
    Returns:
        date: datetime.date (rendered as YYYY-MM-DD when converted to text)
        time: int HH (24 Hour)
    '''
    parsed = datetime.strptime(timestamp_str, '%a %b %d %H:%M:%S %z %Y')

    # Minutes and seconds are truncated as they do not add to the statistics.
    return parsed.date(), parsed.hour

In [231]:
def get_mentions(entities):
    '''
    This function collects the screen name of every entry under 'user_mentions'
    in the entities object, in their original order.
    ---
    Parameters:
        entities: json object
    ---
    Returns:
        mentions_list: list
    '''
    return [mention['screen_name'] for mention in entities['user_mentions']]

In [232]:
def is_quote(entities):
    '''
    This function reports whether any entry under 'urls' in the entities object has a
    'url' value containing 'https://t.co'; such a tweet is treated as a quote.
    ---
    Parameters:
        entities: json object
    ---
    Returns:
        is_quote: bool
    '''
    # NOTE(review): this matches any URL containing 'https://t.co', so it may also flag
    # tweets that merely share a shortened link - confirm this is the intended definition.
    return any('https://t.co' in link['url'] for link in entities['urls'])

In [241]:
def populate_dictionaries(data, is_circle=False):
    '''
    This function takes in a list of tweet entries and appends one row per entry to the
    module-level dictionaries.
    ---
    Parameters:
        data: json object (list of entries, each holding the tweet under the 'tweet' key)
        is_circle: bool. When False (default) the statistics go to retweets_data or
            tweets_data (depending on whether the entry is a retweet) and the content goes
            to tweet_content. When True everything goes to circle_tweets_data and
            circle_tweet_content, with no tweet/retweet distinction.
    ---
    Returns:
        None
    '''

    for entry in data:
        tweet_data = entry['tweet']
        full_text = tweet_data['full_text']
        entities = tweet_data['entities']

        # Retweets are recognised by the "RT @user" prefix. Checking only the first two
        # characters would also match ordinary tweets that start with the letters "RT".
        is_retweet = full_text.startswith('RT @')
        date, hour = convert_to_datetime(tweet_data['created_at'])
        language = tweet_data.get('lang', 'und')  # 'und' when the entry has no language

        # Build both rows completely before appending anything, so a missing field
        # raises before any list has grown and all columns keep the same length.
        stats_row = {
            'likes': tweet_data['favorite_count'],
            'retweets': tweet_data['retweet_count'],
            'date': date,
            'hour': hour,
            'language': language,
        }
        content_row = {
            'content': full_text,
            'language': language,
            'mentions': get_mentions(entities=entities),
            'hashtags': list(entities['hashtags']),
            'is_quote': int(is_quote(entities=entities)),
            'date': date,
            'hour': hour,
        }

        if is_circle:
            stats_target = circle_tweets_data
            content_target = circle_tweet_content
        else:
            stats_target = retweets_data if is_retweet else tweets_data
            content_target = tweet_content
            # Only the regular content dictionary has the tweet/retweet flag column.
            content_row['tweet/retweet'] = int(is_retweet)

        for column, value in stats_row.items():
            stats_target[column].append(value)
        for column, value in content_row.items():
            content_target[column].append(value)

In [234]:
def read_personalization(data):
    '''
    This function takes the personalization json object and overwrites the entries of
    personalization_data with values taken from it. Languages, interests and shows are
    lower-cased and stripped; interests and shows are additionally de-duplicated.
    ---
    Parameters:
        data: json object
    ---
    Return:
        None
    '''
    demographics = data['demographics']
    languages = [entry['language'].lower().strip() for entry in demographics['languages']]

    interests_section = data['interests']
    # Going through a set removes duplicates; the resulting order is not meaningful.
    unique_interests = list(set(item['name'].strip().lower() for item in interests_section['interests']))
    unique_shows = list(set(title.lower().strip() for title in interests_section['shows']))

    personalization_data.update(
        languages=languages,
        interests=unique_interests,
        shows=unique_shows,
    )
    personalization_data['inferred_age_info'] = data['inferredAgeInfo']
    personalization_data['location_history'] = data['locationHistory']
    personalization_data['gender_info'] = demographics['genderInfo']['gender']
    

In [303]:
def read_ad_data(data):
    '''
    This function takes the ad engagements json object and appends tuples to
    ad_engagements_data:
        'targeting type/targeting value': (targeting type, targeting value, date)
        'engagements': (engagement type, date)
    Both strings of the targeting tuple are lower-cased. The date is the part of the
    impression time before the first space.
    ---
    Parameters:
        data: json object
    ---
    Return:
        None
    '''
    targeting_rows = ad_engagements_data['targeting type/targeting value']
    engagement_rows = ad_engagements_data['engagements']

    # Path to one impression time, for reference:
    # data[0]['ad']['adsUserData']['adEngagements']['engagements'][0]['impressionAttributes']['impressionTime']
    for entry in data:
        for engagement in entry['ad']['adsUserData']['adEngagements']['engagements']:
            engagement_attributes = engagement['engagementAttributes']
            impression = engagement['impressionAttributes']
            impression_date = impression['impressionTime'].split(' ')[0]

            # Impressions without targeting criteria contribute nothing at all.
            if 'matchedTargetingCriteria' not in impression:
                continue

            for position, criterion in enumerate(impression['matchedTargetingCriteria']):
                if 'targetingValue' not in criterion:
                    # The missing key is filled in on the input itself; no targeting
                    # tuple is recorded for this criterion.
                    criterion['targetingValue'] = None
                else:
                    targeting_type = str(criterion['targetingType']).lower()
                    targeting_value = str(criterion['targetingValue']).lower()
                    targeting_rows.append((targeting_type, targeting_value, impression_date))

                # NOTE(review): engagements are looked up by the targeting-criterion
                # index, so one engagement tuple is recorded per criterion (None when
                # there are fewer engagement attributes) and engagement attributes
                # beyond the number of criteria are never recorded - confirm intended.
                if position < len(engagement_attributes):
                    engagement_type = engagement_attributes[position]['engagementType']
                else:
                    engagement_type = None
                engagement_rows.append((engagement_type, impression_date))
            

### Reading Data

In [236]:
# Load the regular tweets export and the Twitter Circle tweets export.
tweets = read_file(filename='tweets.js')
circle_tweets = read_file(filename='twitter-circle-tweet.js')

In [237]:
# Load the liked-tweets export.
likes = read_file(filename='like.js')

In [238]:
# Load the personalization export. The data consumed by read_personalization sits under
# the 'p13nData' key of the first list element.
personalization = read_file(filename='personalization.js')
personalization_dict = personalization[0]['p13nData']
read_personalization(data=personalization_dict)

In [239]:
# This cell extracts the full text of every liked tweet and saves it in a list. Some likes
# have no 'fullText' attribute; those are left out and counted in missing_content.
liked_tweets = [entry['like'] for entry in likes]
likes_content = [liked['fullText'] for liked in liked_tweets if 'fullText' in liked]
missing_content = len(liked_tweets) - len(likes_content)

In [244]:
# Fill the regular tweet/retweet dictionaries first, then the Twitter Circle dictionaries.
# The dictionaries are only ever appended to, so re-running this cell without re-creating
# them adds every row a second time.
populate_dictionaries(data=tweets)
populate_dictionaries(data=circle_tweets, is_circle=True)

In [304]:
# Load the ad engagements export.
ad_engagements = read_file(filename='ad-engagements.js')

In [305]:
# Append the targeting and engagement tuples to ad_engagements_data. The lists are only
# appended to, so re-running this cell without re-creating them duplicates the rows.
read_ad_data(data=ad_engagements)

# 3. Write Data to CSV
Note that the data saved to the CSV files is not yet cleaned. This will be done in the next notebook.

### Defining functions

In [248]:
def create_dataframe(data):
    '''
    This function wraps the given dictionary of columns in a pandas dataframe.
    ---
    Parameters:
        data: dict
    ---
    Returns:
        df: pandas dataframe
    '''
    return pd.DataFrame(data)

### Writing Data

In [249]:
# First, read into dataframes
tweets_df = create_dataframe(data=tweets_data)
retweets_df = create_dataframe(data=retweets_data)
tweet_content_df = create_dataframe(data=tweet_content)

In [250]:
# Same for the Twitter Circle statistics and content dictionaries.
circle_tweets_df = create_dataframe(data=circle_tweets_data)
circle_tweets_content_df = create_dataframe(data=circle_tweet_content)

In [251]:
# Write the tweet, retweet and tweet content dataframes to CSV; index=False leaves the
# dataframe index out of the files.
tweets_df.to_csv('tweets.csv', index=False)
retweets_df.to_csv('retweets.csv', index=False)
tweet_content_df.to_csv('tweet_content.csv', index=False)

In [252]:
# Write the Twitter Circle dataframes to CSV, again without the index column.
circle_tweets_df.to_csv('circle_tweets.csv', index=False)
circle_tweets_content_df.to_csv('circle_tweet_content.csv', index=False)

In [253]:
# Put the collected liked-tweet texts into a single-column ('likes') dataframe.
likes_data = {'likes': likes_content}
likes_df = pd.DataFrame(data=likes_data)

In [254]:
# Write the liked-tweet texts to CSV without the index column.
likes_df.to_csv('likes.csv', index=False)

In [255]:
# Save the personalization dictionary as JSON. The context manager makes sure the file
# is flushed and closed as soon as the dump is done, instead of leaving the handle open.
with open('personalization.json', 'w', encoding=ENCODING) as personalization_file:
    json.dump(personalization_data, personalization_file)

In [256]:
# Export the unique personalization interests as a single-column CSV. This goes to its
# own file because the ad-targeting export further down in this notebook also writes
# 'interests.csv', which would otherwise overwrite this data.
interests_df = pd.DataFrame(data={'interests': list(set(personalization_data['interests']))})
interests_df.to_csv('personalization_interests.csv', index=False)

In [258]:
# Export the unique shows from the personalization data as a single-column CSV.
shows_df = create_dataframe(data={'shows': list(set(personalization_data['shows']))})
shows_df.to_csv('shows.csv', index=False)

In [309]:
# Split the (targeting type, targeting value, date) tuples into one column each.
targeting_rows = ad_engagements_data['targeting type/targeting value']
ad_interests_df = create_dataframe(data={
    'targeting type': [row[0] for row in targeting_rows],
    'targeting value': [row[1] for row in targeting_rows],
    'date': [row[2] for row in targeting_rows],
})

# Likewise for the (engagement, date) tuples.
engagement_rows = ad_engagements_data['engagements']
ad_engagements_df = create_dataframe(data={
    'engagements': [row[0] for row in engagement_rows],
    'engagement date': [row[1] for row in engagement_rows],
})

In [311]:
# Write the ad-targeting and ad-engagement dataframes to CSV without the index column.
ad_interests_df.to_csv('interests.csv', index=False)
ad_engagements_df.to_csv('engagements.csv', index=False)