## Install dependecies

In [2]:
!pip install memory-profiler emoji



In [3]:
# Import modules
import pandas as pd
import json
import cProfile
import emoji
import datetime
from memory_profiler import profile
from typing import List, Tuple

In [4]:
# Auxiliary function to load JSON into Dataframe
def load_JSON_into_df(file_path: str):

    data= []
    invalid_rows= []

    missing_rows = 0

    # Try to read the JSON file
    with open(file_path, 'r', encoding='utf-8') as file:
        # Read and parse each line separately
        for row in file:
            try:
                jsonparse = json.loads(row)
                data.append(jsonparse)
            except Exception as e:
                missing_rows +=1
                invalid_rows.append(row)

    print(f'Missing rows: {missing_rows}')
    print(f'Invalid rows: {invalid_rows}')

    return pd.DataFrame(data)

## Data Challenge resolution

### Question 1

The top 10 dates with the most tweets. Mention the user (username) with the most posts for each of those days. This script returns the following data type: List[Tuple[datetime.date, str]]

In [5]:
@profile
def find_top_10_tweets_count_dates(file_path: str) -> List[Tuple[datetime.date, str]]:

    # Create a DataFrame from the parsed data
    df = load_JSON_into_df(file_path)


    # Removing hours, minutes and seconds
    df['date'] = pd.to_datetime(df['date']).dt.strftime('%m-%d-%Y')
    df['date'] = pd.to_datetime(df['date']).dt.date

    # Extracting username information
    df['username'] = df['user'].apply(lambda x: x['username'])

    # Counting tweets aggregated by date + username
    top_users = df.groupby('date')['username'].value_counts().reset_index(name='record_count')

    # Sort the DataFrame by 'record_count' in descending order
    top_users = top_users.sort_values(by=['date', 'record_count'], ascending=[True, False])

    # Top 10 dates and username with most tweets by that given date
    top10_users = top_users.groupby('date').max().reset_index().head(10)


    # Returning result in a tuple
    return [tuple(row[['date', 'username']]) for _, row in top10_users.iterrows()]

In [6]:
top_10_dates = find_top_10_tweets_count_dates("farmers-protest-tweets-2021-2-4.json")
print(top_10_dates)

ERROR: Could not find file /var/folders/35/x4p_8f_n1t7cw1_k8r37b8700000gn/T/ipykernel_43834/3189519986.py
Missing rows: 0
Invalid rows: []
[(datetime.date(2021, 2, 12), 'zsheikh_INC'), (datetime.date(2021, 2, 13), 'zundar'), (datetime.date(2021, 2, 14), 'zlz_raa'), (datetime.date(2021, 2, 15), 'zuberjafri'), (datetime.date(2021, 2, 16), 'zlz_raa'), (datetime.date(2021, 2, 17), 'zlz_raa'), (datetime.date(2021, 2, 18), 'zlz_raa'), (datetime.date(2021, 2, 19), 'zia_khan2k'), (datetime.date(2021, 2, 20), 'zlz_raa'), (datetime.date(2021, 2, 21), 'zoo_bear')]


### Question 2

The top 10 most used emojis with their respective counts. This script returns the following data type: List[Tuple[str, int]]

In [7]:
# Define a function to extract emojis from a string using the emoji module
def extract_emojis(text):
    return ''.join(char for char in text if emoji.is_emoji(char))


@profile
def find_top_10_emojis_count(file_path: str) -> List[Tuple[str, int]]:

    # Create a DataFrame from the parsed data
    df = load_JSON_into_df(file_path)

    # Apply the extract_emojis function to the "content" column
    df['emojis'] = df['content'].apply(extract_emojis)

    # Transforming all emojis coming from tweets in just one line
    emojis_extracted = df['emojis'].str.cat()

    # Dict to save emoji + count
    emj_count = {}

    # Iterate through the emojis
    for emj in emojis_extracted:
        
        # If emoji is not in the dictionary, add it with a count of 1
        if emj not in emj_count:
            emj_count[emj] = 1
        # If the emoji is already in the dictionary, increment its count
        else:
            emj_count[emj] += 1

    # Convert the dictionary to a list of tuples (emoji, count)
    emj_count_tuple = [(emj, count) for emj, count in emj_count.items()]

    # Sort the list of tuples by count in descending order
    sorted_tuples = sorted(emj_count_tuple, key=lambda x: x[1], reverse=True)

    # Return the sorted list of tuples as a tuple. Only keeping the top 10
    return tuple(sorted_tuples[:10])

In [8]:
top_10_emojis = find_top_10_emojis_count("farmers-protest-tweets-2021-2-4.json")
print(top_10_emojis)

ERROR: Could not find file /var/folders/35/x4p_8f_n1t7cw1_k8r37b8700000gn/T/ipykernel_43834/3926645500.py
Missing rows: 0
Invalid rows: []
(('🙏', 7286), ('😂', 3072), ('🚜', 2972), ('✊', 2411), ('🌾', 2363), ('🏻', 2080), ('❤', 1779), ('🤣', 1668), ('🏽', 1218), ('👇', 1108))


### Question 3

top_10_most_mentioned_users: The historical top 10 most influential users (username) based on the count of mentions (@) each of them registers. This script returns the following data type: List[Tuple[str, int]]

In [9]:
def find_top_10_most_mentioned_users(file_path: str) -> List[Tuple[str, int]]:

    # Create a DataFrame from the parsed data
    df = load_JSON_into_df(file_path)

    # Drop rows with None or NaN values in the "mentionedUsers" column
    # We only want to keep users that were mentioned
    df.dropna(subset=['mentionedUsers'], inplace=True)

    df['usernameMentionedList'] = df['mentionedUsers'].apply(lambda user_list: [user['username'] for user in user_list if 'username' in user])

    username_mentioned_plain_string = ' '.join(' '.join(username_list) for username_list in df['usernameMentionedList'])

    # Create a dictionary that contains the count of each username
    username_mention_count = {}

    for username in username_mentioned_plain_string.split():
        if username in username_mention_count:
            username_mention_count[username] += 1
        else:
            username_mention_count[username] = 1

    # Transform the dictionary into a list of tuples, ordered by descending count
    sorted_username_tuples = sorted(username_mention_count.items(), key=lambda x: x[1], reverse=True)

    # Keep only top 10
    return tuple(sorted_username_tuples[:10])


In [10]:
top_10_mentioned_users = find_top_10_most_mentioned_users("farmers-protest-tweets-2021-2-4.json")
print(top_10_mentioned_users)

Missing rows: 0
Invalid rows: []
(('narendramodi', 2265), ('Kisanektamorcha', 1840), ('RakeshTikaitBKU', 1644), ('PMOIndia', 1427), ('RahulGandhi', 1146), ('GretaThunberg', 1048), ('RaviSinghKA', 1019), ('rihanna', 986), ('UNHumanRights', 962), ('meenaharris', 926))
