En este archivo puedes escribir lo que estimes conveniente. Te recomendamos detallar tu solución y todas las suposiciones que estás considerando. Aquí puedes ejecutar las funciones que definiste en los otros archivos de la carpeta src, medir el tiempo, memoria, etc.

In [4]:
file_path = "farmers-protest-tweets-2021-2-4.json"

# testing
## testing
### testing

In [8]:
#%pip install memory_profiler

Note: you may need to restart the kernel to use updated packages.


In [10]:
%load_ext memory_profiler

In [5]:
# %load q1_time.py
import json
from datetime import datetime
from collections import defaultdict
from typing import List, Tuple
import time

def q1_time(file_path: str) -> List[Tuple[datetime.date, str]]:
    start_time = time.time()  # Record the start time
    
    date_tweet_count = defaultdict(int)
    date_user_tweets = defaultdict(lambda: defaultdict(int))

    with open(file_path, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            tweet = json.loads(line)
            date_str = tweet["date"][:10]  # Extract date part from the datetime string
            date_tweet_count[date_str] += 1
            date_user_tweets[date_str][tweet["user"]["username"]] += 1

    top_dates_users = []
    for date, user_tweets in date_user_tweets.items():
        top_user = max(user_tweets, key=user_tweets.get)
        top_dates_users.append((datetime.strptime(date, '%Y-%m-%d').date(), top_user))

    end_time = time.time()  # Record the end time
    execution_time = end_time - start_time
    print(f"Execution Time: {execution_time} seconds")

    return sorted(top_dates_users, key=lambda x: date_tweet_count[x[0]], reverse=True)[:10]

# Example usage:
file_path = "/Users/marlonoliveira/Downloads/farmers-protest-tweets-2021-2-4.json"
result = q1_time(file_path)
print(result)

# Optimizations made:

# Instead of using sorted twice, use a single sorted call on top_dates_users with a custom key function to prioritize dates by tweet count.
# Use max directly on user_tweets to find the top user, avoiding unnecessary conversion to items.
# These optimizations should result in a slightly faster execution time. Keep in mind that the performance gain might vary depending on the size of your dataset and specific conditions.

Execution Time: 4.617042303085327 seconds
[(datetime.date(2021, 2, 24), 'preetysaini321'), (datetime.date(2021, 2, 23), 'Surrypuria'), (datetime.date(2021, 2, 22), 'preetysaini321'), (datetime.date(2021, 2, 21), 'Surrypuria'), (datetime.date(2021, 2, 20), 'MangalJ23056160'), (datetime.date(2021, 2, 19), 'Preetm91'), (datetime.date(2021, 2, 18), 'neetuanjle_nitu'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 16), 'jot__b'), (datetime.date(2021, 2, 15), 'jot__b')]


In [14]:
%memit q1_time(file_path)

Execution Time: 4.656842231750488 seconds
peak memory: 62.70 MiB, increment: 3.23 MiB


In [6]:
%prun q1_time(file_path)
#%timeit q1_time(file_path)
#%time q1_time(file_path)
#result = q1_time(file_path)
#print(result)

Execution Time: 4.898242950439453 seconds
 

In [12]:
# %load q1_memory.py
import json
from datetime import datetime
from collections import defaultdict
from typing import List, Tuple

def q1_memory(file_path: str) -> List[Tuple[datetime.date, str]]:
    date_tweet_count = defaultdict(int)
    date_user_tweets = defaultdict(lambda: defaultdict(int))

    with open(file_path, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            tweet = json.loads(line)
            date_str = tweet["date"][:10]  # Extract date part from the datetime string
            date_tweet_count[date_str] += 1
            date_user_tweets[date_str][tweet["user"]["username"]] += 1

    top_dates_users = []
    for date, user_tweets in date_user_tweets.items():
        top_user = max(user_tweets, key=user_tweets.get)
        top_dates_users.append((datetime.strptime(date, '%Y-%m-%d').date(), top_user))

    return sorted(top_dates_users, key=lambda x: date_tweet_count[x[0]], reverse=True)[:10]

# Example usage:
#file_path = "/Users/marlonoliveira/Downloads/farmers-protest-tweets-2021-2-4.json"
#result = q1_memory(file_path)
#print(result)

# Optimizations made to reduce memory usage:

# Process the JSON file line by line, minimizing the amount of data loaded into memory at once.
# Only store necessary information (date, tweet count, and user tweets) in dictionaries.
# These optimizations should help reduce memory usage while still achieving the desired functionality.


In [16]:
%time q1_memory(file_path)

CPU times: user 4.33 s, sys: 380 ms, total: 4.71 s
Wall time: 4.73 s


[(datetime.date(2021, 2, 24), 'preetysaini321'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 22), 'preetysaini321'),
 (datetime.date(2021, 2, 21), 'Surrypuria'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 19), 'Preetm91'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 15), 'jot__b')]

In [15]:
%memit q1_memory(file_path)

peak memory: 64.17 MiB, increment: 0.32 MiB


In [None]:
# %load q2_time.py
import json
import re
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple

def extract_emojis(text: str) -> List[str]:
    emoji_pattern = r'\uD83C[\uDF00-\uDFFF]|\uD83D[\uDC00-\uDDFF]|\uD83E[\uDD00-\uDDFF]|[\u2600-\u2B55]'
    emojis = [match.group() for match in re.finditer(emoji_pattern, text)]
    return emojis

def process_tweet(line):
    tweet = json.loads(line)
    emojis = extract_emojis(tweet["content"])
    return emojis

def q2_time(file_path: str) -> List[Tuple[str, int]]:
    emoji_counter = Counter()

    with open(file_path, 'r', encoding='utf-8') as json_file:
        with ThreadPoolExecutor() as executor:
            emojis_lists = list(executor.map(process_tweet, json_file))

    for emojis in emojis_lists:
        emoji_counter.update(emojis)

    # Get the top 10 most used emojis
    top_emojis = emoji_counter.most_common(10)

    return top_emojis

# Example usage:
file_path = "/Users/marlonoliveira/Downloads/farmers-protest-tweets-2021-2-4.json"
result = q2_time(file_path)
print(result)


#This modification uses the ThreadPoolExecutor to concurrently process tweets and extract emojis, which can improve the overall execution time, especially when dealing with a large number of tweets. Keep in mind that the effectiveness of parallelization depends on factors such as the number of available CPU cores and the nature of the processing tasks.


In [None]:
# %load q2_memory.py
import json
import re
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple

def extract_emojis(text: str) -> List[str]:
    emoji_pattern = r'\uD83C[\uDF00-\uDFFF]|\uD83D[\uDC00-\uDDFF]|\uD83E[\uDD00-\uDDFF]|[\u2600-\u2B55]'
    emojis = [match.group() for match in re.finditer(emoji_pattern, text)]
    return emojis

def process_tweet(line, emoji_counter):
    tweet = json.loads(line)
    emojis = extract_emojis(tweet["content"])
    emoji_counter.update(emojis)

def q2_memory(file_path: str) -> List[Tuple[str, int]]:
    emoji_counter = Counter()

    with open(file_path, 'r', encoding='utf-8') as json_file:
        with ThreadPoolExecutor() as executor:
            executor.map(lambda line: process_tweet(line, emoji_counter), json_file)

    # Get the top 10 most used emojis
    top_emojis = emoji_counter.most_common(10)

    return top_emojis


# Example usage:
file_path = "/Users/marlonoliveira/Downloads/farmers-protest-tweets-2021-2-4.json"
result = q2_memory(file_path)
print(result)

# In this version, we process each line individually, and as soon as we extract the emojis from a tweet, we update the Counter. This way, we avoid keeping a large list of emojis in memory.

# This optimization should help reduce memory usage, especially when dealing with a large number of tweets in the JSON file.

In [None]:
# %load q3_time.py
import json
import re
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple

def extract_mentions(text: str) -> List[str]:
    mention_pattern = r'@(\w+)'  # Adjust the regex pattern as needed
    mentions = re.findall(mention_pattern, text)
    return mentions

def process_tweet(line, mention_counter):
    tweet = json.loads(line)
    mentions = extract_mentions(tweet["content"])
    mention_counter.update(mentions)

def q3_time(file_path: str) -> List[Tuple[str, int]]:
    mention_counter = Counter()

    with open(file_path, 'r', encoding='utf-8') as json_file:
        with ThreadPoolExecutor() as executor:
            executor.map(lambda line: process_tweet(line, mention_counter), json_file)

    # Get the top 10 most mentioned users
    top_mentions = mention_counter.most_common(10)

    return top_mentions

# Example usage:
file_path = "/Users/marlonoliveira/Downloads/farmers-protest-tweets-2021-2-4.json"
result = q3_time(file_path)
print(result)

# This version uses ThreadPoolExecutor to parallelize the processing of tweets. The process_tweet function extracts mentions from each tweet, and the ThreadPoolExecutor efficiently distributes the workload across multiple threads.

# Note: The effectiveness of parallelization depends on the number of available CPU cores and the nature of the processing tasks. Adjust the regex pattern (mention_pattern) based on your specific dataset.

In [None]:
# %load q3_memory.py
import json
import re
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple

def extract_mentions(text: str) -> List[str]:
    mention_pattern = r'@(\w+)'  # Adjust the regex pattern as needed
    mentions = re.findall(mention_pattern, text)
    return mentions

def process_tweet(line, mention_counter):
    tweet = json.loads(line)
    mentions = extract_mentions(tweet["content"])
    mention_counter.update(mentions)

def q3_memory(file_path: str) -> List[Tuple[str, int]]:
    mention_counter = Counter()

    with open(file_path, 'r', encoding='utf-8') as json_file:
        with ThreadPoolExecutor() as executor:
            executor.map(lambda line: process_tweet(line, mention_counter), json_file)

    # Get the top 10 most mentioned users
    top_mentions = mention_counter.most_common(10)

    return top_mentions


# Example usage:
file_path = "/Users/marlonoliveira/Downloads/farmers-protest-tweets-2021-2-4.json"
result = q3_memory(file_path)
print(result)

# To optimize the code for memory usage, we can make some modifications to reduce the memory footprint. Specifically, we can avoid storing the entire list of mentions in memory and update the counter directly as we process each line

# In this version, the mention_counter is updated directly without storing the entire list of mentions in memory. This optimization helps reduce the memory footprint, especially when dealing with a large number of tweets in the JSON file.

In [17]:
import requests

# Define the URL and JSON payload
url = "https://advana-challenge-check-api-cr-k4hdbggvoq-uc.a.run.app/data-engineer"
payload = {
    "name": "Marlon Oliveira",
    "mail": "oliwer.marlon@gmail.com",
    "github_url": "https://github.com/marlondcu/latam-challenge.git"
}

# Make the POST request
response = requests.post(url, json=payload)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    print("POST request was successful!")
    print("Response:", response.text)
else:
    print("POST request failed with status code:", response.status_code)


POST request was successful!
Response: {"status":"OK","detail":"your request was received"}
