# Setup and Import Required Libraries

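This section loads IPython's `autoreload` extension so that changes to the project modules are picked up without restarting the kernel during development. The library imports themselves are in the first code cell of the next section.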

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell is executed (handy while editing `src`).
%load_ext autoreload
%autoreload 2

# Data Download and Preparation

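This section imports the project modules and the profiling libraries, defines the dataset paths, and downloads the zipped dataset from Google Drive to `data/tweets.json.zip` so that the JSON file inside it can be read by the analysis functions.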

In [2]:
from src import utils , q1_memory , q1_time , q2_memory , q2_time , q3_memory , q3_time
import cProfile
import pstats

In [3]:
# Google Drive download link for the zipped tweets dataset.
url = "https://drive.google.com/uc?id=1ig2ngoXFTxP5Pa8muXo02mDTFexZzsis"
# Local destination of the downloaded archive; this is the path passed to
# every q* function in the cells below.
output = "data/tweets.json.zip"
# Name of the tweets JSON file.
# NOTE(review): not referenced by any cell visible in this notebook - confirm
# whether it is still needed.
file_path = "farmers-protest-tweets-2021-2-4.json"

In [4]:
# Download step for the dataset: presumably fetches `url` and stores the
# archive at `output` - confirm the exact behavior in `src.utils`.
utils.get_file_Drive(url=url, output=output)

# Profiler Setup

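In this section, a `cProfile` profiler is initialized to measure the performance of the `*_time` and `*_memory` implementation of each question. The line-by-line memory tables in the outputs below come from the `@profile` decorator on those functions.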

In [5]:
# Initial cProfile profiler instance. Note that several later cells rebind
# `profiler` to a fresh cProfile.Profile() before measuring.
profiler = cProfile.Profile()

# Q1
The top 10 dates with the most tweets, together with the user (username) who posted the most tweets on each of those dates.

----

## q1_memory

In [13]:

# Profile the Q1 `q1_memory` implementation.
# A fresh profiler is created here (as the Q2/Q3 cells do) so the report only
# covers this call instead of accumulating stats from other cells that used
# the same profiler instance.
profiler = cProfile.Profile()

# runcall() enables the profiler, invokes the function and disables the
# profiler again in a `finally`, so it is never left running if the call raises.
result_q1_memory = profiler.runcall(q1_memory.q1_memory, output)

# Report the collected stats ordered by the time spent inside each function itself.
stats = pstats.Stats(profiler).sort_stats('tottime')
# The trailing `;` keeps the `<pstats.Stats ...>` repr out of the cell output.
stats.print_stats();



Filename: C:\Users\nmendez\Documents\MAESTRIA\tweetsProcessor\src\q1_memory.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    10     87.8 MiB     87.8 MiB           1   @profile
    11                                         def q1_memory(file_path: str) -> List[Tuple[datetime.date, str]]:
    12     87.8 MiB      0.0 MiB           1       date_user_counter = defaultdict(Counter)
    13                                             
    14     94.7 MiB      0.0 MiB           2       with zipfile.ZipFile(file_path, 'r') as z:
    15     94.7 MiB      0.0 MiB           2           with z.open(z.namelist()[0]) as f:
    16     87.8 MiB      0.0 MiB           1               buffer = []
    17     95.1 MiB -109297.6 MiB      117408               for line in f:
    18     95.1 MiB -109997.8 MiB      117407                   tweet = json.loads(line)
    19     95.1 MiB -109269.8 MiB      117407                   buffer.append(tweet)
    20                                   

<pstats.Stats at 0x2490d796f20>

In [14]:
# Display the value returned by q1_memory.
result_q1_memory

[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 19), 'Preetm91')]

## q1_time

In [11]:
# Profile the Q1 `q1_time` implementation.
# `q1_time` is already imported by the notebook's import cell, so it is not
# re-imported here.
# A fresh profiler is created (as the Q2/Q3 cells do) so the report only
# covers this call instead of accumulating stats from other cells that used
# the same profiler instance.
profiler = cProfile.Profile()

# runcall() enables the profiler, invokes the function and disables the
# profiler again in a `finally`, so it is never left running if the call raises.
result_q1_time = profiler.runcall(q1_time.q1_time, output)

# Report the collected stats ordered by the time spent inside each function itself.
stats = pstats.Stats(profiler).sort_stats('tottime')
# The trailing `;` keeps the `<pstats.Stats ...>` repr out of the cell output.
stats.print_stats();

Filename: C:\Users\nmendez\Documents\MAESTRIA\tweetsProcessor\src\q1_time.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     8     87.6 MiB     87.6 MiB           1   @profile
     9                                         def q1_time(file_path: str) -> List[Tuple[datetime.date, str]]:
    10     87.6 MiB      0.0 MiB           1       date_user_counter = defaultdict(Counter)
    11                                             
    12     89.9 MiB      0.0 MiB           2       with zipfile.ZipFile(file_path, 'r') as z:
    13     89.9 MiB      0.0 MiB           2           with z.open(z.namelist()[0]) as f:
    14     89.9 MiB    -29.3 MiB      117408               for line in f:
    15     89.9 MiB    -40.6 MiB      117407                   tweet = json.loads(line)
    16     89.9 MiB    -30.5 MiB      117407                   date = tweet.get('date')
    17     89.9 MiB    -30.5 MiB      117407                   username = tweet.get('user', {}).get('username')
   

<pstats.Stats at 0x2490eccc850>

In [12]:
# Display the value returned by q1_time.
result_q1_time

[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 19), 'Preetm91')]

---


# Q2
The top 10 most used emojis with their respective counts.

In [19]:
# Profile the Q2 `q2_time` implementation with a fresh profiler so the report
# only covers this call.
profiler = cProfile.Profile()

# runcall() enables the profiler, invokes the function and disables the
# profiler again in a `finally`, so it is never left running if the call raises.
result_q2_time = profiler.runcall(q2_time.q2_time, output)

# Report the collected stats ordered by the time spent inside each function itself.
stats = pstats.Stats(profiler).sort_stats('tottime')
# The trailing `;` keeps the `<pstats.Stats ...>` repr out of the cell output.
stats.print_stats();



Filename: C:\Users\nmendez\Documents\MAESTRIA\tweetsProcessor\src\q2_time.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    15    126.9 MiB    126.9 MiB           1   @profile
    16                                         def q2_time(file_path: str) -> List[Tuple[str, int]]:
    17    126.9 MiB      0.0 MiB           1       emoji_counter = Counter()
    18                                             
    19    126.9 MiB      0.0 MiB           2       with zipfile.ZipFile(file_path, 'r') as z:
    20    126.9 MiB      0.0 MiB           2           with z.open(z.namelist()[0]) as f:
    21    126.9 MiB      0.0 MiB      117408               for line in f:
    22    126.9 MiB      0.0 MiB      117407                   tweet = json.loads(line)
    23    126.9 MiB      0.0 MiB      117407                   emojis = extract_emojis(tweet.get('content', ''))
    24    126.9 MiB      0.0 MiB      117407                   emoji_counter.update(emojis)
    25                 

<pstats.Stats at 0x2033987b490>

In [20]:
# Display the value returned by q2_time.
result_q2_time

[('🙏', 7286),
 ('😂', 3072),
 ('🚜', 2972),
 ('✊', 2411),
 ('🌾', 2363),
 ('🏻', 2080),
 ('❤', 1779),
 ('🤣', 1668),
 ('🏽', 1218),
 ('👇', 1108)]

In [21]:
# Profile the Q2 `q2_memory` implementation with a fresh profiler so the
# report only covers this call.
profiler = cProfile.Profile()

# runcall() enables the profiler, invokes the function and disables the
# profiler again in a `finally`, so it is never left running if the call raises.
result_q2_memory = profiler.runcall(q2_memory.q2_memory, output)

# Report the collected stats ordered by the time spent inside each function itself.
stats = pstats.Stats(profiler).sort_stats('tottime')
# The trailing `;` keeps the `<pstats.Stats ...>` repr out of the cell output.
stats.print_stats();



Filename: C:\Users\nmendez\Documents\MAESTRIA\tweetsProcessor\src\q2_memory.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    13    127.0 MiB    127.0 MiB           1   @profile
    14                                         def q2_memory(file_path: str) -> List[Tuple[str, int]]:
    15    127.0 MiB      0.0 MiB           1       emoji_counter = Counter()
    16                                             
    17    127.2 MiB      0.0 MiB           2       with zipfile.ZipFile(file_path, 'r') as z:
    18    127.2 MiB      0.0 MiB           2           with z.open(z.namelist()[0]) as f:
    19    127.0 MiB      0.0 MiB           1               buffer = []
    20    127.2 MiB     -9.9 MiB      117408               for line in f:
    21    127.2 MiB     -9.8 MiB      117407                   tweet = json.loads(line)
    22    127.2 MiB     -9.9 MiB      117407                   buffer.append(tweet.get('content', ''))
    23                                            

<pstats.Stats at 0x2033a6ccf70>

In [22]:
# Display the value returned by q2_memory.
result_q2_memory

[('🙏', 7286),
 ('😂', 3072),
 ('🚜', 2972),
 ('✊', 2411),
 ('🌾', 2363),
 ('🏻', 2080),
 ('❤', 1779),
 ('🤣', 1668),
 ('🏽', 1218),
 ('👇', 1108)]

---

# Q3
The top 10 most mentioned users (`@username`) with their respective mention counts.

In [31]:
# Profile the Q3 `q3_memory` implementation with a fresh profiler so the
# report only covers this call.
profiler = cProfile.Profile()

# runcall() enables the profiler, invokes the function and disables the
# profiler again in a `finally`, so it is never left running if the call raises.
result_q3_memory = profiler.runcall(q3_memory.q3_memory, output)

# Report the collected stats ordered by the time spent inside each function itself.
stats = pstats.Stats(profiler).sort_stats('tottime')
# The trailing `;` keeps the `<pstats.Stats ...>` repr out of the cell output.
stats.print_stats();


Filename: C:\Users\nmendez\Documents\MAESTRIA\tweetsProcessor\src\q3_memory.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    12    131.0 MiB    131.0 MiB           1   @profile
    13                                         def q3_memory(file_path: str) -> List[Tuple[str, int]]:
    14    131.0 MiB      0.0 MiB           1       mention_counter = Counter()
    15                                             
    16    131.4 MiB      0.0 MiB           2       with zipfile.ZipFile(file_path, 'r') as z:
    17    131.4 MiB      0.0 MiB           2           with z.open(z.namelist()[0]) as f:
    18    131.0 MiB      0.0 MiB           1               buffer = []
    19    131.4 MiB -33336.8 MiB      117408               for line in f:
    20    131.4 MiB -33336.7 MiB      117407                   tweet = json.loads(line)
    21    131.4 MiB -33337.0 MiB      117407                   buffer.append(tweet.get('content', ''))
    22                                          

<pstats.Stats at 0x2033aa42140>

In [29]:
# Profile the Q3 `q3_time` implementation with a fresh profiler so the report
# only covers this call.
profiler = cProfile.Profile()

# runcall() enables the profiler, invokes the function and disables the
# profiler again in a `finally`, so it is never left running if the call raises.
result_q3_time = profiler.runcall(q3_time.q3_time, output)

# Report the collected stats ordered by the time spent inside each function itself.
stats = pstats.Stats(profiler).sort_stats('tottime')
# The trailing `;` keeps the `<pstats.Stats ...>` repr out of the cell output.
stats.print_stats();


Filename: C:\Users\nmendez\Documents\MAESTRIA\tweetsProcessor\src\q3_time.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    14    131.4 MiB    131.4 MiB           1   @profile
    15                                         def q3_time(file_path: str) -> List[Tuple[str, int]]:
    16    131.4 MiB      0.0 MiB           1       mention_counter = Counter()
    17                                             
    18    131.4 MiB     -0.4 MiB           2       with zipfile.ZipFile(file_path, 'r') as z:
    19    131.4 MiB     -0.4 MiB           2           with z.open(z.namelist()[0]) as f:
    20    131.4 MiB -89557.5 MiB      117408               for line in f:
    21    131.4 MiB -89557.0 MiB      117407                   tweet = json.loads(line)
    22    131.4 MiB -89557.0 MiB      117407                   mentions = extract_mentions(tweet.get('content', ''))
    23    131.4 MiB -89556.9 MiB      117407                   mention_counter.update(mentions)
    24       

<pstats.Stats at 0x20319a47c70>

In [32]:
# Display the value returned by q3_memory.
result_q3_memory

[('@narendramodi', 2261),
 ('@Kisanektamorcha', 1836),
 ('@RakeshTikaitBKU', 1639),
 ('@PMOIndia', 1422),
 ('@RahulGandhi', 1125),
 ('@GretaThunberg', 1046),
 ('@RaviSinghKA', 1015),
 ('@rihanna', 972),
 ('@UNHumanRights', 962),
 ('@meenaharris', 925)]

In [33]:
# Display the value returned by q3_time.
result_q3_time

[('@narendramodi', 2261),
 ('@Kisanektamorcha', 1836),
 ('@RakeshTikaitBKU', 1639),
 ('@PMOIndia', 1422),
 ('@RahulGandhi', 1125),
 ('@GretaThunberg', 1046),
 ('@RaviSinghKA', 1015),
 ('@rihanna', 972),
 ('@UNHumanRights', 962),
 ('@meenaharris', 925)]