In [1]:
import json
import re

import numpy as np
import pandas as pd

# Load Data
### We load the data from the JSON file (keeping only the required columns) and build a pandas DataFrame

In [2]:
# Location of the tweet dump; the loop below parses it as one JSON object per line.
path = "archive/farmers-protest-tweets-2021-03-5.json"
tweets = []
# Decode explicitly as UTF-8 instead of relying on the platform default encoding,
# which can fail on non-ASCII tweet text.
with open(path, "r", encoding="utf-8") as f:
    # Iterate the file object directly so lines are streamed rather than
    # loaded all at once with readlines().
    for l in f:
        if not l.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        t = json.loads(l)
        # Keep only the fields the analyses below use.
        clean_t = {
            "url": t["url"],
            # First 10 characters of the timestamp string
            # (presumably the YYYY-MM-DD part; confirm against the data).
            "date": t["date"][:10],
            "content": t["content"],
            "username": t["user"]["username"],
            "userid": t["user"]["id"],
            "retweetCount": t["retweetCount"]
        }
        tweets.append(clean_t)

In [3]:
# Build the DataFrame queried by the analysis functions below; each cleaned
# tweet dict in `tweets` becomes one row, with the dict keys as columns.
df = pd.DataFrame(tweets)

# 1. Top 10 most retweeted tweets

In [4]:
def most_retweeted(n=10):
    """Return the `n` tweets with the highest retweet count.

    Reads the module-level DataFrame `df`, orders its rows by the
    "retweetCount" column in descending order and returns the first `n`
    rows with all columns kept.

    Parameters
    ----------
    n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        At most `n` rows of `df`, most retweeted first.
    """
    # Honour `n`; the limit used to be hard-coded to 10 regardless of the argument.
    return df.sort_values("retweetCount", ascending=False).head(n)

# 2. Top 10 users by tweet quantity

In [5]:
def most_tweets_users(n=10):
    """Return tweet counts per username for the `n` most active user ids.

    The ranking is done on "userid" (rows per id, counted through the "url"
    column); the rows of those ids are then regrouped by "username" and the
    per-username counts are returned in descending order.
    """
    # Rank user ids by how many rows they have.
    rows_per_id = df.groupby("userid").count()
    ranked_ids = rows_per_id.sort_values("url", ascending=False).index
    top_ids = ranked_ids[0:n]

    # Re-count the tweets of those ids, this time keyed by username.
    top_rows = df[df["userid"].isin(top_ids)]
    rows_per_name = top_rows.groupby("username").count()
    ordered = rows_per_name.sort_values("url", ascending=False)
    return ordered["url"]

# 3. Top 10 days by tweet quantity

In [6]:
def most_tweets_days(n=10):
    """Return the `n` dates with the most tweets.

    Groups the module-level `df` by "date", counts rows through the "url"
    column and returns the first `n` counts in descending order as a Series
    indexed by date.
    """
    rows_per_day = df.groupby("date").count()
    ranked = rows_per_day.sort_values("url", ascending=False)
    return ranked["url"].head(n)

# 4. Top 10 hashtags used

In [27]:
# Scratch check: str.find with a start offset (look for "a" from index 20 onwards).
s = "aab dasfjadsf lklklklklklklk"
s.find("a",20)

# Scratch check: slicing with [:-1] drops the last character.
s = "123"
s[:-1]

'12'

In [42]:
from collections import defaultdict as dd
from time import sleep 

def look_for_hashtags(s):
    """Return every hashtag found in `s`, in order of appearance.

    A hashtag is a "#" followed by one or more non-whitespace characters; it
    ends at the next whitespace character (space, newline, tab, ...) or at
    the end of the string. A "#" with nothing attached to it is not reported.

    Parameters
    ----------
    s : str
        Text to scan.

    Returns
    -------
    list of str
        The hashtags including their leading "#"; empty if there are none.
    """
    # The previous hand-rolled scan compared the positions of the next space
    # and the next newline without accounting for find() returning -1, so a
    # hashtag followed by a space (with no later newline) swallowed the whole
    # rest of the text. Matching on whitespace in general avoids that.
    # \S+ rather than \w+ on purpose: \w would cut hashtags at combining marks.
    return re.findall(r"#\S+", s)

def most_used_hashtags(n=10):
    """Return the `n` most frequent hashtags as (hashtag, count) pairs.

    Scans the "content" of every entry in the module-level `tweets` list with
    look_for_hashtags, tallies each hashtag and returns the pairs ordered by
    count, highest first.
    """
    tally = dd(int)
    tags_per_tweet = (look_for_hashtags(tweet["content"]) for tweet in tweets)
    for tags in tags_per_tweet:
        for tag in tags:
            tally[tag] += 1

    # Stable sort: hashtags with equal counts keep first-seen order.
    ranked = list(tally.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Main Function

In [7]:
def main(fun=None, n=10):
    """Run one of the analysis functions by name.

    Parameters
    ----------
    fun : str, optional
        Name of the analysis to run: "most_retweeted", "most_tweets_users",
        "most_tweets_days" or "most_used_hashtags".
    n : int, default 10
        Passed through to the selected function.

    Returns
    -------
    The selected function's result. If `fun` is missing or is not one of the
    names above, the list of valid names is printed and None is returned.
    """
    available = (
        "most_retweeted",
        "most_tweets_users",
        "most_tweets_days",
        "most_used_hashtags",
    )
    if fun not in available:
        # List every option (the old message named only one of them) and
        # also cover unknown names, which used to return None silently.
        print("You have to choose what function to call:\n"
              + "\n".join("- " + name for name in available))
        return None
    # Look the function up when called, so only the requested analysis has
    # to be defined at this point.
    return globals()[fun](n)

In [43]:
# Run every analysis with the default n=10.
# NOTE(review): a notebook cell renders only the value of its last bare
# expression, so the first three results are presumably not displayed here;
# confirm, and consider one call per cell.
main("most_retweeted")
main("most_tweets_users")
main("most_tweets_days")
main("most_used_hashtags")