# Granma Statistics


In [None]:
from pathlib import Path
import pandas as pd
import json
from matplotlib import pyplot as plt
from typing import List, Dict
import numpy as np

# Root directory of the letter data, relative to the current working directory.
# The loading code iterates its sub-directories and reads each file inside
# them as one JSON document.
DATA_PATH = Path("granma", "data", "letters")


In [None]:

# Load every letter file found under DATA_PATH/<sub-directory>/ into a
# DataFrame with one row per letter.
letter_columns = [
    "title",
    "year",
    "month",
    "day",
    "link",
    "body",
    "original_letter_link",
    "is_response",
    "comments_amount",
]
rows = []

for date_dir in DATA_PATH.iterdir():
    if not date_dir.is_dir():
        continue
    for letter_path in date_dir.iterdir():
        if letter_path.is_dir():
            print(f"WARNING: Letter {letter_path} is a directory")
            continue
        # Decode explicitly as UTF-8 (the standard JSON encoding) instead of
        # relying on the platform's locale encoding, which can mis-decode or
        # fail on non-ASCII text.
        letter_json = json.loads(letter_path.read_text(encoding="utf-8"))
        # The "date" field is split on "-" into year, month and day.
        year, month, day = (int(part) for part in letter_json["date"].split("-"))
        rows.append(
            {
                "title": letter_json["title"],
                "year": year,
                "month": month,
                "day": day,
                "link": letter_json["link"],
                "body": letter_json["body"],
                "original_letter_link": letter_json["original_letter_link"],
                "is_response": letter_json["is_response"],
                # Only the number of comments is kept, not their content.
                "comments_amount": len(letter_json["comments"]),
            }
        )

# Passing the column list fixes the column order and keeps the columns
# present even when no letter file was found.
data_df = pd.DataFrame(rows, columns=letter_columns)

In [None]:
def plot_stacked_bar(labels: List[str], values: Dict[str, List[int]], title: str):
    """Draw a stacked bar chart, save it as a PNG file and show it.

    Args:
        labels: x positions of the bars; also used as the x tick locations.
        values: maps a series name (used in the legend) to that series' bar
            heights, one height per label. Series are stacked in the
            mapping's iteration order.
        title: chart title. The image is saved in the current directory under
            the title with spaces replaced by underscores, plus ".png".
    """
    bar_width = 0.35  # a single width shared by every bar

    _, axes = plt.subplots()

    # Running top of each stack, sized from the first series and starting at 0.
    first_series = next(iter(values.values()))
    stack_top = np.zeros(len(first_series))

    for series_name in values:
        heights = values[series_name]
        axes.bar(labels, heights, bar_width, label=series_name, bottom=stack_top)
        # The next series is drawn on top of everything plotted so far.
        stack_top = stack_top + np.array(heights)

    axes.set_title(title)
    plt.xticks(labels)
    axes.legend()

    output_name = title.replace(" ", "_") + ".png"
    plt.savefig(output_name)

    plt.show()

In [None]:

# Letters per year (counted through the "link" column), overall and split
# into response letters and regular letters.
years = data_df.groupby(by="year").count()["link"]
response_years = data_df[data_df['is_response']].groupby(by="year").count()["link"]
not_response_years = data_df[~data_df['is_response']].groupby(by="year").count()["link"]

# A year with no letters of one kind is absent from that kind's groupby
# result. Align both series to the full list of years, filling the gaps with
# 0, so every value lines up with its label. This replaces the hard-coded
# leading zero that was prepended for 2013 and also covers any other gap.
values = {
    'respuestas': list(response_years.reindex(years.index, fill_value=0)),
    'normales': list(not_response_years.reindex(years.index, fill_value=0)),
}

plot_stacked_bar(years.index, values, "Cartas por año")


In [None]:
# Distinct-link summary: response letters, letters that received a response,
# and all letters. count() ignores missing values, so letters without an
# original_letter_link are not counted as responded.
is_response_mask = data_df['is_response']
response_links = data_df[is_response_mask]['link']
original_links = data_df['original_letter_link']
all_links = data_df['link']

response_letters = response_links.drop_duplicates().count()
responded_letters = original_links.drop_duplicates().count()
letters = all_links.drop_duplicates().count()

for label, amount in (
    ("Cartas de respuesta", response_letters),
    ("Cartas respondidas", responded_letters),
    ("Total de cartas", letters),
):
    print(label, amount)


In [None]:
from nltk import word_tokenize
import string

# Number of tokens per letter body, counted with NLTK's word_tokenize rather
# than a plain whitespace split (x.split()).
# NOTE(review): `x in string.punctuation` is a substring test, so it matches
# the empty string and any contiguous run of that constant (e.g. a single
# punctuation character) but not a body such as "..." - confirm this is the
# intended "no real text" check.
body_len = data_df["body"].map(lambda x: 0 if x in string.punctuation else len(word_tokenize(x)))
bins = sorted(set(body_len))
# Roughly one bin per five distinct lengths, but never fewer than one: with
# fewer than five distinct lengths the integer division would request zero
# bins, which the histogram rejects.
body_len.hist(bins=max(1, len(bins) // 5))
plt.title("Cantidad de palabras")
plt.savefig("Cantidad_de_palabras.png")
plt.show()
print(body_len.describe())
print(body_len.sum())


In [None]:
# Histogram of comments per letter, with one bin per distinct comment count,
# followed by summary statistics.
comments = data_df["comments_amount"]
bins = sorted(set(comments))
bin_count = len(bins)

comments.hist(bins=bin_count)
plt.title("Cantidad de comentarios")
plt.xticks(range(15))  # fixed tick positions 0..14
plt.savefig("Cantidad_de_comentarios.png")
plt.show()

print(comments.describe())

# Letters with at least one comment, then letters with none.
with_comments = comments > 0
without_comments = comments == 0
print(comments[with_comments].count())
print(comments[without_comments].count())

