In [27]:
import altair as alt
import pandas as pd
import os

## 1. Creating the final dataset with merged metadata

In [21]:
# Load the annotated texts and the per-video metadata.
annotated_df = pd.read_csv('data/annotated/annotated_dataset.csv')
videos_df = pd.read_csv('data/transcriptions/youtube_videos_transcribed.csv')

# Keep only the metadata columns to attach, with a single row per video_url:
# a repeated key on the right side of a left join would silently duplicate
# the matching annotated rows in the final dataset.
videos_df = (
    videos_df[['video_url', 'video_title', 'publish_date']]
    .drop_duplicates(subset='video_url', keep='first')
)

# Left join so every annotated row is kept even when no metadata matches.
# `df_merge` stays bound to the same frame in case other cells reference it.
final_df = df_merge = pd.merge(annotated_df, videos_df, on='video_url', how='left')

# Sanity check: attaching metadata must not add or remove annotated rows.
assert len(final_df) == len(annotated_df), 'merge changed the number of annotated rows'

In [22]:
# Show the column labels of the merged frame.
final_df.columns

Index(['video_url', 'brazilian_state', 'corrected_transcription',
       'text_origin', 'specific_contexts', 'punchlines', 'joke_explanation',
       'fun', 'humor', 'nonsense', 'wit', 'irony', 'satire', 'sarcasm',
       'cynicism', 'video_title', 'publish_date'],
      dtype='object')

In [23]:
# Column layout for the final dataset: video metadata first, then the text
# fields, then the humor-related columns, with the explanation last.
new_columns_order = [
    # video metadata
    'video_url',
    'video_title',
    'publish_date',
    'brazilian_state',
    'text_origin',
    # text fields
    'corrected_transcription',
    'specific_contexts',
    'punchlines',
    # humor-related columns
    'fun',
    'humor',
    'nonsense',
    'wit',
    'irony',
    'satire',
    'sarcasm',
    'cynicism',
    # free-text explanation
    'joke_explanation',
]

# Select every row with the columns in the order above; a missing label
# raises KeyError instead of producing an empty column.
final_df = final_df.loc[:, new_columns_order]

In [24]:
# Write the final dataset to disk, creating the output folder when needed.
output_dir = 'data/completed'
output_path = 'data/completed/brazilian_ne_humorous_texts.csv'

os.makedirs(output_dir, exist_ok=True)
# index=False: the row index is not written as a column.
final_df.to_csv(output_path, index=False)

## 2. Exploring data per state

In [30]:
# Count the texts per state as a two-column frame for plotting.
state_counts = (
    final_df['brazilian_state']
    .value_counts()
    .rename_axis('Brazilian State')
    .reset_index(name='Count')
)

# Horizontal bars: states on the y axis, ordered by descending x (count).
state_axis = alt.Y('Brazilian State:N', sort='-x', title='Brazilian State')
count_axis = alt.X('Count:Q', title='Number of Texts')

chart = (
    alt.Chart(state_counts)
    .mark_bar()
    .encode(
        x=count_axis,
        y=state_axis,
        tooltip=['Brazilian State:N', 'Count:Q'],
    )
    .properties(
        title='Number of Texts per Brazilian State',
        width=600,
        height=400,
    )
)

chart.show()

## 3. Exploring publish dates