# Using this notebook
1. Download an archive from the Meta Account Center (JSON format only)
2. Find the conversation you want to analyze
3. Put all of that conversation's `.json` files in the `data/` folder (there should be a set of files named `message_1.json`, `message_2.json`, etc.)
4. Run the notebook

### Loading


In [None]:
import json
import glob
import pandas as pd
import plotly.express as px

In [None]:
# Load every file in the 'data' folder that follows the pattern 'message_*.json'.
# sorted() makes the load order deterministic; glob returns matches in an
# arbitrary, OS-dependent order.
file_pattern = 'data/message_*.json'
files = sorted(glob.glob(file_pattern))
print(files)

# Fail early with an actionable message: pd.concat() on an empty list would
# otherwise raise a generic "No objects to concatenate" ValueError.
if not files:
    raise FileNotFoundError(
        f"No files matching '{file_pattern}' were found. "
        "Put the conversation's message_*.json files in the 'data' folder."
    )

dataframes = []
for file in files:
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Only the 'messages' list is loaded; other top-level keys are ignored.
    df = pd.DataFrame(data['messages'])
    dataframes.append(df)

# One frame for the whole conversation, with a fresh 0..n-1 index
messages_df = pd.concat(dataframes, ignore_index=True)

messages_df.head()

### Cleaning
- fix encoding
- parse timestamp into datetime object
- parse reactions
- add reaction count to each message

In [None]:
# fix encoding
def correct_encoding(text):
    """Repair a string whose UTF-8 bytes were each stored as a separate character.

    The value is encoded as ISO-8859-1 (one byte per character) and the
    resulting bytes are decoded as UTF-8.

    Args:
        text: The value to repair. Anything that is not a ``str`` (numbers,
            NaN, lists, dicts, None, ...) is passed through untouched.

    Returns:
        The repaired string, or ``text`` itself when it is not a string or
        when the round trip is not possible (a character outside ISO-8859-1,
        or bytes that are not valid UTF-8).
    """
    # DataFrame columns hold mixed value types; only strings can be re-encoded.
    if not isinstance(text, str):
        return text
    try:
        return text.encode('iso-8859-1').decode('utf-8')
    except UnicodeError:
        # Covers both UnicodeEncodeError and UnicodeDecodeError: the text does
        # not match the expected mis-encoding pattern, so keep it as is.
        return text

# Run correct_encoding over the whole frame. The 'reactions' column is walked
# as an iterable of dicts per message, and the 'reaction' and 'actor' values
# are fixed in place inside each dict; every other column is fixed value by value.
for column_name in messages_df.columns:
    if column_name != "reactions":
        messages_df[column_name] = messages_df[column_name].apply(correct_encoding)
        continue

    # Messages without reactions hold a missing value here; skip them.
    for reaction_list in messages_df[column_name].dropna():
        for entry in reaction_list:
            entry['reaction'] = correct_encoding(entry['reaction'])
            entry['actor'] = correct_encoding(entry['actor'])

In [None]:
# Derive a 'datetime' column from the millisecond epoch values in
# 'timestamp_ms', then order the frame newest-first.
messages_df = (
    messages_df
    .assign(datetime=pd.to_datetime(messages_df['timestamp_ms'], unit='ms'))
    .sort_values(by='datetime', ascending=False)
)

In [None]:
# Normalise the 'reactions' column so every message holds a list, then count.
if 'reactions' in messages_df.columns:
    # Anything that is not a list (e.g. the missing value on messages without
    # reactions) becomes an empty list.
    messages_df['reactions'] = messages_df['reactions'].apply(
        lambda value: value if isinstance(value, list) else []
    )
else:
    # The column is absent when no message in the loaded files has reactions.
    # Create it instead of failing with a KeyError on the count below; each
    # row gets its own empty list.
    messages_df['reactions'] = [[] for _ in range(len(messages_df))]

# Number of reactions per message = length of its reactions list
messages_df['reaction_count'] = messages_df['reactions'].apply(len)


# Simple analysis

### Message distribution by sender

In [None]:

# Count messages per sender as a two-column frame: sender_name, count
sender_sum = (
    messages_df['sender_name']
    .value_counts()
    .rename_axis('sender_name')
    .reset_index(name='count')
)

# Interactive bar chart of the per-sender totals
fig = px.bar(
    sender_sum,
    x='sender_name',
    y='count',
    title='Distribution of Messages by Sender',
    labels={'sender_name': 'Sender Name', 'count': 'Message Count'},
)

# Render the chart
fig.show()


### Hourly distribution of messages

In [None]:
# Store the hour of day on each message, taken from 'datetime' as-is
# (no timezone conversion is applied here).
messages_df['hour'] = messages_df['datetime'].dt.hour

# Count messages per hour across all senders; an hour with no messages
# produces no row.
hourly_distribution = (
    messages_df
    .groupby('hour')
    .size()
    .rename('message_count')
    .reset_index()
)

# Interactive bar chart of the per-hour totals
fig = px.bar(
    hourly_distribution,
    x='hour',
    y='message_count',
    title='Hourly Distribution of Messages',
    labels={'hour': 'Hour of Day', 'message_count': 'Message Count'},
)

# Render the chart
fig.show()


### Show significant senders

In [None]:

# Minimum share of all messages, in percent (0-100 scale), that a sender must
# exceed to count as significant.
SIGNIFICANT_SENDER_MIN_PERCENT = 1

# Calculate the total number of messages
total_messages = len(messages_df)

# Calculate the number of messages per sender
sender_counts = messages_df['sender_name'].value_counts().reset_index()
sender_counts.columns = ['sender_name', 'count']

# Calculate the percentage of total messages for each sender (0-100 scale)
sender_counts['percentage'] = (sender_counts['count'] / total_messages) * 100

# Keep only senders with more than 1% of total messages. 'percentage' is
# already multiplied by 100, so the threshold is 1, not 0.01.
significant_senders = sender_counts[
    sender_counts['percentage'] > SIGNIFICANT_SENDER_MIN_PERCENT
]

# Display the filtered DataFrame
print(significant_senders)