In [1]:
import json
import pandas as pd

### Raw JSON Dict

In [2]:
# Load the exported JSON file into a Python dict.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system.
with open('chessbuds_messages.json', encoding='utf-8') as raw_data:
    chess_buds = json.load(raw_data)

In [3]:
# see what the top level keys are
# (left as the cell's last expression so the notebook renders the keys view)
chess_buds.keys()

dict_keys(['participants', 'messages', 'title', 'is_still_participant', 'thread_type', 'thread_path', 'magic_words', 'joinable_mode'])

### Helper Functions (Style)

In [4]:
# method to style passed-in dataframe table-cells (td)
# chaining together style attributes makes it so the dataframe does not continuously switch object types
def styleTable(df, caption):
    styled_df = df.style.set_properties(**{'text-align':'left', 'overflow-wrap':'break-word'})\
        .set_table_styles([dict(selector = 'th', props=[('text-align','left')])])\
        .set_caption(caption)
    return styled_df

### Participant DataFrame

In [5]:
# pull the title and the participant list out of the raw dict
title = chess_buds['title']
participants = chess_buds['participants']

# build the participant dataframe
df_part = pd.DataFrame(participants)

# hand it to the styling helper; df_part is rebound to the styled result
participants_caption = title + ': Participants'
df_part = styleTable(df_part, participants_caption)

# render the styled table
display(df_part)

Unnamed: 0,name
0,Scott Pence
1,Chad Larson
2,Joanna Rusch
3,Angela Babbitt Pence
4,David Silva
5,Aaron Rusch
6,Timothy Vanderpool


### Messages DataFrame

In [6]:
# parse message data
messages = chess_buds['messages']

# create raw dataframe to preserve data
df_mess_raw = pd.DataFrame(messages)

# duplicate the raw data to a new editable dataframe
# (.copy() makes this a real duplicate; a plain assignment would only bind a
# second name to the same frame, so edits could reach df_mess_raw)
df_mess = df_mess_raw.copy()

# rename a few columns that are staying
df_mess = df_mess.rename(columns={"sender_name":"sender", "is_unsent":"is_draft", "is_taken_down":"is_removed"})

#### Timestamp Formatting

In [7]:
# convert the whole timestamp column in one vectorised call instead of once
# per row; the raw values are interpreted as milliseconds (unit='ms') and no
# timezone conversion is applied here
timestamps = pd.to_datetime(df_mess_raw['timestamp_ms'], unit='ms')

# format as display strings; kept as plain lists so the insert below is
# positional, exactly like the lists a row-by-row loop would produce
date_col = timestamps.dt.strftime('%m/%d/%Y').tolist()
time_col = timestamps.dt.strftime('%I:%M%p').tolist()

# delete the unformatted timestamp column from the dupe datatable
del df_mess['timestamp_ms']

# insert the formatted time and date columns into the dupe datatable
df_mess.insert(0, 'date', date_col)
df_mess.insert(1, 'time', time_col)

#### Reaction Data Formatting

In [8]:
# one count per message: the list length when the raw value is a list,
# otherwise 0
react_col = [
    len(entry) if type(entry) is list else 0
    for entry in df_mess_raw['reactions']
]

# drop the raw reactions column from the working dataframe
del df_mess['reactions']

# place the counts at column position 6 as reaction_count
df_mess.insert(6, 'reaction_count', react_col)

#### Bumped Message Data Formatting

In [9]:
# pull the is_bumped flag out of each bumped_message_metadata value.
# The value is only indexed when it is a dict; a missing value (not a dict)
# or a dict without the key falls back to False instead of raising, matching
# the type guards used for the other nested columns in this notebook.
bump_col = [
    meta.get('is_bumped', False) if isinstance(meta, dict) else False
    for meta in df_mess_raw['bumped_message_metadata']
]

# delete the unformatted bumped_message_metadata column from the dupe datatable
del df_mess['bumped_message_metadata']

# insert the is_bumped column into the dupe datatable
df_mess.insert(8, 'is_bumped', bump_col)

#### Share Link Data Parsing

In [10]:
# flag each message: True when the raw share value is a dict, else False
share_col = [type(entry) is dict for entry in df_mess_raw['share']]

# drop the raw share column from the working dataframe
del df_mess['share']

# place the flags at column position 9 as has_link
df_mess.insert(9, 'has_link', share_col)

#### Photos Data Parsing

In [11]:
# flag each message: True when the raw photos value is a list, else False
photo_col = [type(entry) is list for entry in df_mess_raw['photos']]

# drop the raw photos column from the working dataframe
del df_mess['photos']

# place the flags at column position 10 as has_photo
df_mess.insert(10, 'has_photo', photo_col)

#### Gif Data Parsing

In [12]:
# flag each message: True when the raw gifs value is a list, else False
gif_col = [type(entry) is list for entry in df_mess_raw['gifs']]

# drop the raw gifs column from the working dataframe
del df_mess['gifs']

# place the flags at column position 10 as has_gifs
df_mess.insert(10, 'has_gifs', gif_col)

#### User Data Parsing

In [13]:
# reduce each raw users value to a single string: the 'name' of the first
# entry when the value is a list, otherwise the placeholder '--'
user_col = [
    entry[0]['name'] if type(entry) is list else '--'
    for entry in df_mess_raw['users']
]

# overwrite the users column with the flattened values
df_mess['users'] = user_col

### DataFrame Styling & Display

In [14]:
# hand the messages dataframe to the styling helper; df_mess is rebound to
# the styled result
messages_caption = title + ': Messages'
df_mess = styleTable(df_mess, messages_caption)

# render the styled table
display(df_mess)

Unnamed: 0,date,time,sender,content,type,is_draft,reaction_count,is_removed,is_bumped,has_link,has_gifs,has_photo,users
0,10/21/2022,05:55PM,Joanna Rusch,"Maybe he just wants to ride the publicity for a bit longer, even if he doesn't get any money from the lawsuit. Like, I didn't know his name before this but I certainly do now.",Generic,False,2,False,False,False,False,False,--
1,10/21/2022,05:30PM,Chad Larson,"To be fair to Hans....no one wants to be associated with an ""anal bead"" theory.",Generic,False,2,False,False,False,False,False,--
2,10/21/2022,05:26PM,Chad Larson,He would have to prove he didn't cheat and that they knew he didn't cheat....and made it all up. That's not provable.,Generic,False,2,False,False,False,False,False,--
3,10/21/2022,05:26PM,Scott Pence,"Yeah, no way. You over shoot and hope to get a portion.",Generic,False,2,False,False,False,False,False,--
4,10/21/2022,05:25PM,Chad Larson,"From what I see, I don't think he could win. But he is probably hoping for a good sized settlement",Generic,False,0,False,False,False,False,False,--
5,10/21/2022,05:19PM,Scott Pence,https://youtu.be/EDvK6i86EZ0,Share,False,2,False,False,True,False,False,--
6,10/21/2022,02:27AM,Chad Larson,Hans is suing Magnus and Hikaru for $100 million dollars hahaha,Generic,False,2,False,False,False,False,False,--
7,10/19/2022,04:34PM,Scott Pence,My first great move!,Generic,False,0,False,False,False,False,False,--
8,10/19/2022,04:33PM,Scott Pence,,Generic,False,4,False,False,False,False,True,--
9,10/18/2022,12:19AM,Scott Pence,,Generic,False,4,False,False,False,False,True,--


---

# Observations and Assignment Details

**Messages Table**
- The object (or observation) for this table is a message, recorded with its date and time
- Each row consists of message properties (e.g., sender, content, image files, reaction count)
- Each column has been drilled down to a single, relevant variable

This dataframe meets tidy data principles: it has a single observation type per table (messages over date and time), a single variable per column (none going deeper than a single value), and a single observation per row, meaning we are not comparing multiple things per row.

**Alternatives**
Some alternatives to this table might have been:
- Sort this same list by sender
- Shift to view all message data (e.g., count, total reactions, total shared images, etc.) per sender
- Messages by date

**Visualization Ideas**
I think if this were to have been arranged by sender it might have been easier to drill to a macro level, but for this single one it could be interesting to plot message counts by date or perhaps by time. You could also introduce sender-specific data (e.g., what time of day / which days do certain people post more/less).