In [17]:
import json
import os, sys
from shutil import rmtree

In [247]:
# Root directory of the export; each entry in it is treated as one conversation.
DATADIR = './data/facebook_conversations/messages'
# Name of the message file looked up inside every conversation directory.
FILENAME = 'message.json'
# Path the merged list of messages is written to.
OUTPUT_FILENAME = './data/facebook_conversations/all_facebook.json'

# Message keys that cause a message to be discarded when present.
UNWANTED_CATEGORIES = {
    'photos',
    'files',
    'videos',
    'gifs',
    'audio_files',
    'plan',
}


In [25]:
# cleaning up unwanted files
def clean_up(root_dir, keep_filename='message.json'):
    """Strip every conversation directory down to its message file.

    WARNING: destructive. Files and directories are permanently deleted.

    For each sub-directory of ``root_dir``:

    * every entry other than ``keep_filename`` is deleted (directories
      recursively, plain files and symlinks individually);
    * if the sub-directory does not contain ``keep_filename`` it is removed
      altogether, since it holds nothing left to parse.

    Entries of ``root_dir`` that are not directories are left untouched.

    Args:
        root_dir: Directory whose sub-directories are conversation folders.
        keep_filename: Name of the only entry to preserve in each folder.
    """
    for entry in os.listdir(root_dir):
        conv_dir = os.path.join(root_dir, entry)

        # Stray top-level files are not conversation folders; listing them
        # would raise NotADirectoryError.
        if not os.path.isdir(conv_dir):
            continue

        has_messages = False
        for name in os.listdir(conv_dir):
            if name == keep_filename:
                has_messages = True
                continue

            path = os.path.join(conv_dir, name)
            # rmtree only works on real directories; it raises on plain files
            # and refuses symlinks, so those are unlinked with os.remove.
            if os.path.isdir(path) and not os.path.islink(path):
                rmtree(path)
            else:
                os.remove(path)

        # Everything except the message file is gone at this point, so a
        # folder without one is now empty and can be removed.
        if not has_messages:
            os.rmdir(conv_dir)



## 'message.json' file structure

Each file has 5 or 6 top-level keys (usually 6):

* **participants** : List of participants as `{'name': Name}`
* **messages** : List of messages. See later.
* **is_still_participant** : Bool. Whether I am still a participant in the conversation
* **thread_type** : *Regular* (friends), *RegularGroup* (group chat) or *Pending* (not friends)
* **thread_path** : path of the folder in which the `message.json` is found
* **title** (Optional) : title of conversation (or name of participants)

#### Messages

* **type** Type of message
    - Generic (*only one we keep*)
    - Call
    - Subscribe
    - Unsubscribe (leave group)
    - Share (send an attachment)
    - Plan (event?)
* **timestamp_ms**
* **sender_name** : *important: tells us who sent each message*
* **content** : *the important part: contains the text of the message*
* **reactions** (Optional) : reactions to a message. Presented as a list of {'actor': Actor, 'reaction' : Reaction}
* **stickers** (Optional) : Contains emoji. Filter stickers... Can discard for now...

**We discard a message if it contains any of the keys listed below.**

It may be possible to parse these messages to see whether the ones that do not match the `Name sent a photo` pattern are useful...

* **photos**
* **gifs**
* **audio_files**
* **plan**
* **videos**
* **files**


In [209]:
def show_file_structure(json_file, chosen_categ=None):
    """Pretty-print selected messages of a conversation file for inspection.

    A message is printed when its ``type`` is ``'Generic'``, it has a
    ``content`` key and, if ``chosen_categ`` is given, it also has that key.

    Args:
        json_file: Path to a JSON file holding a ``'messages'`` list.
        chosen_categ: Optional message key (e.g. ``'photos'``) that a message
            must contain to be shown. ``None`` disables this filter. This used
            to be read from an undefined global, which raised ``NameError``
            on a fresh kernel.
    """
    with open(json_file, encoding='utf-8') as file:
        conversation = json.load(file)

    for message in conversation['messages']:
        # .get: a message without a 'type' key is skipped instead of raising.
        if message.get('type') != 'Generic' or 'content' not in message:
            continue
        if chosen_categ is not None and chosen_categ not in message:
            continue
        print(json.dumps(message, indent=4, sort_keys=True))

In [240]:
# parsing a message.json file

def parse_conversation(json_file, all_conversations):
    """Append the valid messages of one conversation file to a shared list.

    Every message accepted by ``message_is_valid`` is appended as a dict with
    the keys ``'content'``, ``'sender_name'`` and ``'timestamp_ms'``.

    Args:
        json_file: Path to a conversation JSON file.
        all_conversations: List that is extended in place.

    Returns:
        None. The result is delivered through ``all_conversations``.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_file, encoding='utf-8') as file:
        conversation = json.load(file)

    # A file without a 'messages' key contributes nothing instead of crashing.
    for message in conversation.get('messages', []):
        if not message_is_valid(message):
            continue

        all_conversations.append({'content': message['content'],
                                  'sender_name': message['sender_name'],
                                  # The validator does not guarantee this key,
                                  # so a missing timestamp is stored as None
                                  # rather than raising KeyError.
                                  'timestamp_ms': message.get('timestamp_ms')})
        


In [241]:
def message_is_valid(message):
    """Tell whether a message should be kept.

    A message is kept only when all of the following hold:

    * its ``type`` is ``'Generic'`` (a message with no ``type`` key at all is
      not rejected on this ground);
    * it has none of the keys in ``UNWANTED_CATEGORIES``;
    * it has both a ``content`` and a ``sender_name`` key.

    Args:
        message: Dict describing one message.

    Returns:
        bool: True if the message should be kept, False otherwise.
    """
    # Only 'Generic' messages are meant to be kept; other types (calls,
    # subscriptions, shares, ...) previously slipped through unchecked.
    if message.get('type', 'Generic') != 'Generic':
        return False

    # if message contains unwanted category we discard
    if not UNWANTED_CATEGORIES.isdisjoint(message):
        return False

    # weird case: nothing usable without both the text and its sender
    return 'content' in message and 'sender_name' in message

In [248]:
# Merge the valid messages of every conversation into a single JSON file.
all_conversations = []

# sorted(): os.listdir order is arbitrary, so sort for a reproducible output.
for conv_name in sorted(os.listdir(DATADIR)):
    conv_file = os.path.join(DATADIR, conv_name, FILENAME)

    # Skip stray files and conversation folders that hold no message file;
    # opening a missing path would abort the whole run.
    if not os.path.isfile(conv_file):
        continue

    parse_conversation(conv_file, all_conversations)

with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as fout:
    json.dump(all_conversations, fout)
    

In [None]:
### Preprocess the facebook data