In [None]:
%load_ext autoreload
%autoreload 2
# default_exp importers.whatsapp

In [None]:
from hashlib import sha256
from integrators.data.schema import *
from integrators.imports import *
from integrators.importers.importer import ImporterBase
from integrators.pod.client import PodClient
import time

# WhatsApp importer

This importer fetches your WhatsApp data, such as contacts, chats and messages (including media files), from the WhatsApp server via a toolset consisting of a Matrix homeserver and its service bridges. It contains two major components: (1) `MatrixClient`, which communicates with the Matrix homeserver via its Client-Server API, and (2) `WhatsAppImporter`, which handles the logic of fetching data from the WhatsApp server and creating the data items/edges that are uploaded to the Pod.

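To run the importer, create an `ImporterRun` that carries the Matrix username and access token plus the `host`, `address`, `prefix` and `bot` generic attributes, and pass it to `WhatsAppImporter.run` together with a `PodClient` (see the last cell of this notebook).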

In [None]:
class MatrixClient:
    """Thin wrapper around the Matrix Client-Server API (r0) endpoints used by the importer.

    All public methods are best-effort: a transport error or a non-200 status
    is printed and reported to the caller as ``None`` (never ``False``), so a
    single ``is None`` check is enough to detect a failed call.

    Args:
        url: Base address of the homeserver, e.g. ``http://localhost:8008``.
        username: Matrix user id the token belongs to (stored, not sent).
        token: Access token, sent as the ``access_token`` query parameter.
        hostname: Server name used in media download paths. When omitted,
            ``download_file`` falls back to the module-level ``HOSTNAME``.
    """

    def __init__(self, url, username, token, hostname=None):
        self.url = url
        self.username = username
        self.token = token
        self.hostname = hostname

    def _request(self, method, path, params=None, body=None, authenticated=True, parse_json=True):
        """Perform one HTTP call and return its decoded payload, or None on failure.

        Args:
            method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
            path: Path appended verbatim to ``self.url``.
            params: Extra query parameters (the token is added when authenticated).
            body: JSON-serialisable request body, if any.
            authenticated: Whether to add the ``access_token`` query parameter.
            parse_json: Return the decoded JSON body when True, raw bytes otherwise.
        """
        query = dict(params or {})
        if authenticated:
            # Passed via ``params`` so the value is URL-encoded by the HTTP library.
            query["access_token"] = self.token
        try:
            response = requests.request(method, f"{self.url}{path}", params=query, json=body)
            if response.status_code != 200:
                print(response, response.content)
                return None
            return response.json() if parse_json else response.content
        except requests.exceptions.RequestException as e:
            print(e)
            return None

    def get_joined_rooms(self):
        """Return the list under ``joined_rooms`` for the current user, or None on failure."""
        payload = self._request("GET", "/_matrix/client/r0/joined_rooms")
        return None if payload is None else payload["joined_rooms"]

    def get_joined_members(self, room_id):
        """Return the ``joined`` mapping (keyed by user id) of a room, or None on failure."""
        payload = self._request("GET", f"/_matrix/client/r0/rooms/{room_id}/joined_members")
        return None if payload is None else payload["joined"]

    def send_messages(self, room_id, body):
        """Post an ``m.room.message`` event to a room and return its event id, or None on failure."""
        payload = self._request("POST", f"/_matrix/client/r0/rooms/{room_id}/send/m.room.message", body=body)
        return None if payload is None else payload["event_id"]

    def get_event_context(self, room_id, event_id):
        """Return the ``events_after`` list for an event (requested with limit=1), or None on failure."""
        payload = self._request(
            "GET",
            f"/_matrix/client/r0/rooms/{room_id}/context/{event_id}",
            params={"limit": 1},
        )
        return None if payload is None else payload["events_after"]

    def sync_events(self, next_batch):
        """Return the full ``/sync`` response since the given batch token, or None on failure."""
        return self._request("GET", "/_matrix/client/r0/sync", params={"since": next_batch})

    def get_profile(self, user_id):
        """Return the profile JSON of a user, or None on failure. Sent without the access token."""
        return self._request("GET", f"/_matrix/client/r0/profile/{user_id}", authenticated=False)

    def get_room_state(self, room_id):
        """Return the decoded ``/state`` response of a room, or None on failure."""
        return self._request("GET", f"/_matrix/client/r0/rooms/{room_id}/state")

    def download_file(self, uri):
        """Download a media file and return its raw bytes, or None on failure.

        Args:
            uri: Media id, i.e. the last path segment of the content URL.
        """
        # Prefer the per-instance server name; the module-level constant is kept
        # as a fallback for callers that construct the client without one.
        hostname = self.hostname if self.hostname is not None else HOSTNAME
        return self._request(
            "GET",
            f"/_matrix/media/r0/download/{hostname}/{uri}",
            authenticated=False,
            parse_json=False,
        )
    

In [None]:
def get_g_attr(item, name, data_type, default_value=None):
    # hide
    """Read a typed value from the first generic attribute of ``item`` named ``name``.

    Args:
        item: Object exposing an iterable ``genericAttribute`` whose elements
            have a ``name`` and the typed value fields listed below.
        name: Name of the generic attribute to look up.
        data_type: One of ``'int'``, ``'bool'``, ``'float'``, ``'string'`` or
            ``'datetime'``; selects which value field is returned.
        default_value: Returned when no attribute with that name exists.

    Returns:
        The selected value field of the matching attribute, or ``default_value``.

    Raises:
        ValueError: If an attribute is found but ``data_type`` is not supported.
    """
    value_fields = {
        'int': 'intValue',
        'bool': 'boolValue',
        'float': 'floatValue',
        'string': 'stringValue',
        # Datetimes are read from the string field, exactly like 'string'.
        'datetime': 'stringValue',
    }

    attribute = next((att for att in item.genericAttribute if att.name == name), None)
    # Identity check: an attribute object with a permissive __eq__ must not be
    # mistaken for "not found".
    if attribute is None:
        return default_value

    if data_type not in value_fields:
        raise ValueError(f"datatype {data_type} is not supported")
    return getattr(attribute, value_fields[data_type])
               

In [None]:
class WhatsAppImporter(ImporterBase):
    """Imports WhatsApp accounts, chats and messages from a Matrix homeserver into the Pod.

    The importer is configured from an importer run (see ``set_matrix_client``)
    and driven through ``run``. While running it keeps three lookup tables:
    ``acc_idx`` (user id -> Account), ``msgchan_idx`` (room id -> MessageChannel)
    and ``msg_idx`` (event id -> Message).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.matrix_client = None
        self.hostname = None
        self.matrix_address = None
        self.prefix_service = None
        self.bot_name = None
        self.username = None
        self.token = None
        self.acc_idx = {}
        self.msgchan_idx = {}
        self.msg_idx = {}
        # Pod client handed to run(); helper methods use it instead of a global.
        self._pod_client = None

    def _pod(self):
        """Return the pod client given to ``run``, else the module-level ``pod_client``."""
        own_client = getattr(self, "_pod_client", None)
        if own_client is not None:
            return own_client
        # Legacy fallback: helpers used to rely on a notebook-level global.
        return pod_client

    def set_matrix_client(self, importer_run):
        """Read the connection settings from ``importer_run`` and build the Matrix client.

        The generic attributes ``host``, ``address``, ``prefix`` and ``bot`` are
        mandatory; username and token come from the run's ``username`` and
        ``password`` fields.
        """
        self.hostname = get_g_attr(importer_run, 'host', 'string', None)
        self.matrix_address = get_g_attr(importer_run, 'address', 'string', None)
        self.prefix_service = get_g_attr(importer_run, 'prefix', 'string', None)
        self.bot_name = get_g_attr(importer_run, 'bot', 'string', None)
        self.username = importer_run.username
        self.token = importer_run.password

        assert self.hostname is not None, "generic attribute 'host' is required"
        assert self.matrix_address is not None, "generic attribute 'address' is required"
        assert self.prefix_service is not None, "generic attribute 'prefix' is required"
        assert self.bot_name is not None, "generic attribute 'bot' is required"

        self.matrix_client = MatrixClient(self.matrix_address, self.username, self.token)

    def get_joined_rooms(self):
        """Return whatever the Matrix client reports as the joined rooms."""
        return self.matrix_client.get_joined_rooms()

    def get_joined_members(self, room):
        """Return the user ids joined to ``room``; an empty list if the lookup failed."""
        joined_members = self.matrix_client.get_joined_members(room)
        if not joined_members:
            # The client signals failure with a falsy value instead of raising.
            return []
        return list(joined_members.keys())

    def get_receivers(self, room):
        """Return the indexed accounts of every member of ``room`` except the importing user.

        Members without an entry in ``acc_idx`` are skipped.
        """
        return [
            self.acc_idx[member]
            for member in self.get_joined_members(room)
            if member != self.username and member in self.acc_idx
        ]

    def get_bot_room_id(self, joined_rooms):
        """Return the first room with exactly two members, one being the bot; None if absent."""
        for room in joined_rooms or []:
            joined_members = self.matrix_client.get_joined_members(room)
            if joined_members and len(joined_members) == 2 and self.bot_name in joined_members:
                return room
        return None

    def bot_list_contacts(self, room_id):
        """Send the ``list contacts`` command to ``room_id`` and return the event id."""
        body = {"msgtype": "m.text", "body": "list contacts"}
        return self.matrix_client.send_messages(room_id, body)

    def get_contacts(self, room_id, event_id):
        """Return the lines of the first event following ``event_id``; empty list if none."""
        events_after = self.matrix_client.get_event_context(room_id, event_id)
        if not events_after:
            # Request failed, or nothing has been posted after the command yet.
            return []
        return events_after[0]["content"]["body"].split("\n")

    def sync_events(self, next_batch):
        """Return the sync response of the Matrix client for the given batch token."""
        return self.matrix_client.sync_events(next_batch)

    def get_room_events(self, events):
        """Return the timeline events of one room entry of a sync response."""
        return events.get("timeline", {}).get("events", [])

    def get_phone_number(self, contact):
        """Extract the phone number from one contact line, or return None.

        Lines starting with ``#`` or ``* /`` are ignored. Otherwise the second
        `` - ``-separated field is taken and its first and last character dropped.
        """
        if contact.startswith(("#", "* /")):
            return None
        parts = contact.split(' - ')
        if len(parts) < 2:
            return None
        return parts[1][1:-1]

    def create_account(self, user_id):
        """Build an Account for ``user_id`` from its Matrix profile.

        Display name and avatar are left as None when the profile lookup fails
        or the profile does not carry them.
        """
        profile = self.matrix_client.get_profile(user_id) or {}
        return Account(
            externalId=user_id,
            displayName=profile.get("displayname"),
            avatarUrl=profile.get("avatar_url"),
            service="whatsapp",
        )

    def create_message_channel(self, room_id, member_accounts):
        """Build a MessageChannel for ``room_id`` with a ``receiver`` edge per member account."""
        room_name = None
        room_topic = None
        for state_event in self.matrix_client.get_room_state(room_id) or []:
            if state_event["type"] == "m.room.name":
                room_name = state_event["content"]["name"]
            if state_event["type"] == "m.room.topic":
                # Assignment, not comparison: the topic has to be stored.
                room_topic = state_event["content"]["topic"]
        message_channel = MessageChannel(externalId=room_id, name=room_name, topic=room_topic)
        for account in member_accounts:
            message_channel.add_edge("receiver", account)
        return message_channel

    def create_message(self, event, room):
        """Build a Message for ``event``, index it, and link it to its channel and sender."""
        message = Message(externalId=event["event_id"], importJson=event["content"], service="whatsapp")
        self.msg_idx[event["event_id"]] = message
        message.add_edge("messageChannel", self.msgchan_idx[room])
        message.add_edge("sender", self.acc_idx[event["sender"]])
        # TODO: link replies (content["m.relates_to"]) via a "replyTo" edge and
        # attach media built by create_media ("video"/"photo"/"audio"/"document").
        return message

    def create_media(self, content):
        """Download the file behind ``content["url"]`` and wrap it in a media item.

        Returns a Photo, Video, Audio or Document with a ``file`` edge, or None
        when the download fails or the ``msgtype`` is not one of those four.
        """
        # The media id is the fourth "/"-separated part of the content URL.
        uri = content["url"].split('/')[3]
        binaries = self.matrix_client.download_file(uri)
        if binaries is None:
            return None
        file = File(externalId=content["body"], sha256=sha256(binaries).hexdigest())
        self._pod().create(file)
        # TODO: upload file

        msgtype = content["msgtype"]
        if msgtype == "m.image":
            media = Photo(externalId=content["url"])
        elif msgtype == "m.video":
            media = Video(externalId=content["url"], duration=content["info"]["duration"])
        elif msgtype == "m.audio":
            media = Audio(externalId=content["url"], duration=content["info"]["duration"])
        elif msgtype == "m.file":
            media = Document(externalId=content["url"], size=content["info"]["size"])
        else:
            return None
        media.add_edge("file", file)
        return media

    def get_all_accounts(self, all_rooms):
        """Build an Account for every contact listed by the bot and every room member."""
        contacts = []
        room_id = self.get_bot_room_id(all_rooms)
        if room_id is not None:
            event_id = self.bot_list_contacts(room_id)
            if event_id is not None:
                # Give the bot a moment to post its reply before reading it.
                time.sleep(1)
                contacts = self.get_contacts(room_id, event_id)

        numbers = [self.get_phone_number(c) for c in contacts]
        users = [f"{self.prefix_service}{n}:{self.hostname}" for n in numbers if n is not None]

        # Append room members that were not listed by the bot, keeping order.
        known_users = set(users)
        for room in all_rooms or []:
            for member in self.get_joined_members(room):
                if member not in known_users:
                    known_users.add(member)
                    users.append(member)
        return [self.create_account(user) for user in users]

    def set_account_index(self, accounts):
        """Index ``accounts`` by their external id."""
        for account in accounts:
            self.acc_idx[account.externalId] = account

    def get_all_messagechannels(self, all_rooms):
        """Create a MessageChannel in the pod for every room and return them."""
        msg_chans = []
        for room in all_rooms or []:
            member_accounts = self.get_receivers(room)
            message_channel = self.create_message_channel(room, member_accounts)
            self._pod().create(message_channel)
            msg_chans.append(message_channel)
        return msg_chans

    def set_msg_chan_index(self, message_channels):
        """Index ``message_channels`` by their external id."""
        for channel in message_channels:
            self.msgchan_idx[channel.externalId] = channel

    def get_all_messages(self, batch="s9_7_0_1_1_1"):
        """Create a Message in the pod for every timeline event of every joined room.

        Args:
            batch: Sync token to start from; defaults to the previously
                hard-coded value.

        Returns:
            The created messages; an empty list when the sync request failed.
        """
        sync_response = self.sync_events(batch)
        if not sync_response:
            return []
        msgs = []
        for room, room_data in sync_response.get("rooms", {}).get("join", {}).items():
            for event in self.get_room_events(room_data):
                message = self.create_message(event, room)
                self._pod().create(message)
                msgs.append(message)
        return msgs

    def run(self, importer_run, pod_client=None, verbose=True):
        """Import accounts, message channels and messages, reporting progress on the run.

        Raises:
            RuntimeError: If the joined rooms cannot be fetched from the homeserver.
        """
        # Remember the client so helper methods write to the same pod as run().
        self._pod_client = pod_client
        self.set_matrix_client(importer_run)
        self.update_run_status(pod_client, importer_run, "running")

        all_rooms = self.get_joined_rooms()
        if all_rooms is None or all_rooms is False:
            raise RuntimeError("Could not fetch the joined rooms from the Matrix homeserver")

        self.update_progress_message(pod_client, importer_run, "importing contacts", verbose=verbose)
        accounts = self.get_all_accounts(all_rooms)
        for account in accounts:
            pod_client.create(account)
        self.set_account_index(accounts)

        self.update_progress_message(pod_client, importer_run, "importing chats", verbose=verbose)
        message_channels = self.get_all_messagechannels(all_rooms)
        for channel in message_channels:
            pod_client.create_edges(channel.get_all_edges())
        self.set_msg_chan_index(message_channels)

        self.update_progress_message(pod_client, importer_run, "importing messages", verbose=verbose)
        messages = self.get_all_messages()
        for msg in messages:
            pod_client.create_edges(msg.get_all_edges())

        print(f"Finished running {self}")

        self.update_run_status(pod_client, importer_run, "done")

In [None]:
import os

# Connection settings for the end-to-end run below. Every value can be
# overridden through the environment; the fallbacks are the values of the
# local development setup this notebook was written against.
DEFAULT_MATRIX_ADDRESS = os.environ.get("MATRIX_ADDRESS", "http://localhost:8008")
HOSTNAME = os.environ.get("MATRIX_HOSTNAME", "synapse")
USERNAME = os.environ.get("MATRIX_USER", "foo")
PREFIX_SERVICE = os.environ.get("MATRIX_WHATSAPP_PREFIX", "@whatsapp_")
MATRIX_USERNAME = f"@{USERNAME}:{HOSTNAME}"
# NOTE(review): secrets should not live in the notebook. The fallbacks below are
# kept only so the cell still runs unchanged; supply real values via the
# environment and rotate these if they were ever valid outside a dev setup.
MATRIX_TOKEN = os.environ.get(
    "MATRIX_TOKEN",
    "MDAxNWxvY2F0aW9uIHN5bmFwc2UKMDAxM2lkZW50aWZpZXIga2V5CjAwMTBjaWQgZ2VuID0gMQowMDFmY2lkIHVzZXJfaWQgPSBAZm9vOnN5bmFwc2UKMDAxNmNpZCB0eXBlID0gYWNjZXNzCjAwMjFjaWQgbm9uY2UgPSAmQHZkbkV2aEdGO0Jsb1NzCjAwMmZzaWduYXR1cmUgAzxgSUYL8xLSwUpbPa3-bHpCD8GnI5mkAVzbOOJufjQK",
)
BOT_NAME = f"@whatsappbot:{HOSTNAME}"
DB_KEY = os.environ.get("POD_DATABASE_KEY", "2DD29CA851E7B56E4697B0E1F08507293D761A05CE4D1B628663F411A8086D99")
OWNER_KEY = os.environ.get("POD_OWNER_KEY", "2DD29CA851E7B56E4697B0E1F08507293D761A05CE4D1B628663F411A8086D99")

pod_client = PodClient(database_key=DB_KEY, owner_key=OWNER_KEY)
# WARNING: delete_all() is called before and after the import, so this cell
# must only ever be pointed at a throw-away pod.
pod_client.delete_all()
whatsapp_importer = WhatsAppImporter()

# The importer reads its Matrix settings from the run: credentials from
# username/password, everything else from named generic attributes.
importer_run = ImporterRun(progress=0, username=MATRIX_USERNAME, password=MATRIX_TOKEN)
importer_run.add_edge('genericAttribute', GenericAttribute(name='host', stringValue=HOSTNAME))
importer_run.add_edge('genericAttribute', GenericAttribute(name='address', stringValue=DEFAULT_MATRIX_ADDRESS))
importer_run.add_edge('genericAttribute', GenericAttribute(name='prefix', stringValue=PREFIX_SERVICE))
importer_run.add_edge('genericAttribute', GenericAttribute(name='bot', stringValue=BOT_NAME))
importer_run.add_edge('importer', whatsapp_importer)
pod_client.create(importer_run)

whatsapp_importer.run(importer_run=importer_run, pod_client=pod_client)
pod_client.delete_all()

RUN STATUS: running
PROGRESS MESSAGE: importing contacts
PROGRESS MESSAGE: importing chats
PROGRESS MESSAGE: importing messages
Finished running WhatsAppImporter (#None)
RUN STATUS: done
