In [1]:
import os
import random
import json
import datetime


# Sender name treated as the notebook owner: Conv2Prompt labels messages
# whose `sender_name` equals this value as 'Agent', all others as 'Partner'.
me = 'subject'
# Inbox sub-folder whose message_1.json is loaded by the "test user" cell.
test_id = 'sample_user'

# Folder that contains the `inbox/` directory read by the loading cell.
root = 'sample_data/'
# Folder the generated .json / .jsonl training files are written to.
target_dir = 'sample_training_data/'
# NOTE(review): this string is never read in the visible cells; the name
# `training_data` is later rebound to a list of prompt/completion dicts.
training_data = 'sample_training_data/'


In [2]:
def ms2date(timestamp_ms):
    """Turn an epoch timestamp given in milliseconds into a local-time `datetime`."""
    seconds = timestamp_ms/1000.0
    return datetime.datetime.fromtimestamp(seconds)

def fix(text):
    """Repair mojibake by reinterpreting the text's latin1 bytes as UTF-8."""
    raw_bytes = text.encode('latin1')
    return raw_bytes.decode('utf8')
    
def showMessage(message):
    """Print one message: a "sender ---> time" header, its text, then a blank line."""
    sender = message['sender_name']
    sent_at = ms2date(message['timestamp_ms'])
    print(sender, '--->', sent_at)
    body = get_content(message)
    print(body)
    print()

def get_content(message):
    """Return the readable text of a message, or '[gif]' when it has none.

    The text is read from message['content'] and passed through `fix` to
    repair mojibake. When `fix` cannot re-decode the text, the raw content
    is returned unchanged.
    """
    try:
        content = message['content']
    except (LookupError, TypeError):
        # No text body. The same '[gif]' placeholder is used for every
        # message that lacks a 'content' entry.
        return '[gif]'
    try:
        return fix(content)
    except (UnicodeError, AttributeError):
        # UnicodeError: the text is not latin1-encoded UTF-8 (for example it
        # has already been repaired once). AttributeError: content is not a
        # str. Either way keep the value as-is; unlike a bare `except`, this
        # no longer swallows KeyboardInterrupt / SystemExit.
        return content


In [3]:
def split2conversations(messages, limit = 60, reverse=True): #limit in minute
    """Group messages into conversations separated by gaps of silence.

    Args:
        messages: sequence of message dicts, each with a 'timestamp_ms' key.
            The sequence itself is never modified.
        limit: gap in minutes; a message whose timestamp is at least this far
            from the previous message starts a new conversation.
        reverse: when True, walk `messages` back to front
            (presumably because the export lists newest messages first - TODO confirm).

    Returns:
        A list of conversations, each a list of the original message dicts;
        an empty list when `messages` is empty.
    """
    # minute to ms
    limit_ms = limit * 60 * 1000
    # Iterate over a reversed view instead of calling messages.reverse(), so
    # calling the function twice on the same list gives the same result.
    ordered = reversed(messages) if reverse else messages
    conversations = []
    for message in ordered:
        if conversations and abs(message['timestamp_ms'] - conversations[-1][-1]['timestamp_ms']) < limit_ms:
            conversations[-1].append(message)
        else:
            conversations.append([message])
    return conversations


def merge_message(conversation):
    """Collapse runs of consecutive messages from one sender into single turns.

    Args:
        conversation: list of message dicts in chronological order.

    Returns:
        A list of dicts with 'sender_name', 'timestamp_ms' (taken from the
        first message of the run) and 'content' (the texts of the run, each
        obtained through get_content). An empty conversation gives an empty
        list.
    """
    merged = []
    current = None
    for message in conversation:
        sender = message.get('sender_name')
        # `current is not None` guards the very first message, including one
        # that has no 'sender_name' at all.
        if current is not None and sender == current['sender_name']:
            # NOTE(review): follow-up messages are joined with "\n " while the
            # first one ends in "\n"; kept unchanged so generated data does
            # not change - confirm whether the extra space is intended.
            current['content'] += get_content(message) + "\n "
        else:
            current = {'sender_name': sender, 'timestamp_ms': message.get('timestamp_ms'), 'content': get_content(message) + '\n'}
            merged.append(current)
    return merged


In [4]:
# LOAD CONVERSATIONS WITH TEST USER
# `with` closes the file handle as soon as the JSON has been parsed.
with open(os.path.join(root,'inbox', test_id, 'message_1.json'), 'r') as f:
    data = json.load(f)['messages']
conversations = split2conversations(data)


# LOAD ALL CONVERSATIONS
# conversations = []
# for partner in os.listdir(os.path.join(root,'inbox')):
#     with open(os.path.join(root,'inbox', partner, 'message_1.json'), 'r') as f:
#         data = json.load(f)['messages']
#     conversations+= split2conversations(data)


len(conversations)


1

In [5]:
# Preview the first conversation message by message, before merging.
for m in conversations[0]:
    showMessage(m)

sample_user ---> 2021-12-02 17:36:40
Hello!

sample_user ---> 2021-12-02 17:53:20
What's up man?

sample_user ---> 2021-12-02 18:10:00
:)

subject ---> 2021-12-02 18:26:40
All good here, how about you?



In [6]:
# Apply merge_message to every conversation, keeping the conversation order.
merged_conversations = list(map(merge_message, conversations))

In [7]:
# Preview the first conversation again, this time from the merged result.
for m in merged_conversations[0]:
    showMessage(m)

sample_user ---> 2021-12-02 17:36:40
Hello!
What's up man?
 :)
 

subject ---> 2021-12-02 18:26:40
All good here, how about you?




In [8]:
def Conv2Prompt(conversation):
    """Yield the dialogue history each time the agent has replied.

    Every message becomes one "Agent: ..." or "Partner: ..." turn string,
    where 'Agent' is the sender whose name equals the global `me`. After each
    Agent turn that has at least one earlier turn before it, the history up
    to and including that turn is yielded.

    Yields:
        list of str: a fresh copy of the history, so results collected from
        several iterations (e.g. with list()) do not alias one another.
    """
    history = []
    for m in conversation:
        sender = 'Agent' if m['sender_name'] == me else 'Partner'
        turn = sender+': '+get_content(m)+'\n'
        # Collapse the empty / space-only lines at the joins of merged messages.
        history.append(turn.replace('\n \n','\n').replace('\n\n','\n'))
        if sender == 'Agent' and len(history)>1:
            # Copy: the working list keeps growing on later iterations.
            yield list(history)

def Prompt2Training(prompt_dict):
    """Convert a dialogue history into one prompt/completion training record.

    Args:
        prompt_dict: list of turn strings ("Partner: ..." / "Agent: ...")
            whose last element is the Agent reply to be learned.

    Returns:
        dict with 'prompt' (all earlier turns followed by "Agent:") and
        'completion' (the last turn without its leading "Agent:" label,
        prefixed with one space).
    """
    prompt = ''.join(prompt_dict[:-1])
    # Strip only the leading speaker label: an "Agent:" that occurs inside
    # the reply text itself must survive.
    completion = prompt_dict[-1].replace('Agent:', '', 1)
    # NOTE(review): the stripped reply still begins with the space that
    # followed the label, so the completion starts with two spaces; kept
    # unchanged so generated data stays the same - confirm if intended.
    data = {'prompt': prompt+'Agent:','completion': ' '+completion}
    return data

#TEST Prompt2Training
# Smoke test: print every training record built from the first merged
# conversation, each followed by two blank lines.
for d in Conv2Prompt(merged_conversations[0]):
    r = Prompt2Training(d)
    print(r)
    print()
    print()

{'prompt': "Partner: Hello!\nWhat's up man?\n :)\nAgent:", 'completion': '  All good here, how about you?\n'}




In [9]:
#FORMAT MESSAGES TO GPT3 TRAINING DATA FORMAT WITH CONVERSATION HISTORY
training_data = []
for conversation in merged_conversations:
    for d in Conv2Prompt(conversation):
        training_data.append(Prompt2Training(d))

print(len(training_data))
# Shuffle with a dedicated, seeded generator so that Restart & Run All
# reproduces the same record order (and therefore the same output files).
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(training_data)
training_json = json.dumps(training_data)

print(training_data[0])


1
{'prompt': "Partner: Hello!\nWhat's up man?\n :)\nAgent:", 'completion': '  All good here, how about you?\n'}


In [10]:
print(training_json[:1000]+"...")

#file_name = test_id.split('_')[0]
file_name = 'conversations'
# Create the output folder on first run instead of failing with
# FileNotFoundError when it does not exist yet.
if target_dir:
    os.makedirs(target_dir, exist_ok=True)
# SAVE AS JSON
with open(os.path.join(target_dir,file_name+'.json'), "w") as outfile:
    outfile.write(training_json)

# SAVE AS JSONLINES (one JSON object per line)
with open(os.path.join(target_dir,file_name+'.jsonl'), "w") as jsonl_output:
    for entry in training_data:
        json.dump(entry, jsonl_output)
        jsonl_output.write('\n')
        

[{"prompt": "Partner: Hello!\nWhat's up man?\n :)\nAgent:", "completion": "  All good here, how about you?\n"}]...
