In [42]:
import torch
import pandas as pd
from pathlib import Path
from typing import Dict
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelWithLMHead

In [27]:
%%capture
!python prepare_messages.py --tg-history-path "data/result.json" --output-path "data/data.csv"

In [28]:
DATA_PATH = "data/data.csv"

data = load_dataset("csv", data_files=DATA_PATH, split="train")

Downloading and preparing dataset csv/default to /home/consent-flower/.cache/huggingface/datasets/csv/default-fb59fe54b251cb45/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Dataset csv downloaded and prepared to /home/consent-flower/.cache/huggingface/datasets/csv/default-fb59fe54b251cb45/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


In [29]:
print(data)

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 121829
})


In [34]:
data = data.filter(lambda example: example["context_1"] != None)

Filter:   0%|          | 0/121829 [00:00<?, ? examples/s]

In [35]:
print(data)

Dataset({
    features: ['context_3', 'context_2', 'context_1', 'response'],
    num_rows: 79105
})


In [41]:
data[0]

{'context_3': None,
 'context_2': None,
 'context_1': 'норм цвет)',
 'response': 'Грустно видеть Диану с ником куратора\nто есть ты больше не наш куратор?(('}

In [43]:
data = data.train_test_split(test_size=0.2, shuffle=True)
data['train'][0:1]

{'context_3': [None],
 'context_2': [None],
 'context_1': ['берешь хакатон'],
 'response': ['Хакатоны эт другое']}

In [44]:
data['train'][0:1]

{'context_3': [None],
 'context_2': [None],
 'context_1': ['берешь хакатон'],
 'response': ['Хакатоны эт другое']}

In [83]:
FIRST_SPEAKER_TOKEN = '@@ПЕРВЫЙ@@'
SECOND_SPEAKER_TOKEN = '@@ВТОРОЙ@@'

CONTEXT_COLS = ['context_3', 'context_2', 'context_1']
RESPONSE_COL = ['response']

def convert_to_dialog(sample: Dict[str, str]) -> Dict[str, str]:
    """
        Convert sample row to dialogs str format
    """
    c1 = sample['context_1'] # already filtered

    if 'context2' in sample:
        c2 = "" if sample['context_2'] is None else sample['context_2']
    else:
        c2 = ""
    
    if 'context3' in sample:
        c3 = "" if sample['context_3'] is None else sample['context_3']
    else:
        c3 = ""
    
    if 'response' in sample:
        r = "" if sample['response'] is None else sample['response']
    else:
        r = ""

    if c2 == "" and c3 == "":
        result = FIRST_SPEAKER_TOKEN + ' ' + c1 + ' ' + SECOND_SPEAKER_TOKEN + ' ' + r
    elif c2 == "" and c3 != "":
        result = FIRST_SPEAKER_TOKEN + ' ' + c3 + ' ' + FIRST_SPEAKER_TOKEN + ' ' + c1 + ' ' + SECOND_SPEAKER_TOKEN + ' ' + r
    elif c2 != "" and c3 == "":
        result = SECOND_SPEAKER_TOKEN + ' ' + c2 + ' ' + FIRST_SPEAKER_TOKEN + ' ' + c1 + ' ' + SECOND_SPEAKER_TOKEN + ' ' + r
    else:
        result = FIRST_SPEAKER_TOKEN + ' ' + c3 + ' ' + SECOND_SPEAKER_TOKEN + ' ' + c2 + ' ' + \
              FIRST_SPEAKER_TOKEN + ' ' + c1 + ' ' + SECOND_SPEAKER_TOKEN + ' ' + r
    
    return {'text': result}

In [84]:
convert_to_dialog(
    {
        'context_3': 'привет',
        'context_2': 'привет!',
        'context_1': 'как дела?',
        'response': 'супер)'
    }
)

{'text': '@@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@ супер)'}

In [85]:
convert_to_dialog(
    {
        'context_1': 'как дела?',
        'response': 'супер)'
    }
)

{'text': '@@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@ супер)'}

In [86]:
assert convert_to_dialog(
    {
        'context_3': 'привет',
        'context_2': 'привет!',
        'context_1': 'как дела?',
        'response': 'супер)'
    }
) == {'text': '@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет! @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@ супер)'}

assert convert_to_dialog(
    {
        'context_1': 'как дела?',
        'response': 'супер)'
    }
) == {'text': '@@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@ супер)'}

AssertionError: 

In [None]:
tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')