In [1]:
import json
import numpy as np
import pandas as pd
import math
import re

## Готовим данные

In [2]:
fields = [
    l for l in open('data_description.txt', 'r').read().split('\n')
    if len(l.strip()) > 0
]

In [3]:
columns = [f.strip().split(': ') for f in fields if not f.startswith(' ')]
col_names = {c[0]: f"{c[0]}: {c[1]}" for c in columns}
col_names['SalePrice'] = 'SalePrice'

In [4]:
last_key = None
fv = {}

for f in fields:
    if not f.startswith(' '):
        last_key = f.split(': ')[0]
        continue
    value_pair = f.strip().split('\t')
    fv[(last_key, value_pair[0])] = value_pair[1]

In [5]:
s = pd.read_csv('train.csv')

In [6]:
row_dict = s.iloc[0]

facts = []
for k in row_dict.keys():
    if k not in col_names:
        continue
    if str(row_dict[k]) == 'nan':
        continue
    facts.append(col_names[k])
    if (k, str(row_dict[k])) in fv:
        facts.append(f'- {fv[(k, str(row_dict[k]))]}')
    else:
        facts.append(f'- {row_dict[k]}')
request = '\n'.join(facts).replace('\n- ', ': ')
request = re.sub('\n[^:]+: (?:Identifies )?', '\n', request)
request = re.sub('\n(\d+)', '\nPrice: $\\1', request)

In [7]:
print(request)

MSSubClass: Identifies the type of dwelling involved in the sale.: 2-STORY 1946 & NEWER
the general zoning classification of the sale.: Residential Low Density
Linear feet of street connected to property: 65.0
Lot size in square feet: 8450
Type of road access to property: Paved
General shape of property: Regular
Flatness of the property: Near Flat/Level
Type of utilities available: All public Utilities (E,G,W,& S)
Lot configuration: Inside lot
Slope of property: Gentle slope
Physical locations within Ames city limits: College Creek
Proximity to various conditions: Normal
Proximity to various conditions (if more than one is present): Normal
Type of dwelling: Single-family Detached
Style of dwelling: Two story
Rates the overall material and finish of the house: Good
Rates the overall condition of the house: Average
Original construction date: 2003
Remodel date (same as construction date if no remodeling or additions): 2003
Type of roof: Gable
Roof material: Standard (Composite) Shingle
E

## Делаем запросы в АПИ

In [None]:
import requests

In [None]:
result = requests.post(
    url='https://llm.api.cloud.yandex.net/llm/v1alpha/instruct',
    headers={'Authorization': f'Bearer {iamtoken}', 'x-folder-id': dir_id},
    json={
      "model": "general",
      "instruction_text": "Ниже перечислены характеристики недвижимости.",
      "request_text": f"{request}\n\nНа основании этой информации о недвижимости ниже напиши продающий текст на русском. Текст не должен содержать лжи и полностью соответствовать данным из характеристик.",
      "generation_options": {
        "max_tokens": 1500,  
        "temperature": 0.5
      }
    }
)

In [None]:
print(request)

In [None]:
print(result.json()['result']['alternatives'][0]['text'])

## Fewshot

In [None]:
import pickle
train_dataset = pickle.load(open('fewshot.pkl', 'rb'))

In [None]:
print("\n------\n\n".join(v['request'] + "\n-----\n" + v['response'] for v in train_dataset))

## Finetune

In [None]:
df = pd.read_json('../squad.json', lines=True)

In [None]:
df

In [None]:
df['request'] = df.context + '\n\nQuestion: ' + df.question

In [None]:
df['response'] = df.ans_texts.str[0]

In [None]:
df = df.sample(df.shape[0])

In [None]:
data = [{'request': t.request, 'response': t.response} for t in df[['request', 'response']].itertuples()]

In [None]:
train = data[:400]

In [None]:
train = json.load(open('train.json', 'r'))

In [None]:
sample = df[~df.request.isin([t['request'] for t in train])].sample().iloc[0]
print(sample['request'])
print('-----')
print(sample['response'])