In [1]:
import transformers

In [2]:
from transformers import RobertaConfig, RobertaModel

In [3]:
# Default RoBERTa configuration: no overrides are passed to the constructor.
configuration = RobertaConfig()

In [4]:
# Build a RobertaModel directly from the configuration object; no
# `from_pretrained` checkpoint is involved here. Note that `model` is rebound
# to a RobertaForMaskedLM in a later cell.
model = RobertaModel(configuration)

In [5]:
# Rebind `configuration` to the config object held by the model.
configuration = model.config

In [6]:
# Display the model's configuration as the cell output.
model.config

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.3.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [7]:
from transformers import RobertaTokenizer, RobertaForMaskedLM

In [8]:
# Load the tokenizer published under the 'roberta-base' name.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [9]:
# Sanity check: show the token ids produced for a short example string.
tokenizer("Hello world")['input_ids']

[0, 31414, 232, 2]

In [10]:
# Rebind `model` to the 'roberta-base' masked-language-modeling model. This
# replaces the RobertaModel that was built from the bare configuration earlier.
model = RobertaForMaskedLM.from_pretrained('roberta-base')

In [11]:
from datasets import load_dataset

In [12]:
# Read the gzipped JSON review file with the 'json' loader and take its
# 'train' split.
# NOTE(review): the path is absolute and machine-specific; consider deriving
# it from a configurable data directory.
dataset = load_dataset('json', data_files='/scr/amazon-sentiment/Appliances_5.json.gz', split='train')

Using custom data configuration default-49a460ae4ccd0cd4
Reusing dataset json (/sailhome/rmjones/.cache/huggingface/datasets/json/default-49a460ae4ccd0cd4/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02)


In [13]:
# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'style', 'vote', 'image'],
    num_rows: 2277
})

In [14]:
# Start from every column name, then take out the three columns that are kept.
# list.remove raises ValueError if one of the kept columns is not present.
columns_to_remove = list(dataset.features)
for kept_column in ('reviewText', 'summary', 'overall'):
    columns_to_remove.remove(kept_column)

In [15]:
# Display the list of columns that will be dropped.
columns_to_remove

['verified',
 'reviewTime',
 'reviewerID',
 'asin',
 'reviewerName',
 'unixReviewTime',
 'style',
 'vote',
 'image']

In [16]:
# Drop the unneeded columns. Only columns that are still present are passed
# on, so re-running this cell after `dataset` has already been trimmed is a
# no-op instead of an error about unknown column names.
dataset = dataset.remove_columns(
    [name for name in columns_to_remove if name in dataset.column_names]
)

In [17]:
# Add a 'text' column made of the summary, a '. ' separator and the review
# body. A null summary or review body is treated as an empty string so that a
# record lacking one of the two fields does not abort the map with a TypeError.
mapped_dataset = dataset.map(
    lambda row: {'text': (row['summary'] or '') + '. ' + (row['reviewText'] or '')}
)

HBox(children=(FloatProgress(value=0.0, max=2277.0), HTML(value='')))




In [18]:
# Display the dataset summary; it should now list the added 'text' column.
mapped_dataset

Dataset({
    features: ['overall', 'reviewText', 'summary', 'text'],
    num_rows: 2277
})

In [19]:
# Spot-check a single row (index 2000) of the mapped dataset.
mapped_dataset[2000]

{'overall': 3.0,
 'summary': 'Does what it needs to',

In [20]:
def _tokenize_batch(examples):
    """Run the tokenizer over the 'text' column of one batch of examples."""
    return tokenizer(examples['text'])


# Tokenize in batches; the tokenizer's output columns are added to the dataset.
encoded_dataset = mapped_dataset.map(_tokenize_batch, batched=True)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (987 > 512). Running this sequence through the model will result in indexing errors





In [21]:
# Keep only the examples that fit the model's input window. The limit is read
# from the tokenizer (512 for 'roberta-base', as reported by the tokenization
# warning) rather than hard-coded, so it stays correct if the checkpoint
# changes.
encoded_dataset = encoded_dataset.filter(
    lambda row: len(row['input_ids']) <= tokenizer.model_max_length
)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [22]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language-model training batches.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,              # masked-LM objective
    mlm_probability=0.15,  # masking probability passed to the collator
)

In [23]:
from transformers import Trainer, TrainingArguments

# Training configuration: one epoch, fixed seed, output written under
# ./roberta-retrained (overwriting any previous contents).
# NOTE(review): the dataset has at most 2277 rows and the per-device batch
# size is 48, so one epoch is far fewer than 500 steps; with save_steps=500 no
# intermediate checkpoint is expected. Confirm whether an explicit save after
# training is needed.
training_args = TrainingArguments(
    seed=1,
    num_train_epochs=1,
    per_device_train_batch_size=48,
    output_dir="./roberta-retrained",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
)

# Wire the model, the tokenized dataset and the collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=encoded_dataset,
    data_collator=data_collator,
)

In [None]:
# Run training with the Trainer configured above.
trainer.train()

In [None]:
# Align products with reviews: for every gzipped JSON-lines file, bucket the
# review bodies and the summaries by their integer star rating.
#
# Result layout: reviews[name][rating] and summaries[name][rating] are
# parallel lists, where `name` is the file name up to ".json.gz".
#
# NOTE(review): this cell relies on `data_files`, `DATA_HOME`, `gzip`, `os`,
# `json`, `defaultdict` and `tqdm`, none of which are defined or imported in
# the cells visible above -- confirm they are set up before running it.
reviews = {}
summaries = {}
for data_file in data_files:
    local_reviews = defaultdict(list)
    local_summaries = defaultdict(list)
    with gzip.open(os.path.join(DATA_HOME, data_file)) as f:
        for line in tqdm(f):
            record = json.loads(line)
            rating = int(record['overall'])

            # A missing review body is stored as an empty string; newlines are
            # stripped so each review is a single line.
            review_text = record.get('reviewText', "")
            local_reviews[rating].append(review_text.replace("\n", ""))

            # A missing summary is treated as empty; every stored summary is
            # made to end with a period.
            summary = record.get('summary', "")
            if not summary.strip().endswith('.'):
                summary = summary + "."
            local_summaries[rating].append(summary)

    # partition() yields the text before ".json.gz", or the whole name when the
    # suffix is absent. Slicing with find() would silently drop the last
    # character in that case, because find() returns -1.
    dataset_name = data_file.partition(".json.gz")[0]
    reviews[dataset_name] = local_reviews
    summaries[dataset_name] = local_summaries