In [1]:
!pip install transformers==4.2.2

Collecting transformers==4.2.2
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |▏                               | 10kB 20.4MB/s eta 0:00:01[K     |▍                               | 20kB 27.5MB/s eta 0:00:01[K     |▋                               | 30kB 33.0MB/s eta 0:00:01[K     |▊                               | 40kB 27.9MB/s eta 0:00:01[K     |█                               | 51kB 28.4MB/s eta 0:00:01[K     |█▏                              | 61kB 29.5MB/s eta 0:00:01[K     |█▎                              | 71kB 31.0MB/s eta 0:00:01[K     |█▌                              | 81kB 26.3MB/s eta 0:00:01[K     |█▊                              | 92kB 27.9MB/s eta 0:00:01[K     |█▉                              | 102kB 28.1MB/s eta 0:00:01[K     |██                              | 112kB 28.1MB/s eta 0:00:01[K     |██▎                        

In [27]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report
import transformers
# from transformers import AutoModel, BertTokenizerFast
# from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import TextDataset,DataCollatorForLanguageModeling

In [28]:
def build_text_files(texts, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for text in texts:
        data += text + "  "
    f.write(data)


In [29]:
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator


In [30]:
combined_data = pd.read_csv('combined_data.csv')
# combined_data = pd.read_csv('combined_data_sentence_broken.csv')
combined_data.head()

Unnamed: 0,link,subject,name,count,class,text
0,https://forge.medium.com/you-dont-need-more-mo...,motivation,0.txt,358,0,'one greatest talents has always been coming w...
1,https://medium.com/swlh/theres-no-such-thing-a...,motivation,1.txt,1243,0,"highly motivated.', don’t have amazing willpow..."
2,https://medium.com/the-mission/the-most-motiva...,motivation,2.txt,639,0,motivational statement comes down three words:...
3,https://medium.com/swlh/how-to-make-yourself-w...,motivation,3.txt,884,0,"break the chain.”', 'these four simple words h..."
4,https://betterhumans.pub/how-to-do-a-life-chan...,motivation,4.txt,980,0,'when most people think accountability partner...


Motivational


In [7]:
combined_data = combined_data[(combined_data['class']==0)]

train_text, valid_text, train_labels, val_labels = train_test_split(combined_data['text'].tolist(), combined_data['class'].tolist(), 
                                                                    test_size=0.15)
train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

build_text_files(train_text, train_path)
build_text_files(valid_text, test_path)


In [8]:
# tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", do_lower_case=True)
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")


In [9]:
train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [10]:
model = AutoModelWithLMHead.from_pretrained("distilgpt2")




In [11]:
training_args = TrainingArguments(
    output_dir='./motivational.bert_lm',          
    overwrite_output_dir=True, 
    num_train_epochs=3,
    per_device_train_batch_size=32, 
    per_device_eval_batch_size=64,  
    eval_steps = 400, 
    save_steps=800, 
    warmup_steps=500,
    prediction_loss_only=True,
)


trainer = Trainer(
    model=model,  
    args=training_args,
    train_dataset=train_dataset,   
    eval_dataset=test_dataset,     
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=48, training_loss=5.025032043457031, metrics={'train_runtime': 32.7384, 'train_samples_per_second': 1.466, 'total_flos': 93230928101376, 'epoch': 3.0})

In [14]:
trainer.evaluate()

{'epoch': 3.0,
 'eval_loss': 4.8174614906311035,
 'eval_runtime': 0.6301,
 'eval_samples_per_second': 122.201}

In [15]:
# model.save_pretrained("/content/gdrive/MyDrive/models/bert_classification_lm")
trainer.save_model()

In [17]:
from transformers import pipeline

pipline = pipeline('text-generation',model='./motivational.bert_lm', tokenizer='distilgpt2',config={'max_length':800})



In [19]:
pipline('being productive is to')[0]['generated_text']

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'being productive is to use tools that may help,” in this situation, it is likely that you have developed tools that may help the user use tools that may help improve productivity using tools that may help productivity in your own life.\n\n\n'

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
!zip "/content/motivational.bert_lm.zip" "/content/motivational.bert_lm"
!cp "/content/motivational.bert_lm.zip" "/content/drive/MyDrive"

  adding: content/motivational.bert_lm/ (stored 0%)


non Motivational

In [33]:
combined_data = pd.read_csv('combined_data.csv')
combined_data.head()

Unnamed: 0,link,subject,name,count,class,text
0,https://forge.medium.com/you-dont-need-more-mo...,motivation,0.txt,358,0,'one greatest talents has always been coming w...
1,https://medium.com/swlh/theres-no-such-thing-a...,motivation,1.txt,1243,0,"highly motivated.', don’t have amazing willpow..."
2,https://medium.com/the-mission/the-most-motiva...,motivation,2.txt,639,0,motivational statement comes down three words:...
3,https://medium.com/swlh/how-to-make-yourself-w...,motivation,3.txt,884,0,"break the chain.”', 'these four simple words h..."
4,https://betterhumans.pub/how-to-do-a-life-chan...,motivation,4.txt,980,0,'when most people think accountability partner...


In [34]:
combined_data = combined_data[(combined_data['class'] == 1)]

train_text, valid_text, train_labels, val_labels = train_test_split(combined_data['text'].tolist(), combined_data['class'].tolist(), 
                                                                    test_size=0.15)
train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

build_text_files(train_text, train_path)
build_text_files(valid_text, test_path)


In [35]:
train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [36]:
training_args = TrainingArguments(
    output_dir='./nonMotivational.bert_lm',          
    overwrite_output_dir=True, 
    num_train_epochs=3,
    per_device_train_batch_size=32, 
    per_device_eval_batch_size=64,  
    eval_steps = 400, 
    save_steps=800, 
    warmup_steps=500,
    prediction_loss_only=True,
)


trainer = Trainer(
    model=model,                    
    args=training_args,             
    train_dataset=train_dataset,    
    eval_dataset=test_dataset,      
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=48, training_loss=4.80613644917806, metrics={'train_runtime': 33.2198, 'train_samples_per_second': 1.445, 'total_flos': 93230928101376, 'epoch': 3.0})

In [40]:
trainer.evaluate()

{'epoch': 3.0,
 'eval_loss': 4.683315753936768,
 'eval_runtime': 0.6317,
 'eval_samples_per_second': 121.902}

In [41]:
# model.save_pretrained("/content/gdrive/MyDrive/models/bert_classification_lm")
trainer.save_model()

In [42]:
from transformers import pipeline

pipline = pipeline('text-generation',model='./nonMotivational.bert_lm', tokenizer='distilgpt2',config={'max_length':800})



In [48]:
pipline('politic is')[0]['generated_text']

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'politic is about social justice, but does work well for social justice as far as society is concerned. This has taken my life, I have had a difficult time doing so much as I’m ashamed and even angry,’ and I'

In [49]:
!zip "/content/nonMotivational.bert_lm.zip" "/content/nonMotivational.bert_lm"
!cp "/content/nonMotivational.bert_lm.zip" "/content/drive/MyDrive"

  adding: content/nonMotivational.bert_lm/ (stored 0%)
