Referrences: https://colab.research.google.com/drive/15wa925dj7jvdvrz8_z3vU7btqAFQLVlG#scrollTo=zVSTlysV2jaM

In [1]:
import os
os.chdir("/content/drive/My Drive/reddit")

In [2]:
! pip -q install transformers

[K     |████████████████████████████████| 3.3 MB 7.1 MB/s 
[K     |████████████████████████████████| 61 kB 79 kB/s 
[K     |████████████████████████████████| 895 kB 50.7 MB/s 
[K     |████████████████████████████████| 3.3 MB 60.5 MB/s 
[K     |████████████████████████████████| 596 kB 62.6 MB/s 
[?25h

In [3]:
"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""

import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

# Configs
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [4]:
# Args to allow for easy convertion of python script to notebook
class Args():
    def __init__(self):
        self.output_dir = 'output-medium'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-medium'
        self.config_name = 'microsoft/DialoGPT-medium'
        self.tokenizer_name = 'microsoft/DialoGPT-medium'
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 1
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 3
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

args = Args()

In [5]:
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-large")

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [6]:
# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,  
        no_repeat_ngram_size=3,       
        do_sample=True, 
        top_k=100, 
        top_p=0.7,
        temperature = 0.8
    )
    
    # pretty print last ouput tokens from bot
    print("CryptoBot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:What do you know about bitcoin?
CryptoBot: You don't need to know anything about bitcoin to know that it's not a good investment.
>> User:Why?
CryptoBot: Because it's a speculative investment. The market is unpredictable.
>> User:What about ethereum?
CryptoBot: That's the one I've been looking at.
>> User:Is it better than bitcoin?
CryptoBot: I haven't tried it yet.
>> User:What do you suggest me buy then?
CryptoBot: Why do you want to buy?


# Data

## Data collection

Major reddit communities (subreddits) related to the topic Crypto:
- CryptoCurrency 4.1m member
- ethereum 1.2m 
- dogecoin 2.2m
- CryptoMarkets 676k 
- Crypto_Currency_News 121k


### Reddit API using PRAW

top 20 of most popular post thread of all time from each community/subreddit. 

In [None]:
pip install praw

Collecting praw
  Downloading praw-7.5.0-py3-none-any.whl (176 kB)
[K     |████████████████████████████████| 176 kB 5.0 MB/s 
[?25hCollecting websocket-client>=0.54.0
  Downloading websocket_client-1.2.3-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.8 MB/s 
[?25hCollecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: websocket-client, update-checker, prawcore, praw
Successfully installed praw-7.5.0 prawcore-2.3.0 update-checker-0.18.0 websocket-client-1.2.3


In [None]:
import praw
reddit = praw.Reddit(
    user_agent="Comment Extraction (by u/USERNAME)",
    client_id="E3blNGsEC5BNeG0OERHr7Q",
    client_secret="EFeePuCd0B0N4Qn-xOKOrGfD4T-Ajg",
    redirect_uri='http://127.0.0.1'
    # username="USERNAME",
    # password="PASSWORD"
)

In [None]:
import time
from tqdm.notebook import tqdm

In [None]:
url_cryptocurrency = ['https://www.reddit.com/r/CryptoCurrency/comments/n7rl2y/you_hear_about_the_kid_who_put_in_500_into_a/',
        'https://www.reddit.com/r/CryptoCurrency/comments/noztp7/binance_ceo_cz_shades_elon_musk_in_tweet_when_you/',
        'https://www.reddit.com/r/CryptoCurrency/comments/razvtu/aoc_reveals_she_doesnt_hold_bitcoin_because_she/',
        'https://www.reddit.com/r/CryptoCurrency/comments/nch8rm/its_been_a_crazy_ride_these_past_7_years_but_im/',
        'https://www.reddit.com/r/CryptoCurrency/comments/mntcz3/elon_musk_is_not_one_of_us_stop_using_him_as_a/',
        'https://www.reddit.com/r/CryptoCurrency/comments/p6eppn/britney_spears_has_been_using_bitcoin_since_2014/',
        'https://www.reddit.com/r/CryptoCurrency/comments/ngwelr/elon_musks_affect_on_crypto_is_completely/',
        'https://www.reddit.com/r/CryptoCurrency/comments/lfrslo/reminder_robinhood_blocked_several_stocks_from/',
        'https://www.reddit.com/r/CryptoCurrency/comments/nb0yz5/elon_musk_tesla_stops_accepting_bitcoin_as/',
        'https://www.reddit.com/r/CryptoCurrency/comments/7r0ftz/cryptonick_is_deleting_all_of_his_bitconnect/',
        'https://www.reddit.com/r/CryptoCurrency/comments/pw0kgt/leaked_documents_show_that_citadels_ceo_lied/',
        'https://www.reddit.com/r/CryptoCurrency/comments/p3tff2/my_daughter_is_dating_a_douchebag/',
        'https://www.reddit.com/r/CryptoCurrency/comments/qkai4d/squid_game_from_2856_to_00008_in_10_minutes/',
        'https://www.reddit.com/r/CryptoCurrency/comments/o6k1a8/john_mcafee_found_dead_in_prison_cell_after/',
        'https://www.reddit.com/r/CryptoCurrency/comments/lijzxr/if_i_had_bought_btc_in_the_early_2010s_at_10/',
        'https://www.reddit.com/r/CryptoCurrency/comments/mtghzc/what_are_some_ways_to_earn_some_crypto_2_for/',
        'https://www.reddit.com/r/CryptoCurrency/comments/n1vwak/a_wallet_is_not_what_you_think_it_is/',
        'https://www.reddit.com/r/CryptoCurrency/comments/o7k0b4/i_made_a_post_about_the_scam_opishub_yesterday/',
        'https://www.reddit.com/r/CryptoCurrency/comments/mwwffa/people_that_say_imagine_if_dogecoin_went_to_10_or/']


In [None]:
url_ethereum = [
                'https://www.reddit.com/r/ethereum/comments/n5wjjq/wonderful_explanation_of_whats_ethereum/',
                'https://www.reddit.com/r/ethereum/comments/oyglcw/its_here_folks_the_london_upgrade/',
                'https://www.reddit.com/r/ethereum/comments/7mve4y/vitalik_buterin_cryptocurrency_should_focus_less/',
               'https://www.reddit.com/r/ethereum/comments/qxxyet/nft/',
                'https://www.reddit.com/r/ethereum/comments/l6c3kx/reddit_announces_partnership_with_the_ethereum/',
                'https://www.reddit.com/r/ethereum/comments/nswuyf/bitcoin_miami_conference_warns_attendees_its_a/',
                'https://www.reddit.com/r/ethereum/comments/7qckfb/the_ethereum_blockchain_now_processes_about_as/',
                'https://www.reddit.com/r/ethereum/comments/nrfu99/mark_mic_dropping/',
                'https://www.reddit.com/r/ethereum/comments/o7sx7x/i_see_everyone_getting_exited_over_burning_eth_am/',
                'https://www.reddit.com/r/ethereum/comments/njdhxn/goldman_sachs_calls_ethereum_the_amazon_of/',
                'https://www.reddit.com/r/ethereum/comments/ngb137/if_only_it_were_this_easy/',
                'https://www.reddit.com/r/ethereum/comments/pl2149/lets_say_i_have_100_eth_sitting_in_a_wallet_that/',
                'https://www.reddit.com/r/ethereum/comments/7f0872/fight_to_save_net_neutrality_today/',
                'https://www.reddit.com/r/ethereum/comments/7lrlrs/on_the_door_to_my_european_repair_shop_looking/',
                'https://www.reddit.com/r/ethereum/comments/p1jktl/i_think_this_is_very_important_ive_never_agreed/',
                'https://www.reddit.com/r/ethereum/comments/ox7tel/paid_for_my_dunkins_with_eth_had_to_show_you_guys/',
                'https://www.reddit.com/r/ethereum/comments/7ruz2a/canada_trialing_use_of_ethereum_blockchain_to/',
                'https://www.reddit.com/r/ethereum/comments/ozw2c6/accepting_eth_at_my_farmers_market_stand/',
                'https://www.reddit.com/r/ethereum/comments/lv9a77/poll_should_we_ban_nft_advertisements_from/',
                'https://www.reddit.com/r/ethereum/comments/p7j9mw/this_sub_is_getting_astroturfed_by_bitcoin/']

In [None]:
url_news = ['https://www.reddit.com/r/Crypto_Currency_News/comments/p3gxs1/cardano_ada_just_became_the_3rd_most_valuable/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/prph6t/el_salvador_buys_the_bitcoin_dip_again_according/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/q4ipwq/shiba_inu_took_14_months_to_hit_a_market_value_of/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/pizhud/vast_becomes_first_federally_chartered_us_bank_to/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/p9sb9g/bitcoin_above_50000_while_cardano_is_busy_making/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/q5pilq/edward_snowden_government_digital_coins_cbdcs_are/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/od1i2u/elon_musk_is_losing_his_power_over_the_crypto/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/n32ube/elon_musk_confirms_dogecoin_will_be_part_of_his/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/q6hp2u/jpmorgan_ceo_again_calls_bitcoin_worthless_edward/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/neyifd/cardano_ada_flawlessly_beats_bitcoin_gold_and/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/qpa6l7/crypto_whale_buys_20_trillion_shiba_inu_tokens/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/pp768s/amc_the_biggest_cinema_chain_worldwide_with_1004/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/7pxqhw/top_100_crypto_currencies_described_in_4_words_or/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/nea0r7/mark_cuban_says_dallas_mavericks_will_be_pleased/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/phhx5t/cardano_ada_gives_unequivocal_no_to_vaccine/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/nsywmj/they_just_posted_this_regarding_elon/',
            'https://www.reddit.com/user/google/comments/qshgl6/megathread_the_best_of_google_available_on/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/q52dkb/president_nayib_bukele_says_el_salvador_will_use/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/7yd9d8/just_kidding_thats_us_dollar/',
            'https://www.reddit.com/r/Crypto_Currency_News/comments/ouf520/michael_saylor_bitcoin_is_digital_real_estate_and/']

In [None]:
url_market = ['https://www.reddit.com/r/CryptoMarkets/comments/ms96il/when_youve_spent_the_past_4_years_carefully/',
              'https://www.reddit.com/r/CryptoMarkets/comments/7qvw5f/weak_hands_take_notice/',
              'https://www.reddit.com/r/CryptoMarkets/comments/n184f8/i_just_keep_hodling_my_bags/',
              'https://www.reddit.com/r/CryptoMarkets/comments/nn3gnj/the_3_rules_of_the_crypto_club/',
              'https://www.reddit.com/r/CryptoMarkets/comments/nrwfze/elon_musk_is_becoming_the_most_hated_person_in/',
              'https://www.reddit.com/r/CryptoMarkets/comments/ncubti/the_results_are_in_and_they_look_pretty_conclusive/',
              'https://www.reddit.com/r/CryptoMarkets/comments/nqfqc9/the_king_of_hodl/',
              'https://www.reddit.com/r/CryptoMarkets/comments/n7p6lg/getin_ready_for_moon/',
              'https://www.reddit.com/r/CryptoMarkets/comments/n8j6cp/daddy_musk_is_not_gonna_like_it_but_its_true/',
              'https://www.reddit.com/r/CryptoMarkets/comments/lgaiku/there_is_a_dogecoin_wallet_that_holds_over_2/',
              'https://www.reddit.com/r/CryptoMarkets/comments/mtbkvk/hello_darkness_my_old_friend/',
              'https://www.reddit.com/r/CryptoMarkets/comments/mwkl5t/hahaha_i_aint_scared_im_keeping_my_crypto/',
              'https://www.reddit.com/r/CryptoMarkets/comments/nsmiwx/reminiscing_those_good_old_days_when_btc_is_on/',
              'https://www.reddit.com/r/CryptoMarkets/comments/nhxwbu/maybe_ban_china/',
              'https://www.reddit.com/r/CryptoMarkets/comments/o7ive5/buy_the_dip/',
              'https://www.reddit.com/r/CryptoMarkets/comments/mrb6rs/take_me_to_the_moon/',
              'https://www.reddit.com/r/CryptoMarkets/comments/mzbzak/elon_musk_owns_bitcoin_tesla_owns_bitcoin_those/',
              'https://www.reddit.com/r/CryptoMarkets/comments/mxgmpk/bought_at_615k_last_saturday_then_sunday_comes/',
              'https://www.reddit.com/r/CryptoMarkets/comments/n748pu/choose_wisely/',
              'https://www.reddit.com/r/CryptoMarkets/comments/np2egz/me_waiting_for_100k_btc/',
              ]

In [None]:
urls = url_cryptocurrency 
#+ url_news + url_ethereum + url_market

Only use the first level comments, ignoring branches

In [None]:
!rm crypto_reddit_comments.txt
post_dict = []
for url in tqdm(urls):
  submission = reddit.submission(url=url)
  submission.comments.replace_more(limit=0) # flatten tree, adjust here to include branching comments
  comments = submission.comments.list() # all comments
  title_text = submission.title
  post_dict.append({'title': title_text, 'comments': [comment.body for comment in comments]})
  # time.sleep(1.5)

In [None]:
import json
with open('crypto_reddit_comments.txt',  'w') as outfile:
    json.dump(post_dict, outfile)  

In [None]:
post_dict

Output hidden; open in https://colab.research.google.com to view.

#### Extract by subreddit (limited by 1000 most recent post; discarded)

In [None]:
communities = ['CryptoCurrency', 'ethereum', 'dogecoin', 'ethtrader', 'Bitcoin']

In [None]:
post_dict = []
with open('drive/MyDrive/reddit/crypto_reddit.txt', 'a') as f:
  for community in communities:
    for submission in reddit.subreddit(community).hot(limit=1000):
      submission.comments.replace_more(limit=0)
      author_text = submission.selftext
      comments = submission.comments.list()
      for comment in comments:
        f.write(comment.body + '\n')
      post_dict.append({'author': author_text, 'comments': [comment.body for comment in comments]})



### Pushshift (postponed)

There are two main wrappers created in Python for Pushshift, psaw, and **pmaw**. For the creation of large datasets I would recommend using **pmaw**, it’s a package I created that is highly optimized for extracting large amounts of data, running 1.79x faster than psaw from the benchmarks I performed with up to 400,000 submissions. **pmaw** has built-in rate-limiting, pagination, and runs requests on multiple threads, all we have to do is define our query based on the Pushshift endpoint parameters.

Referrence: https://medium.com/swlh/how-to-scrape-large-amounts-of-reddit-data-using-pushshift-1d33bde9286

In [None]:
pip install pmaw pandas


Collecting pmaw
  Downloading pmaw-2.1.1-py3-none-any.whl (25 kB)
Collecting praw
  Downloading praw-7.5.0-py3-none-any.whl (176 kB)
[K     |████████████████████████████████| 176 kB 17.7 MB/s 
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting websocket-client>=0.54.0
  Downloading websocket_client-1.2.3-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.7 MB/s 
[?25hCollecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Installing collected packages: websocket-client, update-checker, prawcore, praw, pmaw
Successfully installed pmaw-2.1.1 praw-7.5.0 prawcore-2.3.0 update-checker-0.18.0 websocket-client-1.2.3


In [None]:
import pandas as pd
from pmaw import PushshiftAPI
api = PushshiftAPI()

In [None]:
import datetime as dt
before = int(dt.datetime(2021,12,1,0,0).timestamp())
after = int(dt.datetime(2021,1,1,0,0).timestamp())

In [None]:
subreddit="CryptoCurrency"
# limit=100000
comments = api.search_comments(subreddit=subreddit)
print(f'Retrieved {len(comments)} comments from Pushshift')

INFO:pmaw.PushshiftAPIBase:22419474 result(s) available in Pushshift
INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 100.00% - Requests: 100 - Batches: 10 - Items Remaining: 22409577
INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 100.00% - Requests: 200 - Batches: 20 - Items Remaining: 22399780
INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 100.00% - Requests: 300 - Batches: 30 - Items Remaining: 22389884
INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 100.00% - Requests: 400 - Batches: 40 - Items Remaining: 22379984
INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 100.00% - Requests: 500 - Batches: 50 - Items Remaining: 22370086
INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 100.00% - Requests: 600 - Batches: 60 - Items Remaining: 22360087
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 630 - Batches: 63 - Items Remaining: 22357088
Retrieved 62386 comments from Pushshift


In [None]:
comments.responses[1]

{'all_awardings': [],
 'associated_award': None,
 'author': 'MrMoustacheMan',
 'author_flair_background_color': '',
 'author_flair_css_class': None,
 'author_flair_richtext': [{'e': 'text', 't': ' Vires in Numeris'}],
 'author_flair_template_id': None,
 'author_flair_text': ' Vires in Numeris',
 'author_flair_text_color': 'dark',
 'author_flair_type': 'richtext',
 'author_fullname': 't2_9uzrl',
 'author_patreon_flair': False,
 'author_premium': True,
 'awarders': [],
 'body': 'No CEX will get my ETH, waiting for RocketPool',
 'collapsed_because_crowd_control': None,
 'comment_type': None,
 'created_utc': 1610912686,
 'gildings': {},
 'id': 'gjn2hf2',
 'is_submitter': False,
 'link_id': 't3_kyuo5b',
 'locked': False,
 'no_follow': True,
 'parent_id': 't1_gjmxyam',
 'permalink': '/r/CryptoCurrency/comments/kyuo5b/daily_discussion_january_17_2021_gmt0/gjn2hf2/',
 'retrieved_on': 1611025014,
 'score': 1,
 'send_replies': True,
 'stickied': False,
 'subreddit': 'CryptoCurrency',
 'subreddit

In [None]:
comments_df = pd.DataFrame(comments)
# preview the comments data
comments_df.head(5)

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,author_fullname,author_patreon_flair,author_premium,awarders,body,collapsed_because_crowd_control,comment_type,created_utc,gildings,id,is_submitter,link_id,locked,no_follow,parent_id,permalink,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,top_awarded_type,total_awards_received,treatment_tags,distinguished,media_metadata,collapsed_reason_code,archived,body_sha1,can_gild,collapsed,collapsed_reason,controversiality,gilded,retrieved_utc,score_hidden,subreddit_name_prefixed,subreddit_type,unrepliable_reason,author_cakeday,edited
0,[],,unc4l1n,,,[],,,,text,t2_4613kbtu,False,True,[],"Probably not this dip, but within 3 weeks or so.",,,1610912695,{},gjn2i8w,False,t3_kyuo5b,False,True,t1_gjn0x6u,/r/CryptoCurrency/comments/kyuo5b/daily_discus...,1611025000.0,1,True,False,CryptoCurrency,t5_2wlj3,,0,[],,,,,,,,,,,,,,,,,
1,[],,MrMoustacheMan,,,"[{'e': 'text', 't': ' Vires in Numeris'}]",,Vires in Numeris,dark,richtext,t2_9uzrl,False,True,[],"No CEX will get my ETH, waiting for RocketPool",,,1610912686,{},gjn2hf2,False,t3_kyuo5b,False,True,t1_gjmxyam,/r/CryptoCurrency/comments/kyuo5b/daily_discus...,1611025000.0,1,True,False,CryptoCurrency,t5_2wlj3,,0,[],,,,,,,,,,,,,,,,,
2,[],,[deleted],,,,,,dark,,,,,[],[deleted],,,1610912679,{},gjn2grp,False,t3_kzaaq0,False,True,t1_gjn12ty,/r/CryptoCurrency/comments/kzaaq0/sometimes_im...,1611025000.0,1,True,False,CryptoCurrency,t5_2wlj3,,0,[],,,,,,,,,,,,,,,,,
3,[],,cryptomhanks,,Transitioning,"[{'e': 'text', 't': ' '}]",,,dark,richtext,t2_16ffgpmu,False,False,[],Thoughts on BnB guys ?,,,1610912677,{},gjn2gko,False,t3_kyuo5b,False,True,t3_kyuo5b,/r/CryptoCurrency/comments/kyuo5b/daily_discus...,1611025000.0,3,True,False,CryptoCurrency,t5_2wlj3,,0,[],,,,,,,,,,,,,,,,,
4,[],,iwakan,,Ethereum,"[{'e': 'text', 't': ' Hermit'}]",,Hermit,dark,richtext,t2_17hauh,False,False,[],Look in the sidebar and you can see how many y...,,,1610912652,{},gjn2e77,False,t3_kyuo5b,False,True,t1_gjn0iht,/r/CryptoCurrency/comments/kyuo5b/daily_discus...,1611025000.0,1,True,False,CryptoCurrency,t5_2wlj3,,0,[],,,,,,,,,,,,,,,,,


In [None]:
comments_df.to_csv('drive/MyDrive/reddit/cryptocurrency_comments.csv', header=True, index=False, columns=list(comments_df.axes[1]))

In [None]:
for i in range(10):
  print(i, comments_df['body'][i], '\n')

0 Probably not this dip, but within 3 weeks or so. 

1 No CEX will get my ETH, waiting for RocketPool 

2 [deleted] 

3 Thoughts on BnB guys ? 

4 Look in the sidebar and you can see how many you have or can claim. 

5 Exactly what I thought. Lol. Thanks! 

6 It doesn't matter who writes code or hosts what or anything.

The website falls under their responsibility, and they failed. It means they cyber security processes are flawed.

Which means: what other flaws are there? Are some related to their actual products?


When your entire business is about security and trust, this is absolutely a significant breach of trust. 

7 You think everyone here is 16? I allready own a house. 

8 It's a new one if making lots of money is a new one for you. 4chan was first to BTC, ETH, NEO (Antshares at the time) &amp; LINK. 

9 Thanks 🙏🏻 



In [None]:
for i in range(10):
  print(comments_df['permalink'][i])

/r/CryptoCurrency/comments/kyuo5b/daily_discussion_january_17_2021_gmt0/gjn2i8w/
/r/CryptoCurrency/comments/kyuo5b/daily_discussion_january_17_2021_gmt0/gjn2hf2/
/r/CryptoCurrency/comments/kzaaq0/sometimes_im_glad_i_didnt_invest_early_in_btc/gjn2grp/
/r/CryptoCurrency/comments/kyuo5b/daily_discussion_january_17_2021_gmt0/gjn2gko/
/r/CryptoCurrency/comments/kyuo5b/daily_discussion_january_17_2021_gmt0/gjn2e77/
/r/CryptoCurrency/comments/kz9ano/65_say_they_would_consider_selling_bitcoin_if_the/gjn2ctp/
/r/CryptoCurrency/comments/kyuo5b/daily_discussion_january_17_2021_gmt0/gjn2bcl/
/r/CryptoCurrency/comments/kyuo5b/daily_discussion_january_17_2021_gmt0/gjn293b/
/r/CryptoCurrency/comments/kyuo5b/daily_discussion_january_17_2021_gmt0/gjn27xu/
/r/CryptoCurrency/comments/kyuo5b/daily_discussion_january_17_2021_gmt0/gjn24k6/


In [None]:
comments_df[comments_df.link_id == 't3_kyuo5b'].to_csv('drive/MyDrive/reddit/check.csv', header=True, index=False, columns=list(comments_df.axes[1]))

In [None]:
comments_df.columns

Index(['all_awardings', 'associated_award', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'body', 'collapsed_because_crowd_control', 'comment_type',
       'created_utc', 'gildings', 'id', 'is_submitter', 'link_id', 'locked',
       'no_follow', 'parent_id', 'permalink', 'retrieved_on', 'score',
       'send_replies', 'stickied', 'subreddit', 'subreddit_id',
       'top_awarded_type', 'total_awards_received', 'treatment_tags',
       'distinguished', 'media_metadata', 'collapsed_reason_code', 'archived',
       'body_sha1', 'can_gild', 'collapsed', 'collapsed_reason',
       'controversiality', 'gilded', 'retrieved_utc', 'score_hidden',
       'subreddit_name_prefixed', 'subreddit_type', 'unrepliable_reason',
       'author_cakeday', 'ed

In [None]:
!python --version

Python 3.7.12


## Data preparation

In [5]:
import pandas as pd

Tokenizer
This step partly follows Zhang, et al. (2020) paper on training of DialoGPT by removing the instances where:

1. there is a URL in source or target, 
4. where the response contains special markers such as “[” or “]”, as this could be markup language, 
5. where source and target sequences together are longer than 200 words, 
6. where the source and target sequences are of meaningless maintainance language such as '[deleted]'


In [6]:
def clean_sentence(raw_lines):
  clean_lines = []
  for line in raw_lines:
      line = line.strip('\n').strip('&nbsp').replace('*', '').strip('-').replace('#', '').replace('\n', '')
      if ('http' not in line and 
          'NOTE' not in line and 
          '[deleted]' not in line and 
          '[removed]' not in line and 
          '*I am a bot, ' not in line and 
          'Your comment was removed because' not in line and
          len(line.strip().split(" "))<200):
          clean_lines.append(line)
  return clean_lines

In [7]:
import nltk.data
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

import json
with open('crypto_reddit_comments.txt') as json_file:
    data = json.load(json_file)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
len(data)

79

In [9]:
cleaned_reddits = []
for submission in data[:20]:
  if submission['comments'] != []:
      cleaned_reddits.append({'title': clean_sentence([submission['title']])[0],
                              'comments': clean_sentence(submission['comments'])})

In [10]:
len(cleaned_reddits)

20

In [11]:
contexted = []

n = 6

for cleaned_reddit in cleaned_reddits:  
  for i in range(n, len(cleaned_reddit['comments'])):
    row = []
    prev = i - 1 - n # we additionally substract 1, so row will contain current responce and 6 previous responces  
    for j in range(i, prev, -1):
      row.append(cleaned_reddit['comments'][j])
    row.append(cleaned_reddit['title']) # the title of the thread used as context as well
    contexted.append(row)  

In [12]:
len(contexted)

8850

In [13]:
columns = ['response', 'context'] 
columns = columns + ['context/'+str(i) for i in range(n)]
columns

['response',
 'context',
 'context/0',
 'context/1',
 'context/2',
 'context/3',
 'context/4',
 'context/5']

In [14]:
df = pd.DataFrame.from_records(contexted, columns=columns)
df.head(5)

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
0,You never hear about people like me that start...,Yeah imagine being the one to buy at the top o...,Everybody talks about the Amazon that they tho...,I'm one of those guys who fomo'd in 2017 and e...,Survivorship bias is real and it's everywhere,You also don’t hear about me turning my $20 in...,My buddy (who's a successful business person w...,You hear about the kid who put in $500 into a ...
1,You also don't hear about the early Bitcoin in...,You never hear about people like me that start...,Yeah imagine being the one to buy at the top o...,Everybody talks about the Amazon that they tho...,I'm one of those guys who fomo'd in 2017 and e...,Survivorship bias is real and it's everywhere,You also don’t hear about me turning my $20 in...,You hear about the kid who put in $500 into a ...
2,I turned $0.06 into $8.28,You also don't hear about the early Bitcoin in...,You never hear about people like me that start...,Yeah imagine being the one to buy at the top o...,Everybody talks about the Amazon that they tho...,I'm one of those guys who fomo'd in 2017 and e...,Survivorship bias is real and it's everywhere,You hear about the kid who put in $500 into a ...
3,This isnt an reddit post anymore. This is life...,I turned $0.06 into $8.28,You also don't hear about the early Bitcoin in...,You never hear about people like me that start...,Yeah imagine being the one to buy at the top o...,Everybody talks about the Amazon that they tho...,I'm one of those guys who fomo'd in 2017 and e...,You hear about the kid who put in $500 into a ...
4,The need to “Win more” (greed) is a weird quir...,This isnt an reddit post anymore. This is life...,I turned $0.06 into $8.28,You also don't hear about the early Bitcoin in...,You never hear about people like me that start...,Yeah imagine being the one to buy at the top o...,Everybody talks about the Amazon that they tho...,You hear about the kid who put in $500 into a ...


Split into train and test sets

In [15]:
from sklearn.model_selection import train_test_split
trn_df, val_df = train_test_split(df, test_size = 0.1)
trn_df.head()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
4802,You wouldn’t have lost any if they wouldn’t h...,Sneaky hobbitse,Hating Robinhood is part of the culture,All our homies should hate them,My mom hate them,Yup. That was why everyone used robinhood. The...,They turned off selling when their buddies wer...,Leaked Documents Show that Citadels CEO Lied U...
1158,"Not only they hold investments, they also rece...",In my opinion they should just have a selectio...,"Yeah, even with a blind trust they could still...",This. Similar to company execs they should onl...,This is an idea. But still could lead them to ...,Most employees of publicly traded companies ar...,This right here.,AOC reveals she doesn't hold bitcoin because s...
6442,Thanks for the info,I'm not sure what understanding the potential ...,Even if that guy held onto 10-100 BTC I doubt ...,To be fair part of what got Bitcoin rolling wa...,That guy did more for bitcoin than any of us e...,"Starlink, calling it now.",So what you're saying is you have a friend who...,"""If I had bought BTC in the early 2010's at 10..."
8751,I personally dont think im to concerned about ...,>If they're eating a portion of transactions w...,Except when you handle payments by using an ex...,Thanks. I didn't want to sound too confident ...,So eth is use and ethereum is ebay?? Oh no!,Sweet. I’m slowly but surely understanding mor...,"No worries, completely understandable as it mu...",Wonderful explanation of what's Ethereum.
403,Still see the hammer when I close my eye,I mean IN your ear,I suppose the key to success here is to not dw...,"Haha, I’m actually in it but it’s my gamble money",To hodl or to cancle? That is the question.,I fucked myself over with Dragonchain.,Thanks. Gonna check it out right away.,You hear about the kid who put in $500 into a ...


Now will convert our dataset in a format suitable for our model. Basically we will concatenate responses in one string for each row (additionally we will add special 'end of string' token between responses, so the model will understand end of each response in a string).  

Minor edit of the original code: change max_len to model_max_length.

https://stackoverflow.com/questions/67089849/attributeerror-gpt2tokenizerfast-object-has-no-attribute-max-len

In [16]:
def construct_conv(row, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
    conv = flatten(conv)
    return conv

class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

In [17]:
# Cacheing and storing of data/checkpoints

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

# Model

## Initialization

In [18]:
from torch.nn.utils.rnn import pad_sequence
a = torch.ones(25, 300)
b = torch.ones(22, 300)
c = torch.ones(15, 300)
pad_sequence([b, c]).size()

torch.Size([22, 2, 300])

## Training

In [19]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)


    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to gobal_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

# Evaluation of some model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [20]:
# Main runner

def main(df_trn, df_val):
    args = Args()
    
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)
    
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

In [21]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7411 sha256=401a8400ede96baa42998639a41766882018337731f0111c05521ae1fb1a90d5
  Stored in directory: /root/.cache/pip/wheels/6e/f8/83/534c52482d6da64622ddbf72cd93c35d2ef2881b78fd08ff0c
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 25.8 GB  |     Proc size: 1.6 GB
GPU RAM Free: 16280MB | Used: 0MB | Util   0% | Total     16280MB


In [22]:
main(trn_df, val_df)



Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

12/14/2021 17:27:13 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7f8e7fc5c8d0>
12/14/2021 17:27:13 - INFO - __main__ -   Creating features from dataset file at cached
12/14/2021 17:27:24 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
12/14/2021 17:27:25 - INFO - __main__ -   ***** Running training *****
12/14/2021 17:27:25 - INFO - __main__ -     Num examples = 7965
12/14/2021 17:27:25 - INFO - __main__ -     Num Epochs = 3
12/14/2021 17:27:25 - INFO - __main__ -     Instantaneous batch size per GPU = 1
12/14/2021 17:27:25 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 1
12/14/2021 17:27:25 - INFO - __main__ -     Gradient Accumulation steps = 1
12/14/2021 17:27:25 - INFO - __main__ -     Total optimization steps = 23895


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7965 [00:00<?, ?it/s]

12/14/2021 17:39:15 - INFO - __main__ -   Saving model checkpoint to output-medium/checkpoint-3500
12/14/2021 17:39:41 - INFO - __main__ -   Saving optimizer and scheduler states to output-medium/checkpoint-3500
12/14/2021 17:51:25 - INFO - __main__ -   Saving model checkpoint to output-medium/checkpoint-7000
12/14/2021 17:51:52 - INFO - __main__ -   Saving optimizer and scheduler states to output-medium/checkpoint-7000


Iteration:   0%|          | 0/7965 [00:00<?, ?it/s]

12/14/2021 18:03:34 - INFO - __main__ -   Saving model checkpoint to output-medium/checkpoint-10500
12/14/2021 18:04:01 - INFO - __main__ -   Saving optimizer and scheduler states to output-medium/checkpoint-10500
12/14/2021 18:15:49 - INFO - __main__ -   Saving model checkpoint to output-medium/checkpoint-14000
12/14/2021 18:16:20 - INFO - __main__ -   Saving optimizer and scheduler states to output-medium/checkpoint-14000


Iteration:   0%|          | 0/7965 [00:00<?, ?it/s]

12/14/2021 18:28:13 - INFO - __main__ -   Saving model checkpoint to output-medium/checkpoint-17500
12/14/2021 18:28:39 - INFO - __main__ -   Saving optimizer and scheduler states to output-medium/checkpoint-17500
12/14/2021 18:40:24 - INFO - __main__ -   Saving model checkpoint to output-medium/checkpoint-21000
12/14/2021 18:40:51 - INFO - __main__ -   Saving optimizer and scheduler states to output-medium/checkpoint-21000
12/14/2021 18:50:35 - INFO - __main__ -    global_step = 23895, average loss = 1.2450547602687903
12/14/2021 18:50:35 - INFO - __main__ -   Saving model checkpoint to output-medium
12/14/2021 18:50:47 - INFO - __main__ -   Evaluate the following checkpoints: ['output-medium']
12/14/2021 18:50:52 - INFO - __main__ -   Creating features from dataset file at cached
12/14/2021 18:50:54 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
12/14/2021 18:50:55 - INFO - __main__ -   ***** Running evaluation  *****
12/14/2021 18:50:55 - INFO - __m

Evaluating:   0%|          | 0/221 [00:00<?, ?it/s]

12/14/2021 18:51:43 - INFO - __main__ -   ***** Eval results  *****
12/14/2021 18:51:43 - INFO - __main__ -     perplexity = tensor(2.0860)


{'perplexity_': tensor(2.0860)}

In [None]:
main(trn_df, val_df)

12/13/2021 05:11:41 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7f9ab0c35050>
12/13/2021 05:11:41 - INFO - __main__ -   Creating features from dataset file at cached
Token indices sequence length is longer than the specified maximum sequence length for this model (1128 > 1024). Running this sequence through the model will result in indexing errors
12/13/2021 05:12:18 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
12/13/2021 05:12:19 - INFO - __main__ -   ***** Running training *****
12/13/2021 05:12:19 - INFO - __main__ -     Num examples = 17196
12/13/2021 05:12:19 - INFO - __main__ -     Num Epochs = 3
12/13/2021 05:12:19 - INFO - __main__ -     Instantaneous batch size per GPU = 2
12/13/2021 05:12:19 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 2
12/13/2021 05:12:19 - INFO - __main__ -     Gradient Accumulation steps = 1
12/13/2021 05:12:19 - INFO - __main__ -     Tota

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8598 [00:00<?, ?it/s]

12/13/2021 05:39:16 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-3500
12/13/2021 05:39:43 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-3500
12/13/2021 06:06:30 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-7000
12/13/2021 06:06:58 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-7000


Iteration:   0%|          | 0/8598 [00:00<?, ?it/s]

12/13/2021 06:33:46 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-10500
12/13/2021 06:33:54 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-10500
12/13/2021 07:00:51 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-14000
12/13/2021 07:00:56 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-14000


Iteration:   0%|          | 0/8598 [00:00<?, ?it/s]

12/13/2021 07:27:45 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-17500
12/13/2021 07:27:51 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-17500
12/13/2021 07:54:36 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-21000
12/13/2021 07:54:41 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-21000
12/13/2021 08:21:31 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-24500
12/13/2021 08:21:38 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-24500
12/13/2021 08:31:22 - INFO - __main__ -    global_step = 25764, average loss = 2.156358761334943
12/13/2021 08:31:22 - INFO - __main__ -   Saving model checkpoint to output-small
12/13/2021 08:31:28 - INFO - __main__ -   Evaluate the following checkpoints: ['output-small']
12/13/2021 08:31:31 - INFO - __main__ -   Creating features from dataset file at cached
12

Evaluating:   0%|          | 0/477 [00:00<?, ?it/s]

12/13/2021 08:34:00 - INFO - __main__ -   ***** Eval results  *****
12/13/2021 08:34:01 - INFO - __main__ -     perplexity = tensor(3.7451)


{'perplexity_': tensor(3.7451)}

# Test Chat

In [24]:
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
model = AutoModelWithLMHead.from_pretrained('output-medium')

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]



In [25]:
# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,  
        no_repeat_ngram_size=3,       
        do_sample=True, 
        top_k=100, 
        top_p=0.7,
        temperature = 0.8
    )
    
    # pretty print last ouput tokens from bot
    print("CryptoBot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:Hello
CryptoBot: Hello! How are you today?
>> User:Do you know bitcoin?
CryptoBot: Hi! Can you buy some ethereum this year? Would love that, thank  you!
>> User:No. I'm asking you if I should buy some bitcoin?
CryptoBot: Hello. How are YOU today? 😂
>> User:Is Elon Musk making dodge a scam?
CryptoBot: I’m very new to crypto. I bought a few hundred ADA last year for $700 and then it went to $700. Is it normal for cryptos to go up like that? I don’t think I’ve ever seen a dip like that. Is there something special?
>> User:What is ADA?
CryptoBot: Thank you for asking this question for those of us who are new and are also interested in the topic!


>> User:Hi <br>
CryptoBot: Haha I ain’t scared I’m keeping my crypto <br>
>> User:What is ethereum? <br>
CryptoBot: So is this like a paid vacation for them or what? <br>
>> User:What are you talking about? <br>
CryptoBot: And the cost of living is higher than the cost to produce. <br>
>> User:Which is the most promising cryptocurrency? <br>
CryptoBot: How do you figure? <br>
>> User:You tell me <br>
CryptoBot: !!!Remindme 18-09-2021 <br>

>> User:What do you think about DodgeCoin? <br>
CryptoBot: I’m not sure I understand your question. Are you saying I’ll be able to use my debit card to buy more crypto (gas fee or whatever), or that I can use my Coinbase debit card for that? <br>
>> User:What is bitcoin? <br>
CryptoBot: Yes, the original question asked if you could use it to buy a house or car. If you can use it for that then you’re fine. If not, then you have no idea what you‘re talking about. <br>
>> User:Why is bitcoin price keeping rising? <br>
CryptoBot: Why would you buy a used car with Bitcoin if you can just use it as a payment method? <br>
>> User:I don't understand <br>
CryptoBot: !!!Remindme 18-09-2021 <br>
>> User:You did not explain anything <br>
CryptoBot: !Reminder: you can actually use your Coinbase debit or SPEDN card to purchase DOGE on Coinbase. The only difference is that you need to use your Card to <br>


# Push model to HugginFace

Not working very well. I have to do it from a local cmd

In [26]:
!pwd

/content/drive/My Drive/reddit


In [27]:
!git clone https://huggingface.co/novusshu/DialoGPT-medium-crypto

Cloning into 'DialoGPT-medium-crypto'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0)[K
Unpacking objects: 100% (3/3), done.


In [33]:
os.chdir('DialoGPT-medium-crypto')

In [34]:
!pwd

/content/drive/My Drive/reddit/DialoGPT-medium-crypto


In [35]:
!cp ../output-medium/* ./ 

cp: -r not specified; omitting directory '../output-medium/checkpoint-10500'
cp: -r not specified; omitting directory '../output-medium/checkpoint-14000'
cp: -r not specified; omitting directory '../output-medium/checkpoint-17500'
cp: -r not specified; omitting directory '../output-medium/checkpoint-21000'
cp: -r not specified; omitting directory '../output-medium/checkpoint-3500'
cp: -r not specified; omitting directory '../output-medium/checkpoint-7000'


In [36]:
!sudo apt-get install git-lfs
!pip install huggingface_hub

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,622 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 155222 files and directories c

In [37]:
!huggingface-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/token.
        (Deprecated, will be removed in v0.3.0) To login with username and password instead, interrupt with Ctrl+C.
        
Token: 
Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on you

In [None]:
!git config --global credential.helper store

In [38]:
!git lfs install
!git config --global user.email "novusshu@gmail.com"
!git config --global user.name "novus shu"

Updated git hooks.
Git LFS initialized.


In [39]:
!ls -al

total 1413284
-rw------- 1 root root        905 Dec 14 19:04 config.json
-rw------- 1 root root         28 Dec 14 19:04 eval_results.txt
drwx------ 9 root root       4096 Dec 14 19:05 .git
-rw------- 1 root root       1175 Dec 14 19:01 .gitattributes
-rw------- 1 root root     456356 Dec 14 19:04 merges.txt
-rw------- 1 root root 1444581337 Dec 14 19:04 pytorch_model.bin
-rw------- 1 root root        357 Dec 14 19:04 special_tokens_map.json
-rw------- 1 root root        617 Dec 14 19:04 tokenizer_config.json
-rw------- 1 root root    1355269 Dec 14 19:04 tokenizer.json
-rw------- 1 root root       1327 Dec 14 19:04 training_args.bin
-rw------- 1 root root     798156 Dec 14 19:04 vocab.json


In [40]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mconfig.json[m
	[31meval_results.txt[m
	[31mmerges.txt[m
	[31mpytorch_model.bin[m
	[31mspecial_tokens_map.json[m
	[31mtokenizer.json[m
	[31mtokenizer_config.json[m
	[31mtraining_args.bin[m
	[31mvocab.json[m

nothing added to commit but untracked files present (use "git add" to track)


In [41]:
!git add .
!git commit -m 'first'

fatal: cannot exec '.git/hooks/post-commit': Permission denied
[main ea2119a] first
 9 files changed, 50050 insertions(+)
 create mode 100644 config.json
 create mode 100644 eval_results.txt
 create mode 100644 merges.txt
 create mode 100644 pytorch_model.bin
 create mode 100644 special_tokens_map.json
 create mode 100644 tokenizer.json
 create mode 100644 tokenizer_config.json
 create mode 100644 training_args.bin
 create mode 100644 vocab.json


In [42]:
!git push

fatal: could not read Username for 'https://huggingface.co': No such device or address


In [None]:
# Tip: using the same email as your huggingface.co account will link your commits to your profile
MY_MODEL_NAME = 'DialoGPT-small-crypto'

In [None]:
with open('HuggingFace-API-key.txt', 'rt') as f:
  HUGGINGFACE_API_KEY = f.read().strip()
model.push_to_hub(MY_MODEL_NAME, use_auth_token=HUGGINGFACE_API_KEY)
tokenizer.push_to_hub(MY_MODEL_NAME, use_auth_token=HUGGINGFACE_API_KEY)

Several commits (3) will be pushed upstream.
The progress bars may be unreliable.


KeyboardInterrupt: ignored