In [1]:
# imports modules
from src.paths import LOCAL_RAW_DATA_PATH, LOCAL_PROCESSED_DATA_PATH, LOCAL_MODELS_PATH
# `tqdm._tqdm_notebook` is a deprecated private module that warns on import;
# `tqdm.notebook` is the public location of the same `tqdm_notebook` class.
from tqdm.notebook import tqdm_notebook
from transformers import pipeline
import pandas as pd
import xgboost
import pickle

# registers the `progress_*` methods (e.g. `progress_apply`) on pandas objects
tqdm_notebook.pandas()

# loads data
# NOTE(review): unpickling can execute arbitrary code -- only read pickle files
# that come from a trusted source.
df_path = LOCAL_PROCESSED_DATA_PATH / 'pretrain_dataset_20211013.pkl'
df = pd.read_pickle(df_path)

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  This is separate from the ipykernel package so we can avoid doing imports until


In [2]:
# builds the zero-shot classification pipeline used by the cells below
# NOTE(review): no `model=` is passed, so the checkpoint is presumably whatever
# transformers resolves as the default for this task -- consider pinning it.
model = pipeline(task='zero-shot-classification')

In [6]:
# smoke-tests the classifier on a hand-written sentence
text = 'I wanna take a cab at 8 pm'
labels = [
    'Taxi',
    'Appointment',
    'Trip',
    'Game',
]
# with multi_label=True the returned scores are per label and need not sum to 1
model(text, candidate_labels=labels, multi_label=True)

{'sequence': 'I wanna take a cab at 8 pm',
 'labels': ['Taxi', 'Trip', 'Game', 'Appointment'],
 'scores': [0.8777869939804077,
  0.6453558802604675,
  0.05611748248338699,
  0.04601442068815231]}

In [7]:
# sets columns width
# Uses the fully qualified option key: the abbreviated 'max_colwidth' is resolved
# by pattern matching against all registered options and would raise if another
# option with a similar name were ever added.
pd.set_option('display.max_colwidth', 100)

In [8]:
# draws 15 texts from the `full_text` column; the fixed seed keeps the draw repeatable
sample = df['full_text'].sample(n=15, random_state=0)
sample

31069     RT @ggreenwald: Please, please avoid having dinner with your family on Thanksgiving.\n\nStay at ...
64870                                                     @TomBilyeu @MoralisWeb3 which ones do you watch? :)
66644                                                                      @DanaAela Ok.\n\nWhy are you here?
76115                                                                        @nicholasilechie Sarcasm is hard
78109                                                         Are we in bizarro world https://t.co/CyukYsaqco
44310                                                                                       @tottycoys 🙏🙏🙏🙏🙏🙏
23336                     @A_Sovereign_Man @MichaelRippe @rekodi_i Yes, that is what I am. Give me a break. 🙄
109491    "Bitcoin could have already facilitated billions of dollars worth of censorship-resistant value ...
126286                                                                       @LHeilpern @CFN_network Crush it
67837     

In [10]:
# picks one sampled text by its index label (109491) and prints it in full
txt = sample.loc[109491]
print(txt)

"Bitcoin could have already facilitated billions of dollars worth of censorship-resistant value transfer to and from Venezuela."

"Bitcoin has already played a part in changing the destiny of an entire country."

Groundbreaking research by @MattAhlborg 
https://t.co/EdGguqxCMs


In [11]:
# candidate labels passed to the classifier in the cells below
labels = ['Crypto', 'Bitcoin']

In [12]:
# scores the selected text against the current candidate labels
model(txt, candidate_labels=labels, multi_label=True)

{'sequence': '"Bitcoin could have already facilitated billions of dollars worth of censorship-resistant value transfer to and from Venezuela."\n\n"Bitcoin has already played a part in changing the destiny of an entire country."\n\nGroundbreaking research by @MattAhlborg \nhttps://t.co/EdGguqxCMs',
 'labels': ['Bitcoin', 'Crypto'],
 'scores': [0.9668909311294556, 0.35959258675575256]}

In [17]:
# classifies every text of the `full_text` column in a single pipeline call
zsc_classes = model(
    df['full_text'].tolist(),
    candidate_labels=labels,
    multi_label=True,
)

[{'sequence': 'RT @SpaceX: Crew Dragon and Falcon 9 in the hangar at Launch Complex 39A ahead of launching four astronauts to the @space_station; liftoff…',
  'labels': ['Crypto', 'Bitcoin'],
  'scores': [0.004856843501329422, 0.0013822255423292518]},
 {'sequence': '@SoleimanWes Ummmmmmmmmmmmmmmm',
  'labels': ['Crypto', 'Bitcoin'],
  'scores': [0.07035194337368011, 0.012568430975079536]},
 {'sequence': '@zooko Bitcoin maximalism has no live players.\n\nhttps://t.co/ZcMHZO1iLQ',
  'labels': ['Bitcoin', 'Crypto'],
  'scores': [0.9714718461036682, 0.7016560435295105]},
 {'sequence': 'BNB will get a breakout soon and ETH is getting closer to my target!\n\nWatch my video for my next trade: https://t.co/CpBHRuMNaW',
  'labels': ['Crypto', 'Bitcoin'],
  'scores': [0.6623146533966064, 0.000379884266294539]}]

In [18]:
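# Disabled alternative: classify one row at a time with `progress_apply` and
# store each result in a new `zsc_classes` column of `df`.
# df['zsc_classes'] = df.full_text.progress_apply(lambda x: model(x, labels, multi_label=True))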