## Load business classifier

In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, RobertaForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local directory that holds the saved tokenizer and classifier files.
model_path='./huggingface_models/'
# Both objects are loaded from that same directory and are used as
# notebook-wide globals by the inference cells further down.
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)


In [3]:
# Display the loaded tokenizer's settings (vocab size, max length, special tokens).
tokenizer

PreTrainedTokenizer(name_or_path='./huggingface_models/', vocab_size=50265, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [4]:
#model.config

### Test run on 1 sample

In [5]:
%%time
text = '''Copyright ©  All rights reserved.   |   Privacy PolicyThis Web page is parked FREE, courtesy of GoDaddy'''
text ='''This site is marked private by its owner. If you would like to view it, you’ll need two things:A WordPress.com account. Don’t have an account? All you need is an email address and password — register here!Permission from the site owner. Once you've created an account, log in and revisit this screen to request an invite.If you already have both of these, great!Log in here'''
text=''
inputs = tokenizer(text, truncation=True, 
                   return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

CPU times: total: 422 ms
Wall time: 138 ms


'false'

#### Important: since we're running inference (prediction) only, there's no need to keep track of the calculations for backprop
#### Disabling gradient tracking (`torch.no_grad()`) saves us a lot of time

In [6]:
def predict_classes(text):
    """Classify one piece of page text and return the model's label string.

    Parameters
    ----------
    text : str, None or float NaN
        Text to classify. Missing values (``None`` or NaN, which is what
        pandas yields for an empty CSV cell) are treated as the empty
        string, so a missing cell is scored as "no text" rather than being
        passed to the tokenizer as a non-string value.

    Returns
    -------
    The ``model.config.id2label`` entry for the highest-scoring class
    (this notebook compares it against ``'true'``).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    # truncation=True cuts over-long inputs down to the tokenizer's max length.
    inputs = tokenizer(text, truncation=True, return_tensors='pt')
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    return model.config.id2label[predicted_class_id]

In [15]:
# Load the staged predictions and replace every empty cell (NaN) with ''.
# Without this, an empty revisions_text reached the model as "nan" and was
# wrongly predicted as True; row 16 (df.iloc[16]) is one such case.
df = pd.read_csv('StagingDB_prediction_results.csv').fillna('')
print(df.shape)
df.head()

(5401, 5)


Unnamed: 0,domain,revisions_text,unoptimized_ml_is_business,source,language
0,startenderexoticwear.com,"startenderexoticwear.comis parked free, courte...",False,Web Scraper,en
1,jonathandefriess.com,"I used to be a drug addict.I was 21, a crimina...",False,Web Scraper,en
2,mitchell-godwin.com,This site is under construction.,False,Web Scraper,en
3,ligersecurity.com,Providing bridge between local security servic...,True,Web Scraper,en
4,amelia-dudley.com,"Amelia is a New York based actress, working in...",True,Web Scraper,en


In [9]:
# Rows whose scraped text is the empty string.
df.loc[df['revisions_text'].eq('')]

Unnamed: 0,domain,revisions_text,unoptimized_ml_is_business,source,language
16,homeappliancerepairgroup.com,,True,Web Scraper,en
88,mikesracing.com,,True,Web Scraper,en
1157,differently-normal.com,,False,Web Scraper,en
5139,nylonkink.com,,True,Web Scraper,en


In [14]:
# Inspect row 16 (homeappliancerepairgroup.com), one of the rows with empty revisions_text.
df.iloc[16]

domain                        homeappliancerepairgroup.com
revisions_text                                            
unoptimized_ml_is_business                            True
source                                         Web Scraper
language                                                en
optimized_ml_is_business                             False
Name: 16, dtype: object

In [10]:
%%time
df['optimized_ml_is_business'] = df['revisions_text'][:100].apply(lambda x: True if predict_classes(x)=='true' else False)

CPU times: total: 1min 31s
Wall time: 27.3 s


In [11]:
# Old vs. new business flag side by side for the first 50 rows.
df[['domain', 'revisions_text', 'unoptimized_ml_is_business','optimized_ml_is_business','source','language']].head(50)

Unnamed: 0,domain,revisions_text,unoptimized_ml_is_business,optimized_ml_is_business,source,language
0,startenderexoticwear.com,"startenderexoticwear.comis parked free, courte...",False,False,Web Scraper,en
1,jonathandefriess.com,"I used to be a drug addict.I was 21, a crimina...",False,False,Web Scraper,en
2,mitchell-godwin.com,This site is under construction.,False,False,Web Scraper,en
3,ligersecurity.com,Providing bridge between local security servic...,True,True,Web Scraper,en
4,amelia-dudley.com,"Amelia is a New York based actress, working in...",True,True,Web Scraper,en
5,revotech-trading.com,NameLast modifiedSizeDescription cgi-bin/ ...,True,True,Web Scraper,en
6,vietnam68-69.com,"vietnam68-69.comis parked free, courtesy of Go...",False,False,Web Scraper,en
7,caspianmodernline.com,Come back soon to see what it's like.Build you...,False,False,Web Scraper,en
8,postalplanner.com,For information about this website please cont...,True,True,Web Scraper,en
9,julesandnomads.com,"julesandnomads.comis parked free, courtesy of ...",False,False,Web Scraper,en


### Sample