In [None]:
# The below code is an adaptation of the following Google Colab
# https://colab.research.google.com/drive/1ixOZTKLz4aAa-MtC6dy_sAvc9HujQmHN#scrollTo=XQ4beXIyHdYB

In [203]:
# Import libraries
import time
import ktrain
from ktrain import text
import pandas as pd
from sklearn.model_selection import train_test_split

In [141]:
# Import Data

my_dataframe=pd.read_csv('labeled2.csv')

In [142]:
# Formating the data

my_dataframe.drop(['#'], axis=1)
my_dataframe=my_dataframe.dropna(subset = ['label'])
my_dataframe["label"]=my_dataframe["label"].astype(int)
print(my_dataframe.shape)

(532, 3)
(532, 3)


In [145]:
#Splitting the data into a train and a text dataset

train, test = train_test_split(my_dataframe, test_size=0.2)
x_train = train.text.to_numpy()
y_train = train.label.to_numpy()
x_test = test.text.to_numpy()
y_test = test.label.to_numpy()

In [147]:
# Formating the data with ktrain to fit the expected input and output of a BERT model

(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
                                                                       x_test=x_test, y_test=y_test,
                                                                       class_names=["trash","autonomous","non_autonomous"],
                                                                       preprocess_mode='bert',
                                                                       maxlen=256, 
                                                                       max_features=35000)

preprocessing train...
language: en


preprocessing test...
language: en


In [148]:
# Building the model

model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(x_train, y_train), batch_size=6)

Is Multi-Label? False
maxlen is 256
done.


In [149]:
# Training the model

learner.fit_onecycle(2e-5, 4)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x2b31c83d128>

In [150]:
# Checking accuracy of results on the test dataset

learner.validate(val_data=(x_test, y_test), class_names=["trash","autonomous","non_autonomous"])

                precision    recall  f1-score   support

         trash       0.92      0.94      0.93        36
    autonomous       0.98      0.96      0.97        50
non_autonomous       0.95      0.95      0.95        21

     micro avg       0.95      0.95      0.95       107
     macro avg       0.95      0.95      0.95       107
  weighted avg       0.95      0.95      0.95       107



array([[34,  1,  1],
       [ 2, 48,  0],
       [ 1,  0, 20]], dtype=int64)

In [151]:
predictor = ktrain.get_predictor(learner.model, preproc)

In [152]:
predictor.get_classes()

['trash', 'autonomous', 'non_autonomous']

In [153]:
print(test.text.tolist()[0])

We use them to give you the best experience. If you continue using our website, we'll assume that you are happy to receive all cookies on this website. With technology for unmanned ships developing rapidly, regulatory authorities are playing catch-up. A major new report commissioned by the Danish Maritime Authority (DMA) unpicks the challenges posed by autonomy on the seas and outlines how they can be overcome. Patrick Kingsland finds out more from Erik Tvedt, special adviser at the DMA. 


In [154]:
predictor.predict(test.text.tolist()[0])

'non_autonomous'

In [155]:
# we predicted the correct label
print(test.label.tolist()[0])

2


In [158]:
# let's save the predictor for later use
predictor.save('/tmp/my_predictor')

In [159]:
# reload the predictor
reloaded_predictor = ktrain.load_predictor('/tmp/my_predictor')

In [168]:
# Now let us use the predictor to filter trash from the rest of the Dataset provided by Yuriy
my_dataframe=pd.read_csv('20191209233601.19044.gkg.Labeled.txt',delimiter="\t")

In [170]:
my_dataframe.columns

Index(['Unnamed: 0', 'Date', 'NumArticles', 'Counts', 'Themes', 'Locations',
       'Persons', 'Organizations', 'ToneData', 'CAMEOEvents', 'Sources',
       'SourceURLs', 'text', 'label'],
      dtype='object')

In [198]:
my_dataframe[my_dataframe.label==0]["text"].dropna().tolist()[8]

"The ride-hailing company announces a collaboration with North America's largest auto parts supplier to develop and manufacture self-driving cars. Lyft is developing self-driving cars at its Silicon Valley engineering facility called Level 5. Lyft is forging full-force ahead with its self-driving car technology.  On Wednesday, the ride-hailing company said it had formed a partnership with Magna, the largest auto parts supplier in North America. Under the collaboration, the companies will co-fund, jointly develop and manufacture autonomous vehicle systems. The goal is to create self-driving technology that's available to all car manufacturers. "

In [193]:
reloaded_predictor.predict(my_dataframe[my_dataframe.label==0]["text"].dropna().tolist()[9])

'autonomous'

In [214]:
len(my_dataframe[my_dataframe.label==0]["text"].dropna().tolist())

2669

In [212]:
i=0
j=0
trash=[]
starttime=time.time()
while i <100:
    if reloaded_predictor.predict(my_dataframe[my_dataframe.label==0]["text"].dropna().tolist()[j])=="trash":
        trash.append(my_dataframe[my_dataframe.label==0]["text"].dropna().tolist()[j])
        j=j+1
        i=i+1
    else:
        j=j+1
endtime=time.time()

In [211]:
trash

['_________________________________________________________\xa0  leider ist ein Fehler          aufgetreten. Der Zugriff auf die angeforderte Datei wurde verweigert. Haben Sie sich vielleicht          vertippt oder eine alte URL aufgerufen? Wenn nicht, informieren Sie bitte          den Webmaster dieser Homepage per Email. Um zu der vorherigen Seite zurückzukehren,          verwenden Sie bitte einfach die "Zurück" - Taste          Ihres Browsers.  _________________________________________________________',
 '\xa0        More from this state at: Minnesota Ag Connection',
 'To find something you’ll like, click a category above or use the search box.',
 "Let friends in your social network know what you are reading about Up for debate: cracked tail lights, tinted windows and tire pressure monitoring. A link has been sent to your friend's email address. To find out more about Facebook commenting please read the Conversation Guidelines and FAQs                                              We

In [213]:
print("Bert filter went throught: {} text and detected {} trash in {} seconds ".format(j,i,endtime-starttime))

Bert filter went throught: 274 text and detected 100 trash in 10.132163286209106 seconds 
