In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import tensorflow as tf
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification

In [4]:
# Read the competition's training CSV into a DataFrame and display it.
train_csv_path = '../input/nlp-getting-started/train.csv'
df = pd.read_csv(train_csv_path)
df

In [5]:
# Reduce the frame to the two columns used for training, with the target
# column renamed to 'label' and placed first.
#
# Renaming first and selecting once keeps this cell safe to re-run: `rename`
# ignores a missing 'target' column by default, so a second execution is a
# no-op instead of raising KeyError on df[['text', 'target']].
df = df.rename(columns={'target': 'label'})[['label', 'text']]
df

In [6]:
# Pull the texts and their labels out of the frame as plain Python lists.
X = df['text'].tolist()
y = df['label'].tolist()

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the examples for validation; the fixed random_state makes
# the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [8]:
# Load the pretrained fast tokenizer for the uncased DistilBERT checkpoint.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_checkpoint)

In [9]:
# Tokenise both splits with identical settings: truncation and padding are
# both enabled for each call.
tokenize_options = {"truncation": True, "padding": True}

train_encodings = tokenizer(X_train, **tokenize_options)
val_encodings = tokenizer(X_val, **tokenize_options)

In [10]:
# Pair each split's encodings (as a plain dict of features) with its labels
# and wrap the pair in a tf.data.Dataset.
train_dataset, val_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    for encodings, labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
    )
)

In [11]:
# Load pretrained DistilBERT with a sequence-classification head sized for
# the two target classes.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

In [12]:
# Adam optimizer used for fine-tuning.
learning_rate = 5e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [13]:
# Compile with an explicit Keras loss rather than `model.compute_loss`.
# That attribute is version-dependent: later transformers releases renamed the
# library loss helper, and on newer Keras the name resolves to
# `Model.compute_loss`, whose signature is not (y_true, y_pred).
# `from_logits=True` because the classification head's outputs are raw logits
# (softmax is only applied at prediction time in this notebook).
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [14]:
# Fine-tune for two epochs.
# - No `batch_size` argument: the datasets are already batched, and Keras
#   documents that `batch_size` must not be specified for tf.data.Dataset input.
# - The shuffle buffer spans the whole training set so every epoch sees a full
#   reshuffle rather than a local 100-element shuffle.
# - Validation data is not shuffled; ordering does not affect the metrics.
model.fit(
    train_dataset.shuffle(len(y_train)).batch(16),
    epochs=2,
    validation_data=val_dataset.batch(16),
)

In [15]:
# Persist the fine-tuned model to a directory in the working folder.
saved_model_dir = "MSF_DistilBERT_CustomModel"
model.save_pretrained(saved_model_dir)

In [16]:
# Reload the model from the directory it was saved to.
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
    "MSF_DistilBERT_CustomModel"
)

In [17]:
# Hand-written example used as a quick check of the reloaded model.
test_sentence = 'horrible earthquake, people are dying!'

In [18]:
# Encode the example sentence as TensorFlow tensors for the model.
encode_options = {
    "truncation": True,
    "padding": True,
    "return_tensors": "tf",
}
predict_input = tokenizer.encode(test_sentence, **encode_options)

In [19]:
# Run the reloaded model on the encoded sentence and keep the first element
# of what predict() returns.
prediction_output = loaded_model.predict(predict_input)
tf_output = prediction_output[0]

In [20]:
# Normalise the model output with a softmax along axis 1, then take the first
# row of the result as a NumPy array and display it.
softmaxed_output = tf.nn.softmax(tf_output, axis=1)
tf_prediction = softmaxed_output.numpy()[0]
tf_prediction

In [21]:
# Load the competition test set (this rebinds `df`, which previously held the
# training data) and collect its texts into a list.
test_csv_path = '../input/nlp-getting-started/test.csv'
df = pd.read_csv(test_csv_path)
data = df['text'].tolist()

In [22]:
# Classify every test text and collect the predicted class index in `results`.
#
# The texts are tokenised in one call and scored in batches rather than
# calling predict() once per row, which pays Keras' per-call overhead for
# every row of the test set. Padding is enabled so rows in a batch share one
# length; the tokenizer's attention mask is passed along with the input ids so
# padded positions are ignored by the model.
test_encodings = tokenizer(data, truncation=True, padding=True)
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(16)

test_logits = loaded_model.predict(test_dataset).logits

# argmax over the class axis; a softmax first would not change which class is
# largest, so it is skipped.
results = np.argmax(test_logits, axis=-1).tolist()

In [25]:
# Build the submission table: one row per test id with its predicted target.
sub = pd.DataFrame({"id": df['id'].tolist(), "target": results})

In [26]:
# Write the submission file without the DataFrame index column.
submission_path = "submission.csv"
sub.to_csv(submission_path, index=False)