## Creating Text Classifier




In [None]:
# Installing development version of Transformers 
!pip install transformers[sentencepiece]

In [None]:
# Installing huggingface_hub
!pip install huggingface_hub

In [None]:
# Log in to Hugging Face with an access token.
# The notebook later calls trainer.push_to_hub(), which uploads to the Hub.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Installing datasets
!pip install datasets

In [None]:
# Load the IMDB dataset through the datasets library.
from datasets import load_dataset

imdb = load_dataset(path="imdb")

In [None]:
# Build the tokenizer that belongs to the pretrained DistilBERT checkpoint.
from transformers import AutoTokenizer

distilbert_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(distilbert_checkpoint)

In [None]:
# Preprocessing function: tokenize text and truncate sequences that are too long for the model.
def preprocess_function(examples, text_column="text"):
    """Tokenize one batch of examples with the module-level ``tokenizer``.

    Args:
        examples: Mapping from column name to the batch's values, as passed
            by ``map(..., batched=True)``.
        text_column: Name of the column that holds the raw text. Defaults to
            ``"text"``, the column read for the IMDB reviews.

    Returns:
        The tokenizer's output for the selected texts, produced with
        ``truncation=True``.
    """
    return tokenizer(examples[text_column], truncation=True)

In [None]:
# Tokenize every split of the IMDB dataset, one batch of rows at a time.
tokenized_imdb = imdb.map(function=preprocess_function, batched=True)

In [None]:
# Collator that pads the examples of each batch when the batch is assembled.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Pretrained DistilBERT with a sequence-classification head for two labels.
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

In [None]:
# Training hyperparameters; the Trainer below receives them through `args`.
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Bundle the model, the tokenized IMDB splits, the tokenizer and the padding collator.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
)

In [None]:
# Fine-tune the model on the training split that was passed to the Trainer.
trainer.train()

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# After this step, the classifier can be tried out with example sentences on the Hugging Face Hub.

In [None]:
# Load the model back from the Hugging Face Hub (presumably the repository created by push_to_hub()).
# NOTE(review): model_loaded is not referenced by any later cell shown here; the
# predictions below are produced through `trainer`, i.e. the in-memory `model`.
model_loaded = AutoModelForSequenceClassification.from_pretrained("MelikeDulkadir/output")

## Test the classifier on the IMDB test set and examine the results

In [None]:
# Use the tokenized IMDB test split as the prediction input.
predict_dataset = tokenized_imdb["test"]

In [None]:
# Drop the gold labels so the Trainer only produces predictions. Deriving the
# result from tokenized_imdb["test"] (instead of reassigning predict_dataset
# from itself) keeps this cell safe to re-run after "label" has been removed.
predict_dataset = tokenized_imdb["test"].remove_columns("label")

In [None]:
# Run prediction on the unlabeled IMDB test split and keep only the prediction array.
imdb_prediction_output = trainer.predict(predict_dataset, metric_key_prefix="predict")
predictions = imdb_prediction_output.predictions

In [None]:
# Pick, for every row, the index of the highest score along the last axis.
imdb_predicted_labels = predictions.argmax(axis=-1)

In [None]:
# Display the predicted class indices for the IMDB test split.
imdb_predicted_labels

In [None]:
# Gold labels come from the original dataset because "label" was removed from predict_dataset.
imdb_labels = imdb["test"]["label"]

In [None]:
# Compare the IMDB gold labels with the predicted classes.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

imdb_accuracy = accuracy_score(imdb_labels, imdb_predicted_labels)
print(f"Test Accuracy : {imdb_accuracy}")
print("\nClassification Report : ")
imdb_report = classification_report(imdb_labels, imdb_predicted_labels)
print(imdb_report)
imdb_confusion = confusion_matrix(imdb_labels, imdb_predicted_labels)
print(imdb_confusion)

# Testing text classifier with coronaTweets dataset


In [None]:
# The coronaTweets dataset is hosted on the Hugging Face Hub; fetch it with load_dataset.
coronaTweets = load_dataset(path="MelikeDulkadir/coronaTweets")

In [None]:
# Display the dataset summary (splits, feature names, row counts).
coronaTweets

In [None]:
# Move the test split into pandas and keep only the rows whose Sentiment is not "Neutral".
import pandas as pd

tweets_frame = pd.DataFrame(coronaTweets["test"])
is_not_neutral = tweets_frame["Sentiment"] != "Neutral"
df = tweets_frame[is_not_neutral]
df.shape

In [None]:
# Turn the filtered DataFrame back into a Hugging Face Dataset.
# The result also carries an `__index_level_0__` column, which a later cell removes.
from datasets import Dataset

corona_tweets = Dataset.from_pandas(df=df)

In [None]:
# Keep only the tweet text for the classifier by dropping the metadata and Sentiment columns.
metadata_columns = ["UserName", "ScreenName", "Location", "TweetAt", "Sentiment"]
corona_tweets = corona_tweets.remove_columns(metadata_columns)

In [None]:
# Drop the leftover index column as well.
index_column = "__index_level_0__"
corona_tweets = corona_tweets.remove_columns(index_column)

In [None]:
# Preprocessing function: tokenize tweet text and truncate sequences that are too long for the model.
def preprocess_function(examples, text_column="OriginalTweet"):
    """Tokenize one batch of examples with the module-level ``tokenizer``.

    This definition replaces the earlier ``preprocess_function`` that read the
    IMDB ``"text"`` column; from this cell on the name refers to this version.
    Pass ``text_column`` to tokenize a differently named column.

    Args:
        examples: Mapping from column name to the batch's values, as passed
            by ``map(..., batched=True)``.
        text_column: Name of the column that holds the raw text. Defaults to
            ``"OriginalTweet"``.

    Returns:
        The tokenizer's output for the selected texts, produced with
        ``truncation=True``.
    """
    return tokenizer(examples[text_column], truncation=True)

In [None]:
# Tokenize the tweets, one batch of rows at a time.
tokenized_coronaTweets = corona_tweets.map(function=preprocess_function, batched=True)

In [None]:
# Display the tokenized dataset summary (the recorded run shows input_ids and attention_mask next to OriginalTweet).
tokenized_coronaTweets

In [None]:
# NOTE(review): tokenized_coronaTweets has no label column at this point (Sentiment was
# removed earlier), and the recorded output of this call is an empty dict `{}`.
trainer.evaluate(tokenized_coronaTweets)

## Test the classifier on the coronaTweets test set and examine the results

In [None]:
# Run prediction on the tokenized tweets and keep only the prediction array.
tweet_prediction_output = trainer.predict(tokenized_coronaTweets, metric_key_prefix="predict")
predicts = tweet_prediction_output.predictions

In [None]:
# Display the prediction array (two scores per tweet in the recorded run).
predicts

In [None]:
# Pick, for every tweet, the index of the highest score along the last axis.
predicted_labels = predicts.argmax(axis=-1)

In [None]:
# Display the predicted class index per tweet.
predicted_labels

In [None]:
# Translate the Sentiment strings of the coronaTweets test split into the classifier's
# binary labels: "Extremely Negative"/"Negative" -> 0, "Extremely Positive"/"Positive" -> 1.
# Any other value (e.g. "Neutral") is skipped, matching the rows dropped before prediction.
sentiment_to_label = {
    "Extremely Negative": 0,
    "Negative": 0,
    "Extremely Positive": 1,
    "Positive": 1,
}
# Read the column once instead of repeating the column lookup on every loop iteration.
sentiments = coronaTweets["test"]["Sentiment"]
true_vals = [sentiment_to_label[s] for s in sentiments if s in sentiment_to_label]
# The metrics below compare true_vals and predicted_labels position by position.
assert len(true_vals) == len(predicted_labels), "labels and predictions are misaligned"

In [None]:
# Compare the mapped tweet labels with the predicted classes.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

tweet_accuracy = accuracy_score(true_vals, predicted_labels)
print(f"Test Accuracy : {tweet_accuracy}")
print("\nClassification Report : ")
tweet_report = classification_report(true_vals, predicted_labels)
print(tweet_report)
tweet_confusion = confusion_matrix(true_vals, predicted_labels)
print(tweet_confusion)

# Testing text classifier with coronaTweets dataset


In [146]:
# The coronaTweets dataset is hosted on the Hugging Face Hub; fetch it with load_dataset.
coronaTweets = load_dataset(path="MelikeDulkadir/coronaTweets")



  0%|          | 0/1 [00:00<?, ?it/s]

In [147]:
# Display the dataset summary (splits, feature names, row counts).
coronaTweets

DatasetDict({
    test: Dataset({
        features: ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment'],
        num_rows: 3798
    })
})

In [148]:
# Move the test split into pandas and keep only the rows whose Sentiment is not "Neutral".
import pandas as pd

tweets_frame = pd.DataFrame(coronaTweets["test"])
is_not_neutral = tweets_frame["Sentiment"] != "Neutral"
df = tweets_frame[is_not_neutral]
df.shape

(3179, 6)

In [149]:
# Turn the filtered DataFrame back into a Hugging Face Dataset.
# The result also carries an `__index_level_0__` column, which a later cell removes.
from datasets import Dataset

corona_tweets = Dataset.from_pandas(df=df)

In [150]:
# Keep only the tweet text for the classifier by dropping the metadata and Sentiment columns.
metadata_columns = ["UserName", "ScreenName", "Location", "TweetAt", "Sentiment"]
corona_tweets = corona_tweets.remove_columns(metadata_columns)

In [162]:
# Drop the leftover index column as well.
index_column = "__index_level_0__"
corona_tweets = corona_tweets.remove_columns(index_column)

In [163]:
# Preprocessing function: tokenize tweet text and truncate sequences that are too long for the model.
def preprocess_function(examples, text_column="OriginalTweet"):
    """Tokenize one batch of examples with the module-level ``tokenizer``.

    This definition replaces the earlier ``preprocess_function`` that read the
    IMDB ``"text"`` column; from this cell on the name refers to this version.
    Pass ``text_column`` to tokenize a differently named column.

    Args:
        examples: Mapping from column name to the batch's values, as passed
            by ``map(..., batched=True)``.
        text_column: Name of the column that holds the raw text. Defaults to
            ``"OriginalTweet"``.

    Returns:
        The tokenizer's output for the selected texts, produced with
        ``truncation=True``.
    """
    return tokenizer(examples[text_column], truncation=True)

In [164]:
# Tokenize the tweets, one batch of rows at a time.
tokenized_coronaTweets = corona_tweets.map(function=preprocess_function, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [165]:
# Display the tokenized dataset summary (input_ids and attention_mask next to OriginalTweet).
tokenized_coronaTweets

Dataset({
    features: ['OriginalTweet', 'input_ids', 'attention_mask'],
    num_rows: 3179
})

In [167]:
# NOTE(review): tokenized_coronaTweets has no label column at this point (Sentiment was
# removed earlier), and the recorded output of this call is an empty dict `{}`.
trainer.evaluate(tokenized_coronaTweets)

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: OriginalTweet. If OriginalTweet are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3179
  Batch size = 16


Step,Training Loss,Validation Loss
43,No log,No log
43,No log,No log
43,No log,No log
43,No log,No log
43,No log,No log


{}

## Test the classifier on the coronaTweets test set and examine the results

In [169]:
# Run prediction on the tokenized tweets and keep only the prediction array.
tweet_prediction_output = trainer.predict(tokenized_coronaTweets, metric_key_prefix="predict")
predicts = tweet_prediction_output.predictions

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: OriginalTweet. If OriginalTweet are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3179
  Batch size = 16


Step,Training Loss,Validation Loss
43,No log,No log
43,No log,No log
43,No log,No log
43,No log,No log
43,No log,No log


In [170]:
# Display the prediction array (two scores per tweet).
predicts

array([[ 0.13415053, -0.00633375],
       [ 0.25044957, -0.13941324],
       [-0.01944779,  0.19238164],
       ...,
       [ 0.13651761,  0.01015171],
       [ 0.3278059 , -0.28571594],
       [-0.1857457 ,  0.29239544]], dtype=float32)

In [171]:
# Pick, for every tweet, the index of the highest score along the last axis.
predicted_labels = predicts.argmax(axis=-1)

In [172]:
# Display the predicted class index per tweet.
predicted_labels

array([0, 0, 1, ..., 0, 0, 1])

In [173]:
# Translate the Sentiment strings of the coronaTweets test split into the classifier's
# binary labels: "Extremely Negative"/"Negative" -> 0, "Extremely Positive"/"Positive" -> 1.
# Any other value (e.g. "Neutral") is skipped, matching the rows dropped before prediction.
sentiment_to_label = {
    "Extremely Negative": 0,
    "Negative": 0,
    "Extremely Positive": 1,
    "Positive": 1,
}
# Read the column once instead of repeating the column lookup on every loop iteration.
sentiments = coronaTweets["test"]["Sentiment"]
true_vals = [sentiment_to_label[s] for s in sentiments if s in sentiment_to_label]
# The metrics below compare true_vals and predicted_labels position by position.
assert len(true_vals) == len(predicted_labels), "labels and predictions are misaligned"

In [174]:
# Compare the mapped tweet labels with the predicted classes.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

tweet_accuracy = accuracy_score(true_vals, predicted_labels)
print(f"Test Accuracy : {tweet_accuracy}")
print("\nClassification Report : ")
tweet_report = classification_report(true_vals, predicted_labels)
print(tweet_report)
tweet_confusion = confusion_matrix(true_vals, predicted_labels)
print(tweet_confusion)

Test Accuracy : 0.6391947153192828

Classification Report : 
              precision    recall  f1-score   support

           0       0.60      0.89      0.72      1633
           1       0.76      0.38      0.50      1546

    accuracy                           0.64      3179
   macro avg       0.68      0.63      0.61      3179
weighted avg       0.68      0.64      0.61      3179

[[1449  184]
 [ 963  583]]
