In [1]:
# Use GPU option

# In Colab, go to Edit/Notebook Settings and choose the 'GPU' option before running this script

In [2]:
# Load data

# there are several ways to load data into Colab

# 1. Host your data to GitHub (up to 25MB) and use the url to the GitHub page
# e.g. df = pd.read_csv('https://raw.githubusercontent.com/junwang4/causal-language-use-in-science/master/data/pubmed_causal_language_use.csv') 

# 2. Host your data in your Google drive and then mount to your Google drive. You will be given an authorization code to finish the process
# e.g. the following code
# from google.colab import drive
# drive.mount('/drive')
# df = pd.read_csv('/drive/My Drive/train.tsv', sep='\t')

# 3. Upload your data to Colab Files. The uploaded file will be deleted when the session is disconnected. You will need to upload again after re-connect.
# e.g.   df = pd.read_csv('train.tsv', sep='\t')
# In this script we use method # 3

# We will use the Kaggle sentiment classification data
# https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews

In [3]:
import pandas as pd
from sklearn.metrics import f1_score

In [4]:
df = pd.read_csv('for_bert_eq_data_hur.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,tokenized_text,Subjectivity,polarity,nb_pronouns,nb_numerals,nb_exclm,nb_ques,count_modal_verbs,count_wh_words,nb_slang,label
0,0,Prayers for Cabo: Hurricane Odile Roars Throug...,"['Prayers', 'Cabo', ':', 'Hurricane', 'Odile',...",0.0,0.0,0,0,0,0,0,0,0,sympathy_and_emotional_support
1,1,Sammy Hagar's Home Damaged in Hurricane: Sammy...,"['Sammy', 'Hagar', ""'s"", 'Home', 'Damaged', 'H...",0.0,0.0,0,0,0,0,0,0,0,infrastructure_and_utilities_damage
2,2,Residents Tourists Sent to Shelters as Hurrica...,"['Residents', 'Tourists', 'Sent', 'Shelters', ...",0.0,0.0,0,0,0,0,0,0,0,displaced_people_and_evacuations
3,3,Our thoughts go out to all of our friends in L...,"['Our', 'thoughts', 'go', 'friends', 'Los', 'C...",0.5,0.5,2,0,0,0,0,0,0,sympathy_and_emotional_support
4,4,Does anyone have information on emergency serv...,"['Does', 'anyone', 'information', 'emergency',...",0.0,0.0,0,0,0,1,0,0,0,other_useful_information


In [6]:
# prepare train and test data
# since fine tuning a BERT model still requires a significant amount of time, 
# only 1000 training examples and 1000 test examples will be used for demo purpose.
# prior experiment shows LinearSVC's best accuracy (3-fold CV) is about 62-65% depending on vectorization options
# BERT should be able to outperform LinearSVC with far fewer training examples.

dff = df.sample(frac=1)
#train_size = int(0.01 * len(dff))
#df_train = dff[:train_size]
#df_test = dff[train_size:]
df_train = dff[:1000]
df_test = dff[-1000:]
print(df_train.shape)
print(df_test.shape)
X_train, y_train = df_train['tweet_text'].values, df_train['label'].values
X_test, y_test = df_test['tweet_text'].values, df_test['label'].values

(1000, 13)
(1000, 13)


In [7]:
# check the category distribution in the train and test set

import numpy as np
unique, counts = np.unique(y_train, return_counts=True)
print(np.asarray((unique, counts)))
unique, counts = np.unique(y_test, return_counts=True)
print(np.asarray((unique, counts)))


[['caution_and_advice' 'displaced_people_and_evacuations'
  'donation_needs_or_offers_or_volunteering_services'
  'infrastructure_and_utilities_damage' 'injured_or_dead_people'
  'missing_trapped_or_found_people' 'not_related_or_irrelevant'
  'other_useful_information' 'sympathy_and_emotional_support']
 [93 56 118 148 37 10 139 299 100]]
[['caution_and_advice' 'displaced_people_and_evacuations'
  'donation_needs_or_offers_or_volunteering_services'
  'infrastructure_and_utilities_damage' 'injured_or_dead_people'
  'missing_trapped_or_found_people' 'not_related_or_irrelevant'
  'other_useful_information' 'sympathy_and_emotional_support']
 [84 52 129 136 41 10 165 283 100]]


In [8]:
# build a LinearSVC model as a baseline comparison to the BERT model
# since LinearSVC is a linear model, we can print out its top features in each category to see whether the model learned something meaningful
# in this example, the top 10 features for the "very negative" category (category 0) is printed out
# you can see that some top features are not so negative
# with only 1000 training examples, LinearSVC achieved 0.529 accuracy on the 1000 test examples.

from sklearn.feature_extraction.text import CountVectorizer
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=2, stop_words='english')
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(C=1, max_iter=2000)
svm_clf.fit(X_train_vec,y_train)
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_count_vectorizer.get_feature_names()))
top_10 = feature_ranks[-10:]
print("Top 10 words")
for i in range(0, len(top_10)):
    print(top_10[i])
print()

Top 10 words
(0.8845569252695241, 'forecast')
(0.8905853746801515, 'advisory')
(0.891874144974017, 'trying')
(0.9554990382191825, 'km')
(0.9855673235059087, 'herald')
(1.0036241581222614, 'monsoon')
(1.0607096526274558, 'alert')
(1.1766360905624793, 'issued')
(1.2587788979086678, 'extreme')



In [9]:
# to save some time, we are using a hold-out test to compare the LinearSVC and BERT models
# you can also try cross validation
# LinearSVC test accuracy
X_test_vec = unigram_count_vectorizer.transform(X_test)
y_pred = svm_clf.predict(X_test_vec)
svm_clf.score(X_test_vec,y_test)

0.573

In [10]:
# you can also try cross validation on the train set
# the result should not be far from the hold-out test
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
svm_pipe = Pipeline([('vect', unigram_count_vectorizer),('svm',LinearSVC(dual=True, max_iter=2000))])
scores = cross_val_score(svm_pipe,X_train,y_train,cv=3)
print(sum(scores)/len(scores))

0.5640280999562437


In [11]:
# print confusion matrix and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

[[ 29   2   3   2   0   0   7  39   2]
 [  1  29   6   4   0   0   2  10   0]
 [  0   6  80   8   0   1  15  13   6]
 [  1   1   6  89   1   0   5  32   1]
 [  0   3   2   4  19   0   1  12   0]
 [  0   0   1   1   0   0   1   3   4]
 [  3   1   4   5   0   0 107  44   1]
 [ 26   3  12  34   1   1  40 155  11]
 [  2   0   5   5   0   0  13  10  65]]

                                                   precision    recall  f1-score   support

                               caution_and_advice       0.47      0.35      0.40        84
                 displaced_people_and_evacuations       0.64      0.56      0.60        52
donation_needs_or_offers_or_volunteering_services       0.67      0.62      0.65       129
              infrastructure_and_utilities_damage       0.59      0.65      0.62       136
                           injured_or_dead_people       0.90      0.46      0.61        41
                  missing_trapped_or_found_people       0.00      0.00      0.00        10
         

In [12]:
# LinearSVC error analysis
# print out errors to check if any common patterns for further model improvement

# print out very positive examples that were predicted as negative
# you can print out different types of errors 
err_cnt = 0
for i in range(0, len(y_test)):
  if (y_test[i]==4 and y_pred[i]==1):
    print(X_test[i])
    err_cnt = err_cnt+1
print("errors:", err_cnt)

errors: 0


In [13]:
# install BERT sklearn wrapper written by charles9n
# check out the github page for fine tuning options and usage
# https://github.com/charles9n/bert-sklearn

!git clone -b master https://github.com/charles9n/bert-sklearn
!cd bert-sklearn; pip install .

Cloning into 'bert-sklearn'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 259 (delta 3), reused 3 (delta 0), pack-reused 247[K
Receiving objects: 100% (259/259), 519.36 KiB | 19.97 MiB/s, done.
Resolving deltas: 100% (125/125), done.
Processing /content/bert-sklearn
Collecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/59/7f/4ade91fbb684c6f28a6e56028d9f9d2de4297761850d083579779f07c0de/boto3-1.16.25-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 25.0MB/s 
Collecting botocore<1.20.0,>=1.19.25
[?25l  Downloading https://files.pythonhosted.org/packages/ef/d5/c0c33ca15e31062220ac5964f3492409eaf90a5cf5399503cd8264f2f8e9/botocore-1.19.25-py2.py3-none-any.whl (6.9MB)
[K     |████████████████████████████████| 6.9MB 15.9MB/s 
[?25hCollecting s3transfer<0.4.0,>=0.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/6

In [14]:
# fine tune a BERT base uncased model
# since this wrapper has included vectorization using word embedding, no need to vectorize like in LinearSVC
# first the pre-trained BERT model will be loaded in
# then the training starts. 90% examples will be used as training examples and the other 10% as validation (parameter tuning)
# default setting is 3 epoch. Each epoch takes in some training data
from bert_sklearn import BertClassifier
model = BertClassifier()         # text/text pair classification
print(model)
model.fit(X_train, y_train)

100%|██████████| 231508/231508 [00:00<00:00, 16485542.36B/s]

Building sklearn text classifier...
BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
               bert_vocab=None, do_lower_case=None, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=None, learning_rate=2e-05,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers=0,
               random_state=42, restore_file=None, train_batch_size=32,
               use_cuda=True, validation_fraction=0.1, warmup_proportion=0.1)
Loading bert-base-uncased model...



100%|██████████| 440473133/440473133 [00:05<00:00, 84353269.16B/s]
100%|██████████| 433/433 [00:00<00:00, 91465.23B/s]


Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 900, validation data size: 100



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=29.0, style=ProgressStyle(description_wi…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  next_m.mul_(beta1).add_(1 - beta1, grad)





HBox(children=(FloatProgress(value=0.0, description='Validating', max=13.0, style=ProgressStyle(description_wi…



Epoch 1, Train loss: 1.8493, Val loss: 1.5482, Val accy: 42.00%



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=29.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=13.0, style=ProgressStyle(description_wi…



Epoch 2, Train loss: 1.2381, Val loss: 1.1376, Val accy: 62.00%



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=29.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=13.0, style=ProgressStyle(description_wi…



Epoch 3, Train loss: 0.7940, Val loss: 1.0419, Val accy: 67.00%



BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
               bert_vocab=None, do_lower_case=True, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None,
               label_list=array(['caution_and_advice', 'displaced_people_and_evacuations',
       'donation_needs_or_offers_or_volunteering_services',
       'infrastructure_and_u...
       'missing_trapped_or_found_people', 'not_related_or_irrelevant',
       'other_useful_information', 'sympathy_and_emotional_support'],
      dtype=object),
               learning_rate=2e-05, local_rank=-1, logfile='bert_sklearn.log',
               loss_scale=0, max_seq_length=128, num_mlp_hiddens=500,
               num_mlp_layers=0, random_state=42, restore_file=None,
               train_batch_size=32, use_cuda=True, validation_fraction=0.1,
               warmup_proportion=0.1)

In [15]:
model.save('bert-sentiment.model')

In [16]:
model.score(X_test, y_test)

HBox(children=(FloatProgress(value=0.0, description='Testing', max=125.0, style=ProgressStyle(description_widt…



Loss: 0.9354, Accuracy: 70.00%


70.0

In [18]:
y_pred = model.predict(X_test)
f1_score(y_pred, y_test, average='weighted')

HBox(children=(FloatProgress(value=0.0, description='Predicting', max=125.0, style=ProgressStyle(description_w…




0.7161055599102547

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
# BERT error analysis
err_cnt = 0
for i in range(0, len(y_test)):
  if (y_test[i]==4 and y_pred[i]==1):
    print(X_test[i])
    err_cnt = err_cnt+1
print("errors:", err_cnt)

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model,X_train,y_train,cv=3)
print(sum(scores)/len(scores))