In [6]:
# Use GPU option

# In Colab, go to Edit/Notebook Settings and choose the 'GPU' option before running this script

In [7]:
# Load data

# there are several ways to load data into Colab

# 1. Host your data to GitHub (up to 25MB) and use the url to the GitHub page
# e.g. df = pd.read_csv('https://raw.githubusercontent.com/junwang4/causal-language-use-in-science/master/data/pubmed_causal_language_use.csv') 

# 2. Host your data in your Google drive and then mount to your Google drive. You will be given an authorization code to finish the process
# e.g. the following code
# from google.colab import drive
# drive.mount('/drive')
# df = pd.read_csv('/drive/My Drive/train.tsv', sep='\t')

# 3. Upload your data to Colab Files. The uploaded file will be deleted when the session is disconnected. You will need to upload again after re-connect.
# e.g.   df = pd.read_csv('train.tsv', sep='\t')
# In this script we use method # 3

# We will use the Kaggle sentiment classification data
# https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews

In [15]:
import pandas as pd
from sklearn.metrics import f1_score

In [9]:
df = pd.read_csv('for_bert_eq_data.csv')

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,tokenized_text,Subjectivity,polarity,nb_pronouns,nb_numerals,nb_exclm,nb_ques,count_modal_verbs,count_wh_words,nb_slang,label
0,0,RT @divyaconnects: Reached #Kathmandu finally!...,"['RT', '@', 'divyaconnects', ':', 'Reached', '...",1.0,0.0,0,0,1,0,0,0,0,other_useful_information
1,1,fears for Foreigners missing in Nepal earthqua...,"['fears', 'Foreigners', 'missing', 'Nepal', 'e...",0.05,-0.2,0,0,0,0,0,0,0,missing_trapped_or_found_people
2,2,RT @ParisBurned: 3700 people dead is absolutel...,"['RT', '@', 'ParisBurned', ':', '3700', 'peopl...",0.7,-0.6,0,1,0,0,0,0,0,injured_or_dead_people
3,3,Earthquake in Nepal - Please help Kapil #crowd...,"['Earthquake', 'Nepal', '-', 'Please', 'help',...",0.0,0.0,0,0,0,0,0,0,0,donation_needs_or_offers_or_volunteering_services
4,4,Nepal’s Slowing Economy Set for Freefall Witho...,"['Nepal', '’', 'Slowing', 'Economy', 'Set', 'F...",0.0,0.0,0,0,0,0,0,0,0,other_useful_information


In [11]:
# prepare train and test data
# since fine tuning a BERT model still requires a significant amount of time, 
# only 1000 training examples and 1000 test examples will be used for demo purpose.
# prior experiment shows LinearSVC's best accuracy (3-fold CV) is about 62-65% depending on vectorization options
# BERT should be able to outperform LinearSVC with far fewer training examples.

dff = df.sample(frac=1)
#train_size = int(0.01 * len(dff))
#df_train = dff[:train_size]
#df_test = dff[train_size:]
df_train = dff[:1000]
df_test = dff[-1000:]
print(df_train.shape)
print(df_test.shape)
X_train, y_train = df_train['tweet_text'].values, df_train['label'].values
X_test, y_test = df_test['tweet_text'].values, df_test['label'].values

(1000, 13)
(1000, 13)


In [12]:
# check the category distribution in the train and test set

import numpy as np
unique, counts = np.unique(y_train, return_counts=True)
print(np.asarray((unique, counts)))
unique, counts = np.unique(y_test, return_counts=True)
print(np.asarray((unique, counts)))


[['caution_and_advice' 'displaced_people_and_evacuations'
  'donation_needs_or_offers_or_volunteering_services'
  'infrastructure_and_utilities_damage' 'injured_or_dead_people'
  'missing_trapped_or_found_people' 'not_related_or_irrelevant'
  'other_useful_information' 'sympathy_and_emotional_support']
 [44 20 136 70 95 30 93 329 183]]
[['caution_and_advice' 'displaced_people_and_evacuations'
  'donation_needs_or_offers_or_volunteering_services'
  'infrastructure_and_utilities_damage' 'injured_or_dead_people'
  'missing_trapped_or_found_people' 'not_related_or_irrelevant'
  'other_useful_information' 'sympathy_and_emotional_support']
 [50 26 170 66 104 34 85 299 166]]


In [13]:
# build a LinearSVC model as a baseline comparison to the BERT model
# since LinearSVC is a linear model, we can print out its top features in each category to see whether the model learned something meaningful
# in this example, the top 10 features for the "very negative" category (category 0) is printed out
# you can see that some top features are not so negative
# with only 1000 training examples, LinearSVC achieved 0.529 accuracy on the 1000 test examples.

from sklearn.feature_extraction.text import CountVectorizer
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=2, stop_words='english')
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(C=1, max_iter=2000)
svm_clf.fit(X_train_vec,y_train)
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_count_vectorizer.get_feature_names()))
top_10 = feature_ranks[-10:]
print("Top 10 words")
for i in range(0, len(top_10)):
    print(top_10[i])
print()

Top 10 words
(0.5711115590112333, 'update')
(0.6067540382533995, 'aftershocks')
(0.6151300882004488, 'latest')
(0.6190322234571626, 'year')
(0.6399568615414959, 'canyon')
(0.6399568615414959, 'diablo')
(0.6630180985349352, 'peru')
(0.7543568109292037, 'alert')
(1.061546667662318, 'measuring')



In [17]:
# to save some time, we are using a hold-out test to compare the LinearSVC and BERT models
# you can also try cross validation
# LinearSVC test accuracy
X_test_vec = unigram_count_vectorizer.transform(X_test)
y_pred = svm_clf.predict(X_test_vec)
print(svm_clf.score(X_test_vec,y_test))
f1_score(y_test, y_pred, average='weighted')

0.631


0.6234671633306338

In [14]:
# you can also try cross validation on the train set
# the result should not be far from the hold-out test
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
svm_pipe = Pipeline([('vect', unigram_count_vectorizer),('svm',LinearSVC(dual=True, max_iter=2000))])
scores = cross_val_score(svm_pipe,X_train,y_train,cv=3)
print(sum(scores)/len(scores))

0.6040171908435381


In [17]:
# print confusion matrix and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

[[ 19   2   3   1   1   0   1  30   0]
 [  5   3   4   0   1   1   5   9   1]
 [  0   0  93   0   1   0   6  27   9]
 [  0   0   1  35   3   0   2  26   3]
 [  2   0   0   5  74   0   1  12   4]
 [  0   0   4   0   0   4   2   9   4]
 [  1   0  10   3   1   0  51  22  10]
 [ 19   0  21  11  11   1  25 226  15]
 [  2   0   9   0   2   1  17  13 116]]

                                                   precision    recall  f1-score   support

                               caution_and_advice       0.40      0.33      0.36        57
                 displaced_people_and_evacuations       0.60      0.10      0.18        29
donation_needs_or_offers_or_volunteering_services       0.64      0.68      0.66       136
              infrastructure_and_utilities_damage       0.64      0.50      0.56        70
                           injured_or_dead_people       0.79      0.76      0.77        98
                  missing_trapped_or_found_people       0.57      0.17      0.27        23
         

In [18]:
# LinearSVC error analysis
# print out errors to check if any common patterns for further model improvement

# print out very positive examples that were predicted as negative
# you can print out different types of errors 
err_cnt = 0
for i in range(0, len(y_test)):
  if (y_test[i]==4 and y_pred[i]==1):
    print(X_test[i])
    err_cnt = err_cnt+1
print("errors:", err_cnt)

errors: 0


In [19]:
# install BERT sklearn wrapper written by charles9n
# check out the github page for fine tuning options and usage
# https://github.com/charles9n/bert-sklearn

!git clone -b master https://github.com/charles9n/bert-sklearn
!cd bert-sklearn; pip install .

Cloning into 'bert-sklearn'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 259 (delta 3), reused 3 (delta 0), pack-reused 247[K
Receiving objects: 100% (259/259), 519.36 KiB | 14.43 MiB/s, done.
Resolving deltas: 100% (125/125), done.
Processing /content/bert-sklearn
Collecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/59/7f/4ade91fbb684c6f28a6e56028d9f9d2de4297761850d083579779f07c0de/boto3-1.16.25-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 7.9MB/s 
Collecting jmespath<1.0.0,>=0.7.1
  Downloading https://files.pythonhosted.org/packages/07/cb/5f001272b6faeb23c1c9e0acc04d48eaaf5c862c17709d20e3469c6e0139/jmespath-0.10.0-py2.py3-none-any.whl
Collecting s3transfer<0.4.0,>=0.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/69/79/e6afb3d8b0b4e96cefbdc690f741d7dd24547ff1f94240c997a26fa908d3/s3transfer-0.3.3-p

In [20]:
# fine tune a BERT base uncased model
# since this wrapper has included vectorization using word embedding, no need to vectorize like in LinearSVC
# first the pre-trained BERT model will be loaded in
# then the training starts. 90% examples will be used as training examples and the other 10% as validation (parameter tuning)
# default setting is 3 epoch. Each epoch takes in some training data
from bert_sklearn import BertClassifier
model = BertClassifier()         # text/text pair classification
print(model)
model.fit(X_train, y_train)

Building sklearn text classifier...
BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
               bert_vocab=None, do_lower_case=None, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=None, learning_rate=2e-05,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers=0,
               random_state=42, restore_file=None, train_batch_size=32,
               use_cuda=True, validation_fraction=0.1, warmup_proportion=0.1)


100%|██████████| 231508/231508 [00:00<00:00, 878279.98B/s]


Loading bert-base-uncased model...


100%|██████████| 440473133/440473133 [00:14<00:00, 31013196.43B/s]
100%|██████████| 433/433 [00:00<00:00, 163541.97B/s]


Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 900, validation data size: 100



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=29.0, style=ProgressStyle(description_wi…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  next_m.mul_(beta1).add_(1 - beta1, grad)





HBox(children=(FloatProgress(value=0.0, description='Validating', max=13.0, style=ProgressStyle(description_wi…



Epoch 1, Train loss: 1.8158, Val loss: 1.3975, Val accy: 53.00%



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=29.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=13.0, style=ProgressStyle(description_wi…



Epoch 2, Train loss: 1.0573, Val loss: 1.2074, Val accy: 59.00%



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=29.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=13.0, style=ProgressStyle(description_wi…



Epoch 3, Train loss: 0.7325, Val loss: 1.0835, Val accy: 69.00%



BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
               bert_vocab=None, do_lower_case=True, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None,
               label_list=array(['caution_and_advice', 'displaced_people_and_evacuations',
       'donation_needs_or_offers_or_volunteering_services',
       'infrastructure_and_u...
       'missing_trapped_or_found_people', 'not_related_or_irrelevant',
       'other_useful_information', 'sympathy_and_emotional_support'],
      dtype=object),
               learning_rate=2e-05, local_rank=-1, logfile='bert_sklearn.log',
               loss_scale=0, max_seq_length=128, num_mlp_hiddens=500,
               num_mlp_layers=0, random_state=42, restore_file=None,
               train_batch_size=32, use_cuda=True, validation_fraction=0.1,
               warmup_proportion=0.1)

In [21]:
model.save('bert-sentiment.model')

In [22]:
model.score(X_test, y_test)

HBox(children=(FloatProgress(value=0.0, description='Testing', max=125.0, style=ProgressStyle(description_widt…



Loss: 0.9879, Accuracy: 69.20%


69.19999999999999

In [23]:
y_pred = model.predict(X_test)
f1_score(y_pred, y_test)

HBox(children=(FloatProgress(value=0.0, description='Predicting', max=125.0, style=ProgressStyle(description_w…




In [25]:
from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred)
print(cm)
print()

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

[[ 19   2   3   1   1   0   1  30   0]
 [  5   3   4   0   1   1   5   9   1]
 [  0   0  93   0   1   0   6  27   9]
 [  0   0   1  35   3   0   2  26   3]
 [  2   0   0   5  74   0   1  12   4]
 [  0   0   4   0   0   4   2   9   4]
 [  1   0  10   3   1   0  51  22  10]
 [ 19   0  21  11  11   1  25 226  15]
 [  2   0   9   0   2   1  17  13 116]]

                                                   precision    recall  f1-score   support

                               caution_and_advice       0.40      0.33      0.36        57
                 displaced_people_and_evacuations       0.60      0.10      0.18        29
donation_needs_or_offers_or_volunteering_services       0.64      0.68      0.66       136
              infrastructure_and_utilities_damage       0.64      0.50      0.56        70
                           injured_or_dead_people       0.79      0.76      0.77        98
                  missing_trapped_or_found_people       0.57      0.17      0.27        23
         

In [26]:
# BERT error analysis
err_cnt = 0
for i in range(0, len(y_test)):
  if (y_test[i]==4 and y_pred[i]==1):
    print(X_test[i])
    err_cnt = err_cnt+1
print("errors:", err_cnt)

errors: 0


In [27]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model,X_train,y_train,cv=3)
print(sum(scores)/len(scores))

Building sklearn text classifier...
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 600, validation data size: 66



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=19.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=9.0, style=ProgressStyle(description_wid…



Epoch 1, Train loss: 1.9419, Val loss: 1.4761, Val accy: 53.03%



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=19.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=9.0, style=ProgressStyle(description_wid…



Epoch 2, Train loss: 1.3023, Val loss: 1.1524, Val accy: 63.64%



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=19.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=9.0, style=ProgressStyle(description_wid…



Epoch 3, Train loss: 0.8672, Val loss: 1.0916, Val accy: 63.64%



HBox(children=(FloatProgress(value=0.0, description='Testing', max=42.0, style=ProgressStyle(description_width…



Loss: 1.0770, Accuracy: 65.27%
Building sklearn text classifier...
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 601, validation data size: 66



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=19.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=9.0, style=ProgressStyle(description_wid…



Epoch 1, Train loss: 1.9696, Val loss: 1.9856, Val accy: 31.82%



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=19.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=9.0, style=ProgressStyle(description_wid…



Epoch 2, Train loss: 1.5138, Val loss: 1.6852, Val accy: 42.42%



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=19.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=9.0, style=ProgressStyle(description_wid…



Epoch 3, Train loss: 1.1327, Val loss: 1.2664, Val accy: 60.61%



HBox(children=(FloatProgress(value=0.0, description='Testing', max=42.0, style=ProgressStyle(description_width…



Loss: 1.2279, Accuracy: 63.36%
Building sklearn text classifier...
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 601, validation data size: 66



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=19.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=9.0, style=ProgressStyle(description_wid…



Epoch 1, Train loss: 2.0201, Val loss: 1.9137, Val accy: 33.33%



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=19.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=9.0, style=ProgressStyle(description_wid…



Epoch 2, Train loss: 1.7450, Val loss: 1.7583, Val accy: 39.39%



HBox(children=(FloatProgress(value=0.0, description='Training  ', max=19.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Validating', max=9.0, style=ProgressStyle(description_wid…



Epoch 3, Train loss: 1.4801, Val loss: 1.5386, Val accy: 50.00%



HBox(children=(FloatProgress(value=0.0, description='Testing', max=42.0, style=ProgressStyle(description_width…



Loss: 1.4663, Accuracy: 55.26%
61.29602656548764
