#### Attempting my first NLP project

In [36]:
#Import libraries
import pandas as pd

In [37]:
col_names = ['sentiment','id','date','query_string','user','text']
data_path = r'/Users/OliverPan/Desktop/processed_tweets.csv'

tweets = pd.read_csv(data_path, header=None, names=col_names, encoding="ISO-8859-1").sample(frac=1) # .sample(frac=1) shuffles the data
tweets = tweets[['sentiment', 'text']] 

In [38]:
#Process tweet format
import re

allowed_chars = ' AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789~`!@#$%^&*()-=_+[]{}|;:",./<>?'
punct = '!?,.@#'
maxlen = 280

def preprocess(text):
    return ''.join([' ' + char + ' ' if char in punct else char for char in [char for char in re.sub(r'http\S+', 'http', text, flags=re.MULTILINE) if char in allowed_chars]])[:maxlen]

In [39]:
#Reformat data for Flair
tweets['text'] = tweets['text'].apply(preprocess)
tweets['sentiment'] = '__label__' + tweets['sentiment'].astype(str)

In [40]:
#Save data
import os

# Create directory for saving data if it does not already exist
data_dir = './processed-data'
if not os.path.isdir(data_dir):
    os.mkdir(data_dir)

# Save a percentage of the data (you could also only load a fraction of the data instead)
amount = 0.25

# Splitting into three test,train,dev folders
tweets.iloc[0:int(len(tweets)*0.8*amount)].to_csv(data_dir + '/train_tweet.csv', sep='\t', index=False, header=False)
tweets.iloc[int(len(tweets)*0.8*amount):int(len(tweets)*0.9*amount)].to_csv(data_dir + '/test_tweet.csv', sep='\t', index=False, header=False)
tweets.iloc[int(len(tweets)*0.9*amount):int(len(tweets)*1.0*amount)].to_csv(data_dir + '/dev_tweet.csv', sep='\t', index=False, header=False)

In [41]:
from flair.data import Corpus
from flair.datasets import ClassificationCorpus

# this is the folder in which train, test and dev files reside
data_folder = '/Users/OliverPan/Desktop/tweets'

# load corpus containing training, test and dev data
corpus: Corpus = ClassificationCorpus(data_folder,
                                      test_file='test_tweet.csv',
                                      dev_file='dev_tweet.csv',
                                      train_file='train_tweet.csv')


2020-10-30 23:46:28,560 Reading data from /Users/OliverPan/Desktop/tweets
2020-10-30 23:46:28,561 Train: /Users/OliverPan/Desktop/tweets/train_tweet.csv
2020-10-30 23:46:28,562 Dev: /Users/OliverPan/Desktop/tweets/dev_tweet.csv
2020-10-30 23:46:28,563 Test: /Users/OliverPan/Desktop/tweets/test_tweet.csv


In [42]:
label_dict = corpus.make_label_dictionary()

2020-10-30 23:46:29,800 Computing label dictionary. Progress:


100%|██████████| 180000/180000 [00:27<00:00, 6523.78it/s]

2020-10-30 23:47:26,990 [b'4', b'0']





In [43]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings

word_embeddings = [WordEmbeddings('glove'),
                    FlairEmbeddings('news-forward'),
                    FlairEmbeddings('news-backward')
                  ]

2020-10-30 23:47:28,594 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-forward-0.4.1.pt not found in cache, downloading to /var/folders/18/_sh0fhkx7jdcszv81_zj2tq00000gn/T/tmparji1m3o


100%|██████████| 73034624/73034624 [00:38<00:00, 1887366.78B/s]

2020-10-30 23:48:07,754 copying /var/folders/18/_sh0fhkx7jdcszv81_zj2tq00000gn/T/tmparji1m3o to cache at /Users/oliverpan/.flair/embeddings/news-forward-0.4.1.pt
2020-10-30 23:48:07,782 removing temp file /var/folders/18/_sh0fhkx7jdcszv81_zj2tq00000gn/T/tmparji1m3o





2020-10-30 23:48:08,515 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-backward-0.4.1.pt not found in cache, downloading to /var/folders/18/_sh0fhkx7jdcszv81_zj2tq00000gn/T/tmpk5u032qc


100%|██████████| 73034575/73034575 [00:36<00:00, 2006498.13B/s]

2020-10-30 23:48:45,354 copying /var/folders/18/_sh0fhkx7jdcszv81_zj2tq00000gn/T/tmpk5u032qc to cache at /Users/oliverpan/.flair/embeddings/news-backward-0.4.1.pt
2020-10-30 23:48:45,382 removing temp file /var/folders/18/_sh0fhkx7jdcszv81_zj2tq00000gn/T/tmpk5u032qc





In [44]:
from flair.embeddings import DocumentRNNEmbeddings

document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)

In [45]:
from flair.models import TextClassifier

classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

In [46]:
from flair.trainers import ModelTrainer

trainer = ModelTrainer(classifier, corpus)

In [48]:
trainer.train('model-saves',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=8,
              max_epochs=1)

2020-10-30 23:52:12,384 ----------------------------------------------------------------------------------------------------
2020-10-30 23:52:12,385 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=4196, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f9054867430>
Traceback (most recent call last):
  File "/Users/oliverpan/opt/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    self._shutdown_workers()
  File "/Users/oliverpan/opt/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/Users/oliverpan/opt/anaconda3/lib/python3.8/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/Users/oliverpan/opt/anaconda3/lib/python3.8/multiprocessing/popen_fork.py", line 44, in wait
    if not wait([self.sentinel], timeout):
  File "/Users/oliverpan/opt/anaconda3/lib/python3.8/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/Users/oliverpan/opt/anaconda3/lib/python3.8/selectors.py", line 415, in select
    fd_event_list = self._selec

KeyboardInterrupt: 