In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

In [2]:
from typing import Dict

from allennlp.data import DatasetReader, Tokenizer, TokenIndexer, Instance, Vocabulary
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer

In [11]:
# Preview the first rows of the tweet table.
# NOTE(review): in notebook order `df` is only assigned in a later cell
# (`df = pd.read_csv(data_path)`), so this cell fails on a fresh
# top-to-bottom run unless that cell is executed first.
df.head()

Unnamed: 0,favorite_count,full_text,hashtags,retweet_count,year,party_id
0,0,"b""RT @KUSINews: One of our longtime viewers wa...",KUSI,10,2017.0,R
1,258,"b""Today I'm urging the @CDCgov to immediately ...",Coronavirus,111,2020.0,R
2,0,"b'Tomorrow, #MO03 seniors graduate from Calvar...",MO03,2,2014.0,R
3,9,b'Congrats to #TeamUSA and Canton Native @JGre...,TeamUSA WorldJuniors,3,2017.0,R
4,3,b'Pleased to support @amergateways at their Ju...,ImmigrantHeritageMonth,3,2019.0,D


In [14]:
class KaggleReader(DatasetReader):
    """Dataset reader producing text/label instances from a tweet CSV.

    The CSV is expected to expose a ``full_text`` column (tokenized into the
    ``'text'`` field) and a ``party_id`` column (used as the ``'label'``).

    NOTE(review): a class with the same name is defined again in a later
    cell of this notebook; whichever cell ran last is the one in effect.
    """

    def __init__(self,
                 max_instances = 600_000,
                 tokenizer = None,
                 token_indexers = None,
                 max_tokens= None):
        # Let the base DatasetReader handle the instance limit.
        super().__init__(max_instances=max_instances)

        # Fall back to whitespace tokenization when no tokenizer is supplied.
        self.tokenizer = tokenizer if tokenizer else WhitespaceTokenizer()

        # Fall back to a single-id indexer under the 'tokens' key when no
        # indexers are supplied.
        self.token_indexers = (
            token_indexers if token_indexers else {'tokens': SingleIdTokenIndexer()}
        )

        self.max_tokens = max_tokens

    def text_to_instance(self, fields):
        """Wrap an already-built mapping of fields in an ``Instance``."""
        # Instance is the class imported from allennlp.data.
        return Instance(fields)

    def _read(self, file_path: str):
        """Yield one ``Instance`` for every row of the CSV at ``file_path``."""
        # The first CSV column is used as the frame's index.
        frame = pd.read_csv(file_path, index_col=0)

        for _, record in frame.iterrows():
            tokenized = self.tokenizer.tokenize(record["full_text"])

            # A truthy max_tokens truncates; None (or 0) keeps every token.
            if self.max_tokens:
                tokenized = tokenized[:self.max_tokens]

            yield self.text_to_instance({
                'text': TextField(tokenized, self.token_indexers),
                'label': LabelField(record["party_id"]),
            })


In [28]:
class KaggleReader(DatasetReader):
    """Dataset reader that turns rows of a tweet CSV into classification instances.

    Every usable row contributes one ``Instance`` with a ``'text'`` field
    (tokens of the ``full_text`` column) and a ``'label'`` field (the
    ``party_id`` column).

    Args:
        max_instances: Forwarded unchanged to ``DatasetReader.__init__``.
        tokenizer: Tokenizer applied to each text. A ``WhitespaceTokenizer``
            is used when none is given.
        token_indexers: Indexers attached to every ``TextField``. Defaults to
            ``{'tokens': SingleIdTokenIndexer()}``.
        max_tokens: When truthy, each token list is cut to at most this many
            tokens. ``None`` (or ``0``) leaves the tokens untouched.
    """

    def __init__(self,
                 max_instances: int = 100,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = None):
        super().__init__(max_instances = max_instances)
        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens

    def text_to_instance(self, fields) -> Instance:
        """Wrap an already-built mapping of field name to field in an ``Instance``."""
        return Instance(fields)

    def _read(self, file_path: str):
        """Yield one ``Instance`` per usable row of the CSV at ``file_path``.

        Rows whose ``full_text`` or ``party_id`` cell is empty are skipped
        instead of aborting the whole read.

        Args:
            file_path: Path of the CSV file; its first column is used as the
                frame's index.

        Yields:
            ``Instance`` objects with ``'text'`` and ``'label'`` fields.
        """
        df = pd.read_csv(file_path, index_col=0)

        # pandas reads empty cells as NaN (a float). The tokenizer and
        # LabelField both expect strings, so a single incomplete row would
        # otherwise raise in the middle of iteration.
        df = df.dropna(subset=["full_text", "party_id"])

        # Only two columns are needed, so walk them directly rather than
        # building a full Series per row with iterrows().
        for text, label in zip(df["full_text"], df["party_id"]):
            tokens = self.tokenizer.tokenize(text)
            if self.max_tokens:
                tokens = tokens[:self.max_tokens]
            text_field = TextField(tokens, self.token_indexers)
            label_field = LabelField(label)
            fields = {'text': text_field, 'label': label_field}
            yield self.text_to_instance(fields)

In [8]:
# CSV file with the tweets; passed to the data loader and to pd.read_csv
# in other cells of this notebook.
data_path = "congressional_tweet.csv"

In [22]:
import nltk
# Part-of-speech tag the tweet stored under index label 0, one step at a time.
first_tweet = df.full_text[0]
first_tweet_words = nltk.word_tokenize(first_tweet)
tag = nltk.pos_tag(first_tweet_words)

In [29]:
# Build a reader with its default settings and a loader that groups the
# instances read from `data_path` into batches of two.
reader = KaggleReader()
dataloader = MultiProcessDataLoader(reader, data_path ,batch_size=2)
# The vocabulary is built from the loader's instances and then handed back
# to the loader; this has to happen before iterating over batches below.
instances = dataloader.iter_instances()
vocab = Vocabulary.from_instances(instances)
dataloader.index_with(vocab)

# Print only the first batch as a sanity check, then stop.
for batch in dataloader:
    print(batch)
    break

loading instances: 0it [00:00, ?it/s]

building vocab: 0it [00:00, ?it/s]

{'text': {'tokens': {'tokens': tensor([[ 49, 262, 263,   5,  11, 264, 265,  50,   7, 266, 267, 117,  33,   4,
         118,  71,  72, 268, 269, 119,  27,   6, 270, 271,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0],
        [272,  51,  73,   2, 273,   3, 120, 274,   6, 275, 276, 277,   3, 121,
         122,  12,  39, 278,   2, 279, 280,  73,   2, 281,   3, 282, 283, 284,
          16, 123,   8, 124,  34, 285,   3,  40, 286, 125,   8,  40, 287, 288,
         289, 290]])}}, 'label': tensor([1, 1])}


In [9]:
# Load the full tweet CSV into a DataFrame for interactive inspection.
# NOTE(review): read without index_col=0, so the file's first column shows up
# as an "Unnamed: 0" data column (see the head() output), whereas
# KaggleReader._read loads the same file with index_col=0 - confirm which
# form is intended.
df = pd.read_csv(data_path)

In [10]:
# Preview the first rows of the DataFrame `df`.
df.head()

Unnamed: 0,favorite_count,full_text,hashtags,retweet_count,year,party_id
0,0,"b""RT @KUSINews: One of our longtime viewers wa...",KUSI,10,2017.0,R
1,258,"b""Today I'm urging the @CDCgov to immediately ...",Coronavirus,111,2020.0,R
2,0,"b'Tomorrow, #MO03 seniors graduate from Calvar...",MO03,2,2014.0,R
3,9,b'Congrats to #TeamUSA and Canton Native @JGre...,TeamUSA WorldJuniors,3,2017.0,R
4,3,b'Pleased to support @amergateways at their Ju...,ImmigrantHeritageMonth,3,2019.0,D
