In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
import nltk
from nltk.corpus import stopwords

In [15]:
args = Namespace(
    raw_dataset_csv="english_dataset.tsv",
    train_proportion=0.85,
    val_proportion=0.15,
    
    output_munged_csv="english_dataset_munged.tsv",
    seed=1337
)

In [16]:
df = pd.read_csv("english_dataset.tsv", sep="\\t", engine='python')

In [17]:
df.head()

Unnamed: 0,text_id,text,task_1,task_2,task_3
0,hasoc_en_1,#DhoniKeepsTheGlove | WATCH: Sports Minister K...,NOT,NONE,NONE
1,hasoc_en_2,@politico No. We should remember very clearly ...,HOF,HATE,TIN
2,hasoc_en_3,@cricketworldcup Guess who would be the winner...,NOT,NONE,NONE
3,hasoc_en_4,Corbyn is too politically intellectual for #Bo...,NOT,NONE,NONE
4,hasoc_en_5,All the best to #TeamIndia for another swimmin...,NOT,NONE,NONE


In [18]:
set(df.task_1)

{'HOF', 'NOT'}

In [19]:
df_new = df[["text", "task_1"]]

In [20]:
by_category = collections.defaultdict(list)
for _, row in df_new.iterrows():
  by_category[row.task_1].append(row.to_dict())
print(by_category)



In [25]:
final_list = []
np.random.seed(args.seed)
for _, item_list in sorted(by_category.items()):
    np.random.shuffle(item_list)
    n = len(item_list)
    n_train = int(args.train_proportion*n)
    n_val = int(args.val_proportion*n)
    #n_test = int(args.test_proportion*n)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    #for item in item_list[n_train+n_val:]:
        #item['split'] = 'test'  
    
    # Add to final list
    final_list.extend(item_list)

In [26]:
final_text = pd.DataFrame(final_list)

In [27]:
final_text.split.value_counts()

train    4975
val       877
Name: split, dtype: int64

In [32]:
def preprocess_text(text):
  
  text = re.sub("@[\w]*\b", " ", text)
  text = re.sub("[^a-zA-Z]", " ", text)  
  text = ' '.join(word.lower() for word in text.split(" ") if word not in nltk.corpus.stopwords.words('english'))
  return text

final_text.text = final_text.text.apply(preprocess_text)

In [33]:
final_text.text.head()

0    guy longtime gop strategist  us already though...
1    dumbass chief dumbassery   fucktrump https   c...
2     amvetsupport stepped things integrity donald ...
3    seen rally yesterday      translation   leave ...
4     citynews  twitter think done quite enough alr...
Name: text, dtype: object

In [35]:
vocab = (set(final_text.text))

In [36]:
len(vocab)

5827

In [37]:
final_text.to_csv(args.output_munged_csv, index=False)