In [30]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import torch
import torch.nn as nn
import torchtext
import re
import string

# https://github.com/rsreetech/PyTorchTextClassificationCustomDataset/blob/main/PyTorchTweetTextClassification.ipynb

In [118]:
# Compiled once at definition time because remove_url is applied to every row.
# A raw string is used so that `\(` and `\)` reach the regex engine without
# relying on Python passing unknown string escapes through (a SyntaxWarning on
# recent Python versions). The resulting pattern is unchanged.
# NOTE(review): inside the character class, `$-_` is a *range* (ASCII 0x24 to
# 0x5F), not three literal characters; kept as-is to preserve behavior.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def remove_url(text):
    """Return ``text`` with every ``http://`` / ``https://`` URL removed.

    A URL is matched from its scheme up to the first character that the
    pattern does not accept (for example a space). Surrounding whitespace is
    left untouched, so removing a URL from the middle of a sentence leaves two
    consecutive spaces.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The input with all matched URLs replaced by the empty string.
    """
    return _URL_PATTERN.sub('', text)

# Compiled once at definition time because remove_emoji is applied to every row.
# The previous single range U+24C2..U+1F251 also covered CJK ideographs, Hangul
# and many other scripts, so ordinary non-Latin text was deleted. It is replaced
# by explicit symbol ranges, each of which lies inside that old range, so
# nothing is removed now that was not removed before.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats subset kept from the original pattern
    "\U000025A0-\U000027BF"  # geometric shapes, miscellaneous symbols, dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics / ideographs
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only characters in the symbol ranges listed in ``_EMOJI_PATTERN`` are
    stripped; letters of any script (including CJK and Hangul) are preserved.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The input with every run of matched characters replaced by the empty
        string. Surrounding whitespace is left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

# Translation table that deletes every ASCII punctuation character. Built once
# instead of on every call; clean_text is applied to every row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize a tweet into lowercase, space-separated word tokens.

    Steps, in order:

    1. Delete every character in ``string.punctuation`` (characters are
       removed, not replaced by a space, so ``"big-deal"`` becomes
       ``"bigdeal"``).
    2. Split on whitespace.
    3. Drop tokens that consist only of digits and tokens of length 2 or less.
    4. Join the remaining tokens with single spaces and lowercase the result.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The cleaned string; empty if no token survives the filter.
    """
    without_punctuation = text.translate(_PUNCTUATION_TABLE)
    kept_words = [
        word
        for word in without_punctuation.split()
        if len(word) > 2 and not word.isdigit()
    ]
    return ' '.join(kept_words).lower()

In [119]:
# Read the training and test CSV files from the local data directory.
train_path = "data/train.csv"
test_path = "data/test.csv"
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [120]:
# Discard the columns that the rest of the notebook does not use.
# The frames are re-bound (no inplace=True) and columns that are already
# missing are ignored, so re-running this cell does not raise a KeyError.
UNUSED_COLUMNS = ["id", "keyword", "location"]
train_df = train_df.drop(columns=UNUSED_COLUMNS, errors="ignore")
test_df = test_df.drop(columns=UNUSED_COLUMNS, errors="ignore")


In [121]:
# Show the class balance of the training labels and the number of rows,
# framed by two banner lines.
print(
    '-------Train data--------',
    train_df['target'].value_counts(),
    len(train_df),
    '-------------------------',
    sep='\n',
)


-------Train data--------
target
0    4342
1    3271
Name: count, dtype: int64
7613
-------------------------


In [122]:
def preprocess_tweet(text):
    """Run the full cleaning pipeline on one tweet.

    Applies, in order, ``remove_url``, ``remove_emoji`` and ``clean_text``.
    """
    return clean_text(remove_emoji(remove_url(text)))


# Apply the same pipeline to both splits through one code path so that the
# train and test preprocessing cannot drift apart.
for frame in (train_df, test_df):
    frame["text"] = frame["text"].apply(preprocess_tweet)

In [123]:
# Hold out 20% of the training rows for validation. The split is stratified on
# the label and uses a fixed seed, so it is the same on every run.
all_texts = train_df['text'].to_list()
all_labels = train_df['target'].to_list()
X_train, X_valid, Y_train, Y_valid = model_selection.train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    stratify=train_df['target'].to_list(),
    random_state=0,
)


In [124]:
# Inspect the cleaned training texts produced by the train/validation split.
X_train

['theres fire the catalinas looks kinda cool this picture doesnt justice',
 'veldfest announces refunds after day twos extreme weather evacuation',
 'bomairinge elutranscendent straight body bagging',
 'and here was complaining about phoenix mode fire emblem turns out ray gigant will have difficulty option where you take damage',
 'iphooey time ironically michele bachmann brought this wron paul amp everyone blew her off and called hoax she was finally right',
 'orchid sign the witch',
 'the use perforated metal shear panel sfor seismicresistant applications',
 'families sue over legionnaires more than families affected the fatal outbreak legionnaires disea',
 'help yourself those you love who suffer from selfesteem wounds you can today',
 'reuters debris confirmed from mh370 relatives hope for discovery crash site',
 'bamenda floods kill animals birds',
 'various issues fail derail homes bid',
 'travelelixir any idea whats going hear sirens but this damn helo flying low apt shaking',
 

In [63]:
import torch
from torch.utils.data import DataLoader
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [65]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [125]:
# Display the training frame (text and target columns).
train_df

Unnamed: 0,text,target
0,our deeds are the reason this earthquake may a...,1
1,forest fire near ronge sask canada,1
2,all residents asked shelter place are being no...,1
3,people receive wildfires evacuation orders cal...,1
4,just got sent this photo from ruby alaska smok...,1
...,...,...
7608,two giant cranes holding bridge collapse into ...,1
7609,ariaahrary thetawniest the out control wild fi...,1
7610,m194 utc5km volcano hawaii,1
7611,police investigating after ebike collided with...,1


In [126]:
# torchtext's 'basic_english' tokenizer is used for every text in the notebook.
tokenizer = get_tokenizer('basic_english')

# NOTE(review): the vocabulary is built from all of train_df, which includes
# the rows held out as X_valid by the train/validation split - confirm that
# this is intended.
train_iter = train_df


def yield_tokens(data_iter):
    """Yield the token list of each entry of ``data_iter["text"]``, in order."""
    yield from (tokenizer(text) for text in data_iter["text"])


# "<unk>" is registered as a special token and then set as the default index,
# so looking up a token that is not in the vocabulary returns its index.
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<unk>"],
)
vocab.set_default_index(vocab["<unk>"])

In [127]:
def text_pipeline(x):
    """Tokenize ``x`` and return the vocabulary indices of its tokens."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert the raw label ``x`` to ``int``."""
    return int(x)

In [128]:
# Sanity check: tokens missing from the vocabulary map to the "<unk>" default index.
text_pipeline('here is the an example')

[107, 0, 1, 0, 2828]

In [129]:
def collate_batch(batch):
    """Collate ``(label, text)`` pairs into flat tensors on ``device``.

    Parameters
    ----------
    batch : iterable of (label, text)
        Each label goes through ``label_pipeline`` and each text through
        ``text_pipeline``.

    Returns
    -------
    tuple of torch.Tensor
        ``(labels, tokens, offsets)``: ``labels`` holds one int64 label per
        sample, ``tokens`` is the concatenation of all samples' token indices,
        and ``offsets[i]`` is the position in ``tokens`` where sample ``i``
        starts. All three are moved to ``device``.
    """
    labels, token_tensors = [], []
    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        token_tensors.append(
            torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        )

    # Start positions are the running sum of the lengths of all preceding
    # samples: prepend 0 and discard the last length before accumulating.
    lengths = [0, *(tokens.size(0) for tokens in token_tensors)]
    start_offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    flat_tokens = torch.cat(token_tensors)
    return label_tensor.to(device), flat_tokens.to(device), start_offsets.to(device)


In [None]:
# TODO: continue from https://github.com/rsreetech/PyTorchTextClassificationCustomDataset/blob/main/PyTorchTweetTextClassification.ipynb