In [1]:
import pandas as pd

df = pd.read_csv("data/raw/training.1600000.processed.noemoticon.csv",
                 encoding="latin1", header=None)

df.columns = ["label", "id", "date", "query", "user", "text"]
df = df[["text", "label"]]  
print(df.head())


                                                text  label
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...      0
1  is upset that he can't update his Facebook by ...      0
2  @Kenichan I dived many times for the ball. Man...      0
3    my whole body feels itchy and like its on fire       0
4  @nationwideclass no, it's not behaving at all....      0


In [2]:
df['label'].unique()

array([0, 4], dtype=int64)

In [3]:
from sklearn.model_selection import train_test_split

train_df, remaining = train_test_split(df, test_size=0.36, random_state=42)
val_df, test_df = train_test_split(remaining, test_size=0.4444, random_state=42)

print(f"Train: {len(train_df)/len(df):.2%}")
print(f"Val:   {len(val_df)/len(df):.2%}")
print(f"Test:  {len(test_df)/len(df):.2%}")

Train: 64.00%
Val:   20.00%
Test:  16.00%


In [4]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = re.sub(r"http\S+|@\w+|#\w+|<.*?>|[^\w\s]", "", str(text))
    text = text.lower()
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

for part in [train_df, val_df, test_df]:
    part["clean_text"] = part["text"].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from nltk.tokenize import word_tokenize
nltk.download("punkt")

for part in [train_df, val_df, test_df]:
    part["tokens"] = part["clean_text"].apply(word_tokenize)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maxim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
train_df.head()

Unnamed: 0,text,label,clean_text,tokens
657201,Can I get Wi-Fi for my iPhone on the plane tom...,0,get wifi iphone plane tomorrow put airplane mo...,"[get, wifi, iphone, plane, tomorrow, put, airp..."
195977,Muh bro is going to Busch Gardens....I'm not.....,0,muh bro going busch gardensim,"[muh, bro, going, busch, gardensim]"
839059,My friend matt is building a skateboard ramp.....,4,friend matt building skateboard ramp perfect s...,"[friend, matt, building, skateboard, ramp, per..."
995544,loading boxcar racer in laptop see,4,loading boxcar racer laptop see,"[loading, boxcar, racer, laptop, see]"
775883,@shoe_8 i miss you too text me?,0,miss text,"[miss, text]"


In [7]:
from collections import Counter

counter = Counter([w for tokens in train_df["tokens"] for w in tokens])
vocab = {w: i+2 for i, (w, c) in enumerate(counter.items()) if c > 1}
vocab["<pad>"], vocab["<unk>"] = 0, 1
print("Vocab size:", len(vocab))

Vocab size: 100943


In [8]:
def encode(tokens): 
    return [vocab.get(w, vocab["<unk>"]) for w in tokens]

for part in [train_df, val_df, test_df]:
    part["encoded"] = part["tokens"].apply(encode)


In [9]:
test_df.head()

Unnamed: 0,text,label,clean_text,tokens,encoded
1495891,my brother is back from uk for the summer!yeahy,4,brother back uk summeryeahy,"[brother, back, uk, summeryeahy]","[1770, 285, 1173, 1]"
245445,can't sleep yet but trying to be happy.. plea...,0,cant sleep yet trying happy please check wwwpt...,"[cant, sleep, yet, trying, happy, please, chec...","[209, 382, 673, 685, 486, 10, 1449, 1]"
164271,@grayguitar @andrewconnell I'm starting to fee...,0,im starting feel depressed hurricane talk im f...,"[im, starting, feel, depressed, hurricane, tal...","[54, 2130, 834, 4733, 17005, 871, 54, 1999, 8881]"
1452627,"@nessiecullenxD yah!! yah!! i love that, too.",4,yah yah love,"[yah, yah, love]","[5446, 5446, 542]"
998449,@jamesmills I'm gonna start calling you aidan ...,4,im gonna start calling aidan cant work technology,"[im, gon, na, start, calling, aidan, cant, wor...","[54, 578, 76, 1036, 4905, 4185, 209, 227, 6944]"


In [10]:
from torch.nn.utils.rnn import pad_sequence
import torch

MAX_LEN = 50  

def pad_batch(sequences, max_len=MAX_LEN):
    tensor_seqs = [torch.tensor(seq[:max_len]) for seq in sequences]
    return pad_sequence(tensor_seqs, batch_first=True, padding_value=0)

train_padded = pad_batch(train_df["encoded"])
val_padded   = pad_batch(val_df["encoded"])
test_padded  = pad_batch(test_df["encoded"])


In [11]:
print(train_padded)

tensor([[   2,    3,    4,  ...,    0,    0,    0],
        [  12,   13,   14,  ...,    0,    0,    0],
        [  17,   18,   19,  ...,    0,    0,    0],
        ...,
        [1033, 1767,    1,  ...,    0,    0,    0],
        [2216,  663,  389,  ...,    0,    0,    0],
        [ 549, 2248,  272,  ...,    0,    0,    0]])


In [12]:
import gensim.downloader as api
glove = api.load("glove-wiki-gigaword-100")  # 100-мерные

embedding_matrix = torch.zeros(len(vocab), 100)
for i, word in enumerate(vocab.keys()):
    if word in glove.key_to_index:
        embedding_matrix[i] = torch.tensor(glove[word])
    else:
        embedding_matrix[i] = torch.randn(100)


In [15]:
embedding_matrix

tensor([[ 0.1443,  0.4395,  0.5832,  ...,  0.5013,  0.4954,  0.4992],
        [-0.0191,  0.3890,  0.2648,  ..., -0.2924, -0.3516,  0.2613],
        [-0.3609, -0.5975,  0.5235,  ...,  0.2213,  1.4631,  0.2227],
        ...,
        [-0.0554,  1.0538,  0.3631,  ...,  1.4923, -0.6138,  2.1924],
        [ 0.5921, -0.7197,  1.0233,  ...,  0.0659,  0.5075, -0.5626],
        [-0.2779,  0.1448, -0.2056,  ...,  1.1642,  1.0713, -0.2996]])