In [15]:
from models.bow import *
from utils import *
from embeddings.glove import *
from embeddings.tfidf import *
from models.nn import *
from torch.utils.data import DataLoader, TensorDataset
import torch
import pickle
from models.rnn import *
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocessing import *
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from models.one_hot_vector import convert_to_one_hot_vec, get_features_ohv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [9]:
full = False

tweets = []
labels = []

if (not full):
    load_tweets(SMALL_TRAIN_POS, 0, tweets, labels)
    load_tweets(SMALL_TRAIN_NEG, 1, tweets, labels)
else:
    load_tweets(TRAIN_POS, 0, tweets, labels)
    load_tweets(TRAIN_NEG, 0, tweets, labels)

In [10]:
X_train, X_val, Y_train, Y_val = split_train_test(np.array(tweets), np.array(labels), 1)

In [5]:
tweets_pp, labels_pp = preprocess(tweets, labels)
separator = " "  # Define the separator, which in this case is a space
tweets_pp = [separator.join(tweet) for tweet in tweets_pp]
X_train_pp, X_val_pp, Y_train_pp, Y_val_pp = split_train_test(np.array(tweets_pp), np.array(labels_pp), 1)

In [5]:
sample_tweets = [
    ["worries", "fml", "tooo", "seeee", "youuuu"],
    ["thiiis", "is", "aaamazing", "and", "coooool"],
    ["whaaaaat", "a", "beauuutiful", "daaaay"],
    ["heyy", "theeerreeee", "what's", "uuup"]
]

# Applying the function to the sample tweets
processed_tweets = remove_repeated(sample_tweets)
processed_tweets = expand_abbreviations(processed_tweets, ABBREVIATIONS)

# Display the results
for original, processed in zip(sample_tweets, processed_tweets):
    print("Original:", original)
    print("Processed:", processed)
    print()

Original: ['worries', 'fml', 'tooo', 'seeee', 'youuuu']
Processed: ['worries', 'fuck my life', 'too', 'see', 'youu']

Original: ['thiiis', 'is', 'aaamazing', 'and', 'coooool']
Processed: ['thiis', 'is', 'aamazing', 'and', 'cool']

Original: ['whaaaaat', 'a', 'beauuutiful', 'daaaay']
Processed: ['whaat', 'a', 'beauutiful', 'daay']

Original: ['heyy', 'theeerreeee', "what's", 'uuup']
Processed: ['heyy', 'theerree', "what's", 'uup']



# Bag of Words

In [8]:
bow_1 = bow(X_train, X_val, Y_train, Y_val)

---- Top 10 negative words
yougetmajorpointsif -5.116773458519785
bahaha -4.080151608532275
smartnokialumia -3.5082983071390537
waystomakemehappy -3.440534139678091
worries -3.0610592153485476
harrypotterchatuplines -2.9260008416897767
thanx -2.6408653121258454
therefore -2.5497920943340184
ifindthatattractive -2.5286355540916654
photographer -2.4957547617473956

---- Top 10 positive words
electronics 3.4836681586846003
rip 3.4842811276775705
apparel 3.700831391681254
depressed 3.754107838340755
misc 3.978292522735455
depressing 4.076664533464381
sadtweet 4.119527629840292
saddest 5.333012560746026
hardcover 7.501051086695297
paperback 8.309226555870515

ACCURACY: 0.802
RECALL: 0.7660128102481986
F1: 0.7944778908034047
PRECISION: 0.8251401466149202
Validation Accuracy: 0.802
              precision    recall  f1-score   support

           0       0.78      0.84      0.81     10008
           1       0.83      0.77      0.79      9992

    accuracy                           0.80     20

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
bow_2 = bow(X_train_pp, X_val_pp, Y_train_pp, Y_val_pp)

---- Top 10 negative words
thankss -3.1107689449136267
worries -2.5128494792527762
blessing -2.4029983938400172
nf -2.2614414333832924
sweetest -2.1953173007267406
ayee -2.141352392520665
funn -2.1317873270763896
appreciated -2.1188804172517264
tuned -2.116599169521241
pumped -2.1050817519357614

---- Top 10 positive words
dvd 3.184057734908375
depressed 3.2678905108600227
saddest 3.439264162061857
guides 3.4657501730403544
apparel 3.6387830502101517
depressing 4.026632591006754
electronics 4.15162325096423
misc 4.841221858669179
hardcover 8.281027014245268
paperback 9.038994646479518

ACCURACY: 0.7879874248524626
RECALL: 0.7503558524033724
F1: 0.780968660968661
PRECISION: 0.8141855768088393
Validation Accuracy: 0.7879874248524626
              precision    recall  f1-score   support

           0       0.77      0.83      0.79      8998
           1       0.81      0.75      0.78      9133

    accuracy                           0.79     18131
   macro avg       0.79      0.79      0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# One-hot vector

In [16]:
# build vocabulary
vocab = build_vocab(tweets)

# we only keep the 5000 most frequent words, both to reduce the computational cost and reduce overfitting
vectorizer = CountVectorizer(vocabulary=vocab)
vectorizer.fit_transform(tweets)

# convert tweets to one-hot vectors
X_train_ohv = []
X_val_ohv = []
for tweet in tqdm(X_train, desc="Converting training set"):
    X_train_ohv.append(convert_to_one_hot_vec(tweet, vocab))
for tweet in tqdm(X_val, desc="Converting validation set"):
    X_val_ohv.append(convert_to_one_hot_vec(tweet, vocab))

# train logistic regression model
model = LogisticRegression(C=1e5, max_iter=100)
model.fit(X_train_ohv, Y_train)

# print features
get_features_ohv(model, vectorizer)

# predict on validation set
y_pred = model.predict(X_val_ohv)

# print metrics
get_basic_metrics(y_pred, Y_val)

Converting training set: 100%|██████████| 180000/180000 [00:04<00:00, 40396.25it/s]
Converting validation set: 100%|██████████| 20000/20000 [00:00<00:00, 39246.91it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---- Top 10 negative words
#yougetmajorpointsif -4.737628233664645
bahaha -4.3701856188649
#smartnokialumia -3.850843746104472
#waystomakemehappy -3.5979460560038636
worries -3.147664480202091
#harrypotterchatuplines -3.0716603696599694
therefore -2.8550036454535865
delicious -2.668267767752255
thanx -2.514064617157143
impress -2.514044352774007

---- Top 10 positive words
unfair 3.599135127718332
wahhh 3.6176393927719346
fml 3.661910284924043
depressing 3.7873509153737426
rip 3.8071883048871356
#sadtweet 4.10726439302038
hardcover 4.919052435466942
(8 4.99767691141605
saddest 5.665785725193116
paperback 5.998566556266123

ACCURACY: 0.8223
RECALL: 0.7906325060048038
F1: 0.8163687093107369
PRECISION: 0.8438367870113224


# Word embeddings

In [71]:
GLOVE_WIKI_200D = 'data/glove_wiki/glove.6B.200d.txt'

In [68]:
from utils import GLOVE_TWEET_100D, GLOVE_WIKI_100D, GLOVE_TWEET_200D, GLOVE_WIKI_200D
from embeddings.glove import all_tweets_to_glove

ImportError: cannot import name 'GLOVE_WIKI_200D' from 'utils' (/Users/selimjerad/Desktop/whitewashed/utils.py)

In [90]:
X_train_glove_wiki = all_tweets_to_glove(X_train, GLOVE_WIKI_200D, 200)
X_val_glove_wiki = all_tweets_to_glove(X_val, GLOVE_WIKI_200D, 200)

In [91]:
X_train_glove_tweet = all_tweets_to_glove(X_train, GLOVE_TWEET_200D, 200)
X_val_glove_tweet = all_tweets_to_glove(X_val, GLOVE_TWEET_200D, 200)

In [66]:
vocabulary = build_vocab(tweets)
X_train_tfidf, _ = transform_tweets_to_tfidf(X_train, vocabulary)
X_val_tfidf, _ = transform_tweets_to_tfidf(X_val, vocabulary)

In [93]:
clf = LogisticRegression()
clf.fit(X_train_glove_wiki, Y_train)
get_basic_metrics(clf.predict(X_val_glove_wiki), Y_val)

ACCURACY: 0.66755
RECALL: 0.6381104883907126
F1: 0.6572857069223236
PRECISION: 0.677649059411202


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [94]:
clf = LogisticRegression()
clf.fit(X_train_glove_tweet, Y_train)
get_basic_metrics(clf.predict(X_val_glove_tweet), Y_val)

ACCURACY: 0.6795
RECALL: 0.6531224979983987
F1: 0.6706402219710205
PRECISION: 0.6891235480464625


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [67]:
clf = LogisticRegression()
clf.fit(X_train_tfidf, Y_train)
get_basic_metrics(clf.predict(X_val_tfidf), Y_val)

ACCURACY: 0.80015
RECALL: 0.7748198558847078
F1: 0.7948257276320517
PRECISION: 0.8158920855727685


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Neural networks

In [None]:
from models.nn import *

### GloVe + NN

In [None]:
X_train = all_tweets_to_glove(X_train,  GLOVE_TWEET_200D, 200)
X_val = all_tweets_to_glove(X_val, GLOVE_TWEET_200D, 200)

In [None]:
train_dataset = TensorDataset(torch.tensor(X_train).to(torch.float32), torch.tensor(Y_train).to(torch.float32))
test_dataset = TensorDataset(torch.tensor(X_val).to(torch.float32), torch.tensor(Y_val).to(torch.float32))

In [None]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model = train_simple_nn(train_loader, 200)
test_simple_nn(test_loader, model)

### TF-IDF + NN

In [None]:
X_train_dense = torch.tensor(X_train_tfidf.toarray(), dtype=torch.float32)
X_val_dense = torch.tensor(X_val_tfidf.toarray(), dtype=torch.float32)

In [None]:
train_dataset = TensorDataset(X_train_dense, torch.tensor(Y_train).to(torch.float32))
test_dataset = TensorDataset(X_val_dense, torch.tensor(Y_val).to(torch.float32))

In [None]:
# TF-IDF + NN
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model = train_simple_nn(train_loader, 5000)
test_simple_nn(test_loader, model)

# Naïve Bayes

# Recurrent Neural Network

In [6]:
batch_size = 32

In [9]:
tokens_train, tokens_val = get_tokens_rnn(X_train, X_val)
train_dataset = TensorDataset(torch.tensor(tokens_train, dtype=torch.long),
                              torch.from_numpy(Y_train.astype(np.float32)))
test_dataset = TensorDataset(torch.tensor(tokens_val, dtype=torch.long), torch.from_numpy(Y_val.astype(np.float32)))

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [10]:
model = train_rnn(train_loader)
test_rnn(test_loader, model)

Epoch 1/20, Loss: 0.40875697135925293
Epoch 2/20, Loss: 0.4908829927444458
Epoch 3/20, Loss: 0.4381594955921173
Epoch 4/20, Loss: 0.42911550402641296
Epoch 5/20, Loss: 0.4773136079311371
Epoch 6/20, Loss: 0.443154901266098
Epoch 7/20, Loss: 0.2574199140071869
Epoch 8/20, Loss: 0.418632447719574
Epoch 9/20, Loss: 0.29267454147338867
Epoch 10/20, Loss: 0.32499146461486816
Epoch 11/20, Loss: 0.4105643332004547
Epoch 12/20, Loss: 0.32943469285964966
Epoch 13/20, Loss: 0.43922320008277893
Epoch 14/20, Loss: 0.36273807287216187
Epoch 15/20, Loss: 0.2680851221084595
Epoch 16/20, Loss: 0.5699405074119568
Epoch 17/20, Loss: 0.30537500977516174
Epoch 18/20, Loss: 0.28328511118888855
Epoch 19/20, Loss: 0.313154011964798
Epoch 20/20, Loss: 0.325120210647583
Validation Accuracy: 0.80745
              precision    recall  f1-score   support

         0.0       0.79      0.83      0.81     10008
         1.0       0.82      0.78      0.80      9992

    accuracy                           0.81     200

In [7]:
tokens_train, tokens_val = get_tokens_rnn(X_train_pp, X_val_pp)
train_dataset = TensorDataset(torch.tensor(tokens_train, dtype=torch.long),
                              torch.from_numpy(Y_train_pp.astype(np.float32)))
test_dataset = TensorDataset(torch.tensor(tokens_val, dtype=torch.long), torch.from_numpy(Y_val_pp.astype(np.float32)))

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

AssertionError: Size mismatch between tensors

In [None]:
model = train_rnn(train_loader)
test_rnn(test_loader, model)