In [3]:
# Mount Google Drive at /content/drive (Colab-only API) so files stored on
# Drive can be opened through regular filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import math


In [6]:
# Read data: locations of every input file, all relative to one Drive folder.

PREFIX = "/content/drive/MyDrive/Colab Notebooks/"

# Word embeddings and the vocabulary they are indexed by
EMBEDDINGS = f"{PREFIX}embeddings.txt"
VOCAB = f"{PREFIX}vocab.txt"

# Labelled training tweets (one file per class) and the unlabelled test set
POS_TWEETS = f"{PREFIX}twitter-datasets/train_pos.txt"
NEG_TWEETS = f"{PREFIX}twitter-datasets/train_neg.txt"
TEST_DATA = f"{PREFIX}twitter-datasets/test_data.txt"

# Parse the embeddings file: each line is "<word> <v1> <v2> ..." separated by
# single spaces. The result maps word -> 1-D numpy vector.
vecs = {}
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(EMBEDDINGS, "r", encoding="utf-8") as f:
    for line in f:
        word, *values = line.rstrip().split(' ')
        vecs[word] = np.array([float(x) for x in values])

# Parse the vocabulary and build an index: the first space-separated token of
# line i is mapped to row index i.
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(VOCAB, "r", encoding="utf-8") as f:
    vocab = {x.rstrip().split(' ')[0]: i for i, x in enumerate(f)}

# Dense (vocab_size, emb_dim) matrix; the embedding width is read from any one
# parsed vector without materialising the whole key list.
embeddings = np.zeros((len(vocab), len(next(iter(vecs.values())))))
for w, v in vecs.items():
    # "<unk>" is skipped here and never written into the matrix.
    if w == "<unk>":
        continue
    embeddings[vocab[w], :] = v

# Load the training tweets as lists of whitespace-separated tokens.
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(NEG_TWEETS, "r", encoding="utf-8") as f:
    n_tweets = [line.rstrip().split() for line in f]
with open(POS_TWEETS, "r", encoding="utf-8") as f:
    p_tweets = [line.rstrip().split() for line in f]

# Stack the two lists together (will be used to see max_length of tweet)
combined_tweets = n_tweets + p_tweets


In [7]:
# Parse the test set. Each line is "<id>,<tweet text>"; the tweet text may
# itself contain commas, so only the first comma is treated as the separator.
testing_tweets = []
testing_tweets_ids = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(TEST_DATA, "r", encoding="utf-8") as f:
    for line in f:
        tweet_id, _sep, tweet_text = line.rstrip().partition(',')
        testing_tweets.append(tweet_text.split())
        testing_tweets_ids.append(int(tweet_id))


In [12]:
# Convert each tweet to a (max_tweet_length, emb_dim) matrix of word embeddings,
# padded to the length of the longest tweet.

def modified_load_tweets(tweets_list, series, max_tweet_length, label=None, start_index=0):
    """Embed tokenised tweets into fixed-size matrices and store them in ``series``.

    Words missing from the global ``vocab`` are dropped. The remaining word
    vectors are centred inside a ``(max_tweet_length, emb_dim)`` matrix whose
    padding rows are filled with the mean of the tweet's word vectors. A tweet
    with no in-vocabulary word becomes an all-zero matrix.

    Args:
        tweets_list: list of tweets, each a list of word strings.
        series: pre-allocated tensor written in place, one tweet per row. Its
            second dimension must be ``max_tweet_length`` (plus one when
            ``label`` is given).
        max_tweet_length: number of word rows every tweet is padded to.
        label: optional numeric label. When given, one extra row filled with
            ``label`` is appended after the word rows.
        start_index: row of ``series`` that receives the first tweet. Defaults
            to 0. Pass the number of rows already filled when loading several
            lists into the same tensor, otherwise each call overwrites the
            rows written by the previous one.

    Returns:
        The same ``series`` object that was passed in.

    Raises:
        ValueError: if a filtered tweet has more than ``max_tweet_length`` words.

    Side effects:
        Prints progress every 1000 tweets and appends each filtered tweet
        length to the global ``length_list``, which must already exist. Reads
        the globals ``vocab`` and ``embeddings``.
    """
    print("Loading tweets...")
    tot = len(tweets_list)
    emb_dim = embeddings.shape[1]  # embedding width, hoisted out of the loop

    for i, tweet in enumerate(tweets_list):
        if i % 1000 == 0:
            print(f"{i}/{tot} ({int(i/tot*100)} %)")

        # Filter out words not in the vocabulary, keeping their row indices
        word_ids = [vocab[word] for word in tweet if word in vocab]

        length_tweet = len(word_ids)
        length_list.append(length_tweet)

        if length_tweet > max_tweet_length:
            raise ValueError(
                f"tweet {i} has {length_tweet} in-vocabulary words, "
                f"more than max_tweet_length={max_tweet_length}"
            )

        if length_tweet == 0:
            tweet_embeddings = torch.zeros((max_tweet_length, emb_dim))
        else:
            word_vectors = torch.from_numpy(embeddings[word_ids]).float()
            tweet_mean = torch.mean(word_vectors, axis=0)
            # Start from a matrix where every row is the tweet mean (the padding value)
            tweet_embeddings = torch.ones((max_tweet_length, emb_dim)) * tweet_mean
            # Centre the real word vectors; when the padding is odd the extra
            # padding row goes after the words.
            left_pad = (max_tweet_length - length_tweet) // 2
            tweet_embeddings[left_pad:left_pad + length_tweet, :] = word_vectors

        if label is not None:
            # Extra final row carrying the label, repeated across the embedding width
            tweet_embeddings = torch.vstack((tweet_embeddings, label * torch.ones(emb_dim)))

        series[start_index + i] = tweet_embeddings

    return series

In [24]:
def save(torch_series_train, torch_series_test):
    """Split the labelled tensor into train/validation sets and save everything.

    The last index along dimension 1 of ``torch_series_train`` is taken as the
    label and everything before it as the features. The data is split 70/30
    with a fixed random seed and five ``.pt`` files are written under the
    global ``PREFIX``: ``X_train_cnn_new``, ``X_test_cnn_new``,
    ``y_train_cnn_new``, ``y_test_cnn_new`` and ``X_T`` (the unlabelled test
    tensor, saved unchanged).

    Args:
        torch_series_train: tensor whose dimension 1 holds the feature rows
            followed by one label row.
        torch_series_test: tensor saved as-is to ``X_T.pt``.
    """
    SUFFIX = '.pt'
    RANDOM_SEED = 1234

    X = torch_series_train[:, :-1]
    # NOTE(review): y keeps the whole label row rather than a single scalar per
    # sample; consumers presumably read one column of it -- confirm downstream.
    y = torch_series_train[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_SEED)

    # Restore the per-sample shape from X itself rather than relying on a
    # global max_tot and a hard-coded embedding width.
    sample_shape = tuple(X.shape[1:])
    X_train = X_train.reshape(X_train.size(0), *sample_shape)
    X_test = X_test.reshape(X_test.size(0), *sample_shape)

    torch.save(X_train, PREFIX+'X_train_cnn_new'+SUFFIX)
    torch.save(X_test, PREFIX+'X_test_cnn_new'+SUFFIX)
    torch.save(y_train, PREFIX+'y_train_cnn_new'+SUFFIX)
    torch.save(y_test, PREFIX+'y_test_cnn_new'+SUFFIX)
    torch.save(torch_series_test, PREFIX+'X_T.pt')


In [19]:
# Longest tweet (in tokens) across training and test data; every tweet is padded to it
max_length_train = max(len(tweet) for tweet in combined_tweets)
max_length_test = max(len(tweet) for tweet in testing_tweets)
max_tot = max(max_length_train, max_length_test)

emb_dim = embeddings.shape[1]  # embedding width taken from the matrix, not hard-coded

series_train = torch.zeros((len(combined_tweets), max_tot+1, emb_dim)) # +1 for label
series_test = torch.zeros((len(testing_tweets), max_tot, emb_dim))
length_list = []

# add both negative and positive tweets, will be shuffled later.
# modified_load_tweets always writes from row 0 of the tensor it receives, so
# each class gets its own slice of series_train. Basic slicing returns a view,
# so the writes land in series_train itself and the second load no longer
# overwrites the first.
n_pos = len(p_tweets)
modified_load_tweets(p_tweets, series_train[:n_pos], max_tot, 1)
modified_load_tweets(n_tweets, series_train[n_pos:], max_tot, -1)

# no label since this is the prediction set
series_test = modified_load_tweets(testing_tweets, series_test, max_tot)

Loading tweets...
0/100000 (0 %)
1000/100000 (1 %)
2000/100000 (2 %)
3000/100000 (3 %)
4000/100000 (4 %)
5000/100000 (5 %)
6000/100000 (6 %)
7000/100000 (7 %)
8000/100000 (8 %)
9000/100000 (9 %)
10000/100000 (10 %)
11000/100000 (11 %)
12000/100000 (12 %)
13000/100000 (13 %)
14000/100000 (14 %)
15000/100000 (15 %)
16000/100000 (16 %)
17000/100000 (17 %)
18000/100000 (18 %)
19000/100000 (19 %)
20000/100000 (20 %)
21000/100000 (21 %)
22000/100000 (22 %)
23000/100000 (23 %)
24000/100000 (24 %)
25000/100000 (25 %)
26000/100000 (26 %)
27000/100000 (27 %)
28000/100000 (28 %)
29000/100000 (28 %)
30000/100000 (30 %)
31000/100000 (31 %)
32000/100000 (32 %)
33000/100000 (33 %)
34000/100000 (34 %)
35000/100000 (35 %)
36000/100000 (36 %)
37000/100000 (37 %)
38000/100000 (38 %)
39000/100000 (39 %)
40000/100000 (40 %)
41000/100000 (41 %)
42000/100000 (42 %)
43000/100000 (43 %)
44000/100000 (44 %)
45000/100000 (45 %)
46000/100000 (46 %)
47000/100000 (47 %)
48000/100000 (48 %)
49000/100000 (49 %)
50000

TypeError: ignored

In [25]:
# Split the labelled tensor into train/validation sets and write all tensors to Drive.
save(series_train, series_test)