In [1]:
import subprocess

import pandas as pd
import numpy as np

from nltk.tokenize import TweetTokenizer

from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Run the project's download script (presumably fetches the GloVe vectors and
# the tweet corpus read by later cells -- confirm against download.sh).
# The script's own output is silenced to keep the notebook output short.
download_result = subprocess.run(['bash', 'download.sh'],
                                 stdout = subprocess.DEVNULL,
                                 stderr = subprocess.DEVNULL)

# Because stdout/stderr are discarded, the exit status is the only failure
# signal left; surface it instead of letting a failed run pass unnoticed.
# A warning (rather than an exception) keeps the step best-effort, so the
# notebook still runs when the files are already present locally.
if download_result.returncode != 0:
    print('WARNING: download.sh exited with status', download_result.returncode)

download_result

CompletedProcess(args=['bash', 'download.sh'], returncode=1)

In [3]:
# Number of numeric fields expected after the token on every line of the file
EMBEDDING_DIM = 25

# Token -> vector table; indexing an unknown token yields an all-zero vector
embeddings = defaultdict(lambda: np.zeros(EMBEDDING_DIM))

# The encoding is stated explicitly so decoding does not depend on the
# platform's default (assumes the GloVe file is UTF-8 -- TODO confirm).
with open('glove/glove.twitter.27B.25d.txt', 'r', encoding = 'utf-8') as f:
    # Stream the file line by line instead of materialising it with readlines()
    for line in f:
        # Peel exactly EMBEDDING_DIM fields off the right-hand side; whatever
        # remains on the left is the token, even if it contains whitespace.
        fields = line.rsplit(None, EMBEDDING_DIM)

        # Blank or malformed lines (wrong number of fields) are skipped rather
        # than stored as a vector of the wrong length.
        if len(fields) != EMBEDDING_DIM + 1:
            continue

        embeddings[fields[0]] = np.array(fields[1:]).astype(np.float32)

In [6]:
# Sampling parameters: fixed size and seed so the subset is reproducible
sample_size = 5000
sample_seed = 1

# The raw file has no header row; read it, then keep a random subset of rows
tweets = pd.read_csv('tweets/tweets.csv', header = None, encoding = 'latin1')
tweets = tweets.sample(n = sample_size, random_state = sample_seed)

# Name the six positional columns
tweets.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Recode the label column: 4 becomes 1, 0 stays 0 (any other value becomes NaN)
label_map = {0: 0, 4: 1}
tweets = tweets.assign(target = tweets['target'].map(label_map))

# Persist the sampled subset without the index column
tweets.to_csv('tweets/tweets_5k.csv', index = False)

tweets.head()

Unnamed: 0,target,id,date,flag,user,text
514293,0,2190584004,Tue Jun 16 03:08:48 PDT 2009,NO_QUERY,Vicki_Gee,i miss nikki nu nu already shes always there ...
142282,0,1881451988,Fri May 22 04:42:15 PDT 2009,NO_QUERY,PatCashin,So I had a dream last night. I remember a sig...
403727,0,2058252964,Sat Jun 06 14:34:17 PDT 2009,NO_QUERY,deelectable,@girlyghost ohh poor sickly you (((hugs)) ho...
649503,0,2237307600,Fri Jun 19 05:34:22 PDT 2009,NO_QUERY,justinekepa,it is raining again
610789,0,2224301193,Thu Jun 18 09:20:06 PDT 2009,NO_QUERY,cmatt007,@MissKeriBaby wish I was in LA right now


In [7]:
# Tokenizer built with NLTK's default TweetTokenizer settings
tokenizer = TweetTokenizer()

# Split every tweet's text into a list of tokens, stored in a new column
tokenize_tweet = tokenizer.tokenize
tweets['tokens'] = tweets['text'].apply(tokenize_tweet)

In [8]:
def calculate_tweet_embedding(tokens, lookup = None, dim = 25):
    """Sum the embedding vectors of ``tokens`` into one vector.

    Parameters
    ----------
    tokens : sequence of str
        Tokens of a single tweet.
    lookup : mapping of str -> np.ndarray, optional
        Token-to-vector table. Defaults to the notebook-level ``embeddings``.
    dim : int, optional
        Length of the zero vector used for unknown tokens and for empty
        token lists. Defaults to 25.

    Returns
    -------
    np.ndarray
        Element-wise sum of the token vectors; a zero vector of length
        ``dim`` when ``tokens`` is empty.

    NOTE(review): tokens are looked up exactly as given. If the vocabulary
    is lowercase-only, mixed-case tokens fall back to the zero vector --
    confirm whether tokens should be lowercased before lookup.
    """
    if lookup is None:
        lookup = embeddings

    zero_vector = np.zeros(dim)

    # np.vstack raises ValueError on an empty list, so handle it up front
    if len(tokens) == 0:
        return zero_vector

    # .get() avoids inserting unknown tokens into a defaultdict-backed table,
    # which plain indexing would do as a side effect of every miss
    vectors = [lookup.get(token, zero_vector) for token in tokens]

    return np.sum(np.vstack(vectors), axis = 0)

# Apply calculate_tweet_embedding to each row's token list and store the
# resulting vector in a new 'embeddings' column.
tweets['embeddings'] = tweets['tokens'].apply(calculate_tweet_embedding)

In [9]:
# Expand the per-tweet vectors into a frame: one row per tweet, one column
# per vector component. Labels come from the 'target' column.
features = pd.DataFrame(tweets['embeddings'].tolist())
labels = tweets['target']

# 80/20 split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size = 0.2,
    random_state = 1,
)

# The scaler is fitted on the training rows only; the test rows are
# transformed with those same fitted statistics.
sc = StandardScaler()

x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)

In [10]:
# Destination file -> data to write; every file is written without index
# or header row.
split_outputs = {
    'cleaned/x_train.csv': x_train_scaled,
    'cleaned/x_test.csv': x_test_scaled,
    'cleaned/y_train.csv': y_train,
    'cleaned/y_test.csv': y_test,
}

for csv_path, split_values in split_outputs.items():
    pd.DataFrame(split_values).to_csv(csv_path, index = False, header = False)