In [None]:
# If a package is missing, install it once with `%pip install <package name>` (run it in its own cell)
import os, re

import pandas as pd
import numpy as np

import spacy
from html import unescape
from emoji import UNICODE_EMOJI

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 

import pickle
import scipy.sparse as sparse

# Load in data

In [None]:
# Directory layout used throughout the notebook (all paths are relative to the working directory)
SENTIMENT140_DATA_DIR = "Sentiment140.data"  # the Sentiment140 data set is read from here
DG_DATA_DIR = "D_G data"                     # the D&G data set is read from here
OUTPUT_DIR = "output"                        # intermediate outputs and fitted models are written here
FIGURES_DIR = "figures"                      # figures are written here

In [None]:
# Create the output and figure directories.
# exist_ok=True makes the call a no-op when the directory is already there, which
# replaces the check-then-create pattern (no race between the check and the creation)
# and keeps the cell safe to re-run. If the path exists but is a regular file,
# os.makedirs raises FileExistsError instead of silently continuing.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(FIGURES_DIR, exist_ok=True)

In [None]:
# Sentiment140 training tweets. Column names are supplied explicitly, so pandas
# treats every row of the file (including the first) as data.
dftrain = pd.read_csv(
    os.path.join(SENTIMENT140_DATA_DIR, "training.1600000.processed.noemoticon.csv"),
    encoding="latin-1",
    names=["predictions", "id", "timestamp", "query", "user", "text"],
)

In [None]:
# Only use 100,000 of the 1,600,000 training tweets to save time in model training.
# The size guard makes this cell idempotent: on a fresh run the full frame is
# down-sampled exactly as before, while re-running the cell on the already-sampled
# frame leaves it untouched instead of reshuffling it (which would silently change
# the row order and therefore the later train/validation split).
N_TRAIN_TWEETS = 100000
if len(dftrain) > N_TRAIN_TWEETS:
    dftrain = dftrain.sample(N_TRAIN_TWEETS, random_state=42)

In [None]:
# Preview the first five rows of the training frame
dftrain.head(n=5)

In [None]:
# Sentiment140 manually labelled test tweets, read with the same column names
# as the training file (names are given explicitly, so no row is used as a header).
dftest = pd.read_csv(
    os.path.join(SENTIMENT140_DATA_DIR, "testdata.manual.2009.06.14.csv"),
    encoding="latin-1",
    names=["predictions", "id", "timestamp", "query", "user", "text"],
)

In [None]:
# Preview the first five rows of the test frame
dftest.head(n=5)

In [None]:
# D&G "chopsticks" mentions; the header row of the CSV provides the column names
df_chopsticks = pd.read_csv(
    os.path.join(DG_DATA_DIR, "dolcegabbana_chopsticks_mentions_daily_expanded.csv")
)

In [None]:
# Preview the first five rows of the chopsticks mentions
df_chopsticks.head(n=5)

In [None]:
# All D&G mentions. Only "\n" is treated as the line terminator when splitting
# the file into rows.
df_all = pd.read_csv(
    os.path.join(DG_DATA_DIR, "dolcegabbana_mentions_daily_all.csv"),
    lineterminator='\n',
)

In [None]:
# Preview the first five rows of all D&G mentions
df_all.head(n=5)

In [None]:
# Load the spaCy English pipeline used by the tokenizer.
# The tokenizer only reads token text, lemmas and the stop-word flag, so the
# dependency parser and the named-entity recognizer are disabled: they are not
# needed for those attributes and skipping them makes processing every tweet
# considerably cheaper.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# helper function for pre-processing/cleaning a tweet
def preprocessor(tweet):
    """Normalise the raw text of a tweet before it is tokenized.

    Steps, in order:
      1. decode HTML entities (e.g. ``&amp;`` -> ``&``, ``&#39;`` -> ``'``);
      2. replace every ``@mention`` with the placeholder ``_AT_USER_``;
      3. replace every ``#hashtag`` with the placeholder ``_HASHTAG_``;
      4. drop a leading ``RT`` (retweet marker) followed by whitespace;
      5. lowercase the result (so the placeholders end up as ``_at_user_``
         and ``_hashtag_`` in the returned text).

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercased tweet.
    """
    # Unescape FIRST: numeric entities such as "&#39;" contain a "#" followed by
    # digits, which the hashtag pattern below would otherwise rewrite to
    # "&_HASHTAG_;" before the entity could be decoded.
    tweet = unescape(tweet)
    tweet = re.sub(r'@[A-Za-z0-9_]+', '_AT_USER_', tweet)  # replace @X with _AT_USER_
    tweet = re.sub(r'#[A-Za-z0-9_]+', '_HASHTAG_', tweet)  # replace #X with _HASHTAG_
    tweet = re.sub(r'^RT[\s]+', '', tweet)  # remove RT (retweet) at the start of the tweet
    tweet = tweet.lower()  # make everything lowercase
    return tweet

# helper function for tokenization of a tweet
def tokenizer(tweet):
    """Split a (pre-processed) tweet into a list of lemmas.

    The tweet is run through the module-level spaCy pipeline ``nlp``. A token
    contributes its lemma to the result when either

    * its text consists only of ASCII letters, digits, "_" or "-", it is not a
      stop word, and its text is longer than two characters; or
    * its text is a key of ``UNICODE_EMOJI``.

    Note that the filter looks at the token *text*, while the value that is
    kept is the token *lemma*.

    NOTE(review): the emoji test assumes ``UNICODE_EMOJI`` is keyed directly by
    emoji characters -- confirm this holds for the installed ``emoji`` version.
    """
    lemmas = []
    for token in nlp(tweet):  # this processes the tweet text
        text = token.text
        is_plain_word = (
            re.match("^[a-zA-Z0-9_-]*$", text)
            and not token.is_stop
            and len(text) > 2
        )
        if is_plain_word or text in UNICODE_EMOJI:
            lemmas.append(token.lemma_)
    return lemmas

In [None]:
# Extract the 'text' column of each frame as a plain Python list (one entry per row)
corpus_chopsticks = df_chopsticks['text'].tolist()
corpus_all = df_all['text'].tolist()
corpus_test = dftest['text'].tolist()
corpus_train = dftrain['text'].tolist()

In [None]:
# Labels: the 'predictions' column of the Sentiment140 train and test frames, as lists
y_train, y_test = (list(frame['predictions']) for frame in (dftrain, dftest))

# Count vectorizer transformation

In [None]:
# Fit a bag-of-words model on the training tweets, using the custom preprocessor
# and tokenizer and keeping only the 2000 most frequent terms.
model = CountVectorizer(preprocessor=preprocessor, tokenizer=tokenizer, max_features=2000)
word_counts_train = model.fit_transform(corpus_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use the new method when it exists and fall
# back to the old one otherwise; list(...) keeps model_features a plain list of
# strings in both cases.
if hasattr(model, 'get_feature_names_out'):
    model_features = list(model.get_feature_names_out())
else:
    model_features = model.get_feature_names()

fitted_model = model  # alias used by the cells that transform the other corpora

In [None]:
def _count_words(corpus, label):
    """Apply the fitted count vectorizer to `corpus` and report completion for `label`."""
    counts = fitted_model.transform(corpus)
    print('count vectorizer completed on ' + label)
    return counts


# Re-use the vocabulary learned on the training tweets for every other corpus
word_counts_test = _count_words(corpus_test, 'corpus_test')

word_counts_chopsticks = _count_words(corpus_chopsticks, 'corpus_chopsticks')

word_counts_all = _count_words(corpus_all, 'corpus_all')

# TF-IDF transformation

In [None]:
# Learn the IDF weights from the training word counts, then re-weight those same counts
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(word_counts_train)
X_140 = tfidf_transformer.transform(word_counts_train)

In [None]:
def _to_tfidf(word_counts, label):
    """Re-weight `word_counts` with the fitted TF-IDF transformer and report completion for `label`."""
    weighted = tfidf_transformer.transform(word_counts)
    print('TF-IDF completed on ' + label)
    return weighted


# Apply the IDF weights learned on the training tweets to every other corpus
X_test = _to_tfidf(word_counts_test, 'corpus_test')

X_chopsticks = _to_tfidf(word_counts_chopsticks, 'corpus_chopsticks')

X_all = _to_tfidf(word_counts_all, 'corpus_all')

# Split Sentiment 140 data into train, validation, and test sets

In [None]:
# Hold out 10% of the Sentiment140 training sample as a validation set (seeded shuffle).
# NOTE(review): the count vectorizer and the TF-IDF weights were fitted on all of
# corpus_train before this split, so the validation rows already influenced the
# vocabulary and IDF values -- confirm this is acceptable for model selection.
X_train, X_validation, y_trainsmall, y_validation = train_test_split(
    X_140,
    y_train,
    test_size=0.10,
    random_state=42,
    shuffle=True,
)

In [None]:
# Sanity check: (rows, features) of the train, validation and test matrices, one per line
print(X_train.shape, X_validation.shape, X_test.shape, sep='\n')

# Save feature matrices, labels, and count vectorizer and TF-IDF models

In [None]:
# Fitted transformers, pickled without a file extension.
# The count vectorizer was built with the notebook-level `preprocessor` and
# `tokenizer` functions; pickle stores functions by reference, so both must be
# defined (or importable) wherever these files are loaded.
for _pickle_name, _estimator in (
    ('count_vectorizer', fitted_model),
    ('tfidf_transformer', tfidf_transformer),
):
    with open(os.path.join(OUTPUT_DIR, _pickle_name), 'wb') as f:
        pickle.dump(_estimator, f)

# vocabulary of the count vectorizer
np.save(os.path.join(OUTPUT_DIR, 'model_features.npy'), model_features)

# sparse matrices, in order: count vectorizer output, TF-IDF output, and the
# train/validation matrices produced by the split
for _matrix_name, _matrix in (
    ('word_counts_train', word_counts_train),
    ('word_counts_test', word_counts_test),
    ('word_counts_chopsticks', word_counts_chopsticks),
    ('word_counts_all', word_counts_all),
    ('X_140', X_140),
    ('X_test', X_test),
    ('X_chopsticks', X_chopsticks),
    ('X_all', X_all),
    ('X_train', X_train),
    ('X_validation', X_validation),
):
    sparse.save_npz(os.path.join(OUTPUT_DIR, _matrix_name + '.npz'), _matrix, compressed=True)

# labels
for _labels_name, _labels in (
    ('y_train', y_train),
    ('y_trainsmall', y_trainsmall),
    ('y_validation', y_validation),
    ('y_test', y_test),
):
    np.save(os.path.join(OUTPUT_DIR, _labels_name + '.npy'), _labels)