# Preprocess

In [None]:
!pip install sentence-transformers umap nltk emoji==0.6.0

In [24]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

from Preprocess import discrete_hour, normalize_tweet, encode_bertweet, encode_user_description, encode_user_covariates

tqdm.pandas()
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# For every domain: load the filtered tweets, encode text / user description /
# user covariates, and pickle the resulting arrays together with T and Y under
# pickles/encodings/<domain>/.
domains = ['music', 'politics', 'health', 'technology']
for domain in domains:
    print(f'Preprocessing {domain} data . . .')

    # Create the output directory *before* the slow encoding passes, so a missing
    # folder cannot throw away a finished run at the first `open(..., 'wb')`.
    out_dir = Path('pickles/encodings') / domain
    out_dir.mkdir(parents=True, exist_ok=True)

    df = pd.read_pickle(f'pickles/{domain}_tweets_filtered.pkl')
    df['normalized_text'] = df['text'].apply(normalize_tweet)

    # Each progress_apply yields a Series of per-row encodings; np.vstack stacks
    # them into a single array per feature set.
    X_text = np.vstack(df['normalized_text'].progress_apply(encode_bertweet))
    X_user_description = np.vstack(df['user_description'].progress_apply(encode_user_description))
    # Row-wise apply: the encoder receives the whole row (passed directly, no lambda wrapper needed).
    X_user_covariates = np.vstack(df.progress_apply(encode_user_covariates, axis=1))

    # NOTE(review): T is the scaled favorite count and Y the discretised hour.
    # If T/Y are meant as treatment/outcome these may be swapped — confirm
    # against the code that consumes these pickles before changing anything.
    T = df['scaled_favorite_count']
    Y = df['hour'].apply(discrete_hour)

    # File stem -> object; written as pickles/encodings/<domain>/<stem>.pkl
    outputs = {
        'X_text': X_text,
        'X_user_description': X_user_description,
        'X_user_covariates': X_user_covariates,
        'T': T,
        'Y': Y,
    }
    for name, obj in outputs.items():
        with open(out_dir / f'{name}.pkl', 'wb') as f:
            pickle.dump(obj, f)

Preprocessing music data . . .


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 9035/9035 [32:29<00:00,  4.64it/s]
 42%|████████████████████████████████████████████                                                             | 3792/9035 [26:54<22:33,  3.87it/s]

In [None]:
# Sanity check: reload the pickled text encodings for the music domain
# and display the shape of the loaded object.
with open('pickles/encodings/music/X_text.pkl', 'rb') as fh:
    X = pickle.load(fh)
X.shape