# Tweets preprocessing script

## Configuration

In [None]:
# Directory holding the project's importable modules
SRC_DIRECTORY = "/content/drive/My Drive/project/src"

# Input (raw) and output (preprocessed) dataset directories
DATASET_RAW_TWEETS_DIRECTORY = "/content/drive/My Drive/project/dataset/raw/tweets"
DATASET_PREPROCESSED_TWEETS_DIRECTORY = "/content/drive/My Drive/project/dataset/preprocessed/tweets"


def _account_paths(account, with_pics=False):
  """Return the input/output CSV paths used for one Twitter account.

  The "save_eda_pics" entry is only present when `with_pics` is true.
  """
  paths = {
    "raw": DATASET_RAW_TWEETS_DIRECTORY + "/" + account + ".csv",
    "save_eda": DATASET_PREPROCESSED_TWEETS_DIRECTORY + "/" + account + "_eda.csv",
  }
  if with_pics:
    paths["save_eda_pics"] = (
      DATASET_PREPROCESSED_TWEETS_DIRECTORY + "/" + account + "_eda_pics.csv"
    )
  paths["save_model"] = (
    DATASET_PREPROCESSED_TWEETS_DIRECTORY + "/" + account + "_model.csv"
  )
  return paths


# One entry per account to preprocess
PREPROCESS_CONFIG = [
  _account_paths("realDonaldTrump", with_pics=True),
  _account_paths("POTUS"),
  _account_paths("WhiteHouse"),
]

## Import required modules

In [None]:
import sys
sys.path.insert(0, SRC_DIRECTORY)

import numpy as np
import pandas as pd
import utils.datetime

## Preprocessing functions
### Util functions

In [None]:
def read_tweets_file(file_path):
  """Load a raw tweets CSV and blank out missing optional text fields.

  Args:
    file_path: Path of the CSV file, passed straight to `pd.read_csv`.

  Returns:
    The loaded DataFrame, with missing values in the 'to', 'geo',
    'mentions' and 'hashtags' columns replaced by empty strings.

  Raises:
    KeyError: If one of those four columns is absent from the file.
  """
  df = pd.read_csv(file_path)
  # Assign the filled column back rather than calling
  # `df[col].fillna(..., inplace=True)`: the latter is chained in-place
  # assignment, which pandas deprecates and which leaves `df` untouched
  # once Copy-on-Write is enabled.
  for column in ('to', 'geo', 'mentions', 'hashtags'):
    df[column] = df[column].fillna('')
  return df

### Preprocessing function for Exploratory Data Analysis

Expected columns in DataFrame:

[(0, 'date'), (1, 'username'), (2, 'to'), (3, 'replies'), (4, 'retweets'), (5, 'favorites'), (6, 'text'), (7, 'geo'), (8, 'mentions'), (9, 'hashtags'), (10, 'id'), (11, 'permalink')]

In [None]:
import re
def filter_tweet_links_mentions_hashtags(tweet_text):
  """Remove mentions, links, hashtags and ellipsis characters from a tweet.

  Deleted spans: `@...` mentions, `http://` / `https://` URLs,
  `pic.twitter.com...` links, the "…" character and `#...` hashtags.
  Only the ends of the result are stripped; whitespace left between the
  remaining words is kept as is.

  Args:
    tweet_text: Raw tweet text.

  Returns:
    The cleaned text (possibly an empty string).
  """
  # The dots of "pic.twitter.com" are escaped so they only match a literal
  # "." and not any character.
  return re.sub(
    r"@\S+|https?://\S+|pic\.twitter\.com\S+|…|#\S+", "", tweet_text
  ).strip()

def filter_tweet_links(tweet_text):
  """Remove links and ellipsis characters from a tweet, keeping mentions and hashtags.

  Deleted spans: `http://` / `https://` URLs, `pic.twitter.com...` links
  and the "…" character. Only the ends of the result are stripped.

  Args:
    tweet_text: Raw tweet text.

  Returns:
    The cleaned text (possibly an empty string).
  """
  # Dots are escaped to match a literal "."; the former trailing "|" (an
  # empty alternative that matched at every position) has been dropped.
  return re.sub(
    r"https?://\S+|pic\.twitter\.com\S+|…", "", tweet_text
  ).strip()

def remove_bad_tokens(tweet_text):
  """Delete every occurrence of the substrings "pic", "https" and "http".

  Args:
    tweet_text: Text to clean (used on the mentions and hashtags fields).

  Returns:
    The text with those substrings removed.
  """
  # "https" must be removed before "http"; the other way round the "http"
  # pass turns "https" into a stray "s" and the "https" pass never matches.
  return tweet_text.replace("pic", "").replace("https", "").replace("http", "")

In [None]:
def preprocess_raw_tweets_eda(tweets_df):
  """Build the Exploratory Data Analysis table from raw tweets.

  Columns of `tweets_df` are read by position: 0 date, 2 reply-to,
  3 replies, 4 retweets, 5 favorites, 6 text, 8 mentions, 9 hashtags.
  Tweets whose text is empty after removing links, mentions and hashtags
  are dropped.

  Args:
    tweets_df: Raw tweets DataFrame with the column layout above.

  Returns:
    A DataFrame with the columns 'year', 'month', 'day', 'hour', 'minute',
    'text', 'reply_to', 'replies', 'retweets', 'favorites', 'mentions'
    and 'hashtags'. It is empty (but has these columns) when no tweet is kept.
  """
  eda_columns = [
    'year', 'month', 'day', 'hour', 'minute', 'text',
    'reply_to', 'replies', 'retweets', 'favorites',
    'mentions', 'hashtags'
  ]

  filtered_tweets = []

  for tweet in tweets_df.values:
    # NOTE(review): the date string is parsed with a "+0000" offset and then
    # converted to America/New_York — presumably the raw dates are UTC; confirm
    # against utils.datetime.
    dt = utils.datetime.parse_datetime_str(
      tweet[0], "%Y-%m-%d %H:%M:%S", "+0000"
    )
    dt = utils.datetime.convert_datetime_timezone(dt, "America/New_York")

    tweet_text_filtered = filter_tweet_links_mentions_hashtags(tweet[6])
    if not tweet_text_filtered:
      continue

    filtered_tweets.append([
      dt.year,
      dt.month,
      dt.day,
      dt.hour,
      dt.minute,
      # Text
      tweet_text_filtered,
      # Reply to (empty string if it's an original tweet)
      tweet[2],
      # Replies
      tweet[3],
      # Retweets
      tweet[4],
      # Favorites
      tweet[5],
      # Mentions
      remove_bad_tokens(tweet[8]),
      # Hashtags
      remove_bad_tokens(tweet[9])
    ])

  # Build the frame from the row list directly: going through np.asarray
  # coerced every value of the mixed-type rows to a string and raised a
  # ValueError when no tweet was kept.
  return pd.DataFrame(filtered_tweets, columns=eda_columns)

def extract_pics(tweets_df):
  """Collect the tweets whose text contains a "pic.twitter.com" link.

  Columns of `tweets_df` are read by position: 5 favorites, 6 text.

  Args:
    tweets_df: Raw tweets DataFrame with the column layout above.

  Returns:
    A DataFrame with the columns 'text' (the unmodified tweet text) and
    'favorites'. It is empty (but has these columns) when no tweet matches.
  """
  pics = [
    (tweet[6], tweet[5])
    for tweet in tweets_df.values
    if "pic.twitter.com" in tweet[6]
  ]

  # Build the frame from the row list directly: going through np.asarray
  # turned the favorites counts into strings and raised a ValueError when
  # no tweet matched.
  return pd.DataFrame(pics, columns=["text", "favorites"])

### Preprocessing data for model

In [None]:
def preprocess_raw_tweets_model(tweets_df):
  """Build the (timestamp, text) table used as model input.

  Columns of `tweets_df` are read by position: 0 date, 2 reply-to, 6 text.
  Only tweets whose reply-to field is an empty string are kept, and tweets
  whose text is empty after removing links are dropped. The array shape is
  printed before and after the reply filter.

  Args:
    tweets_df: Raw tweets DataFrame with the column layout above.

  Returns:
    A DataFrame with the columns 'timestamp' and 'text'. It is empty (but
    has these columns) when no tweet is kept.
  """
  tweets = tweets_df.values
  print(tweets.shape)

  # Keep original tweets only (reply-to field is empty)
  tweets = tweets[tweets[:, 2] == ""]
  print(tweets.shape)

  filtered_tweets = []

  for tweet in tweets:
    # NOTE(review): parsed with a "+0000" offset — presumably the raw dates
    # are UTC; confirm against utils.datetime.
    dt = utils.datetime.parse_datetime_str(
      tweet[0], "%Y-%m-%d %H:%M:%S", "+0000"
    )
    timestamp = utils.datetime.datetime_to_timestamp(dt)

    tweet_text_filtered = filter_tweet_links(tweet[6])
    if not tweet_text_filtered:
      continue

    filtered_tweets.append([
      # Timestamp
      timestamp,
      # Text
      tweet_text_filtered
    ])

  # Build the frame from the row list directly: going through np.asarray
  # turned the timestamps into strings and raised a ValueError when no
  # tweet was kept.
  return pd.DataFrame(filtered_tweets, columns=["timestamp", "text"])

## Preprocess the data

### Exploratory Data Analysis

In [None]:
# For every configured account: build and save the EDA table, then, when a
# "save_eda_pics" path is configured, build and save the pictures table too.
for p in PREPROCESS_CONFIG:
  raw_path = p['raw']
  print(raw_path)

  tweets_df = read_tweets_file(raw_path)

  tweets = preprocess_raw_tweets_eda(tweets_df)
  print("tweets")
  print(tweets.describe())
  tweets.to_csv(p["save_eda"])

  if "save_eda_pics" not in p:
    # No pictures output requested for this account
    print()
    continue

  pics = extract_pics(tweets_df)
  pics.to_csv(p["save_eda_pics"])
  print("pics")
  print(pics.describe())

  print()

### Model

In [None]:
# For every configured account: build the model table and save it without
# the index column.
for p in PREPROCESS_CONFIG:
  raw_path = p['raw']
  print(raw_path)

  tweets_df = read_tweets_file(raw_path)
  tweets = preprocess_raw_tweets_model(tweets_df)

  print("tweets")
  summary = tweets.describe()
  print(summary)

  tweets.to_csv(p["save_model"], index=False)

  print()