# Scrape Tweets using Scweet

In [1]:
from Scweet.scweet import scrape
from Scweet.user import get_user_information, get_users_following, get_users_followers
# utilities
import re 
import pandas as pd 
import time
# nltk
import nltk 
nltk.download('stopwords')
nltk.download('punkt') 
nltk.download('wordnet') 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/odemunoogelohwohor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/odemunoogelohwohor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/odemunoogelohwohor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# run this in terminal
# pip install utils
# enter this directory: /Users/odemunoogelohwohor/Documents/GitHub/CMU_SPR23/11785-deeplearning/IDL-finance-project/Code/Scweet/Scweet
# !python scweet.py --words "\$ZM" --until 2022-12-31 --since 2020-01-01 --limit 100 --interval 1 --display_type Latest --lang="en" --headless False
# check file size: du -h AAPL_2016-01-01_2016-12-31.csv

# Clean the tweets

In [3]:
class TweetCleaner:
  def __init__(self):
    self.stop_words = set(stopwords.words('english'))
    self.emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':(': 'sad', 'XD': 'laughing',
          ':-(': 'sad', ':-<': 'sad', ':P': 'stuck-out-tongue', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          ':/': 'confused', ':|': 'neutral-face', ":'-)": 'sadsmile', "<3": 'love',
          ":'-)": 'tears-of-happiness'}

  def lowercase(self, tweet):
    ''' Each text is converted to lowercase. '''
    return tweet.lower()
  
  def replace_url(self, tweet):
    ''' Links starting with “Http” or “https” or “www” are replaced by “URL” '''
    url_regex = re.compile(r'(http[s]?://|www\.)\S+')
    return url_regex.sub('URL', tweet)

  def replace_emojis(self, tweet):
    '''Replace emojis by using a pre-defined dictionary containing emojis 
      along with their meaning. (e.g.: “:)” to “EMOJIsmile”) '''
    for emoji in self.emojis.keys():
      tweet = tweet.replace(emoji, "EMOJI" + self.emojis[emoji]) 
    return tweet

  def replace_username(self, tweet):
    ''' Replace @Usernames with the word “USER”. (e.g.: “@Kaggle” to “USER”)'''
    user_regex = re.compile(r'@[^\s]+')
    return user_regex.sub('USER', tweet)  

  def remove_nonalpha(self, tweet):
    ''' Replacing characters except Digits and Alphabets with space.'''
    nonalpha_regex = re.compile(r'[^a-zA-Z0-9]')
    return nonalpha_regex.sub(" ", tweet)
  
  def remove_consecutives(self, tweet):
    ''' 3 or more consecutive letters are 
        replaced by two letters. (e.g.: “Heyyyy” to “Heyy”) '''
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    return re.sub(sequencePattern, seqReplacePattern, tweet)

  def remove_stop_short_words(self, tweet):
    ''' English words that do not add much meaning to a sentence are removed
        and Words with a length of less than two are eliminated.'''
    words = nltk.word_tokenize(tweet)
    words = [word for word in words if word not in self.stop_words and len(word) >= 2]
    return ' '.join(words)

  def lemmatize(self, tweet):
    ''' Converting word to its base form. '''
    tweetwords = ''
    for word in tweet.split():
      word = WordNetLemmatizer().lemmatize(word)
      tweetwords += (word+' ')
    return tweetwords

  def clean_onetweet(self, tweet):
    ''' cleans one single tweet '''
    cleaned = self.lowercase(tweet)
    cleaned = self.replace_url(cleaned)
    cleaned = self.replace_emojis(cleaned)
    cleaned = self.replace_username(cleaned)
    cleaned = self.remove_nonalpha(cleaned)
    cleaned = self.remove_consecutives(cleaned)
    cleaned = self.remove_stop_short_words(cleaned)
    cleaned = self.lemmatize(cleaned)
    return cleaned

  def clean_alltweets(self, df):
    ''' cleans all tweets in the dataframe'''
    df['tweets_processed'] = df['tweet'].apply(self.clean_onetweet)
    df = df.drop(columns=['tweet'])
    df = df.rename(columns={'tweets_processed': 'tweet'})
    return df


In [4]:
# method for processing tweets
def process_tweet_dataframe(df):
  tweetCleaner = TweetCleaner()
  
  t = time.time()
  df_processed = tweetCleaner.clean_alltweets(df)
  print(f'Text Preprocessing complete.')
  print(f'Time Taken: {round(time.time()-t)} seconds')
  return df_processed

In [5]:
def get_tweets(scraped_tweets_path, output_file):
    #read the scraped tweets and timestamp from Scweet
    tweets_df = pd.read_csv(scraped_tweets_path)
    #rename columns to match train set
    tweets_df.rename(columns={'Embedded_text':'tweet','Timestamp':'Date'},inplace=True)
    tweets_df = tweets_df[['tweet','Date']]
    #Drop duplicates
    tweets_df.dropna(inplace=True)
    #re-format time to yyyy-mm-dd format
    tweets_df['Date'] = pd.to_datetime(tweets_df['Date'], format='%Y-%m-%d', errors='coerce')
    tweets_df['Date'] = tweets_df['Date'].dt.strftime('%Y-%m-%d')
    tweets_df = pd.DataFrame(data=tweets_df)
    #pre-process the old tweets dataframe and drop duplicates
    tweets_preprocessed = process_tweet_dataframe(tweets_df)
    tweets_preprocessed.drop_duplicates(inplace=True)
    #save the pre-processed old tweets
    tweets_preprocessed.to_csv(output_file)
    return tweets_preprocessed

In [8]:
tweets_preprocessed_df = get_tweets('zoom/tweets_$ZM_2020-01-01_2022-12-31.csv', 'zoom_tweets_preprocessed.csv')

Text Preprocessing complete.
Time Taken: 4 seconds


In [23]:
aapl_df = pd.read_csv('../Scweet/Scweet/outputs/AAPL_2016-01-01_2016-12-31.csv')

In [24]:
aapl_df.head()

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
0,,,2016-01-01T23:58:53.000Z,,,,,1.0,1.0,[],https://twitter.com/thomashorrobin/status/6830...
1,fr@nk,fr@nk,2016-01-01T23:55:56.000Z,"fr@nk\n@frankkigozi99\n·\nJan 1, 2016",The Best Apple Inc. Headlines in 2015: Apple (...,,,,,[],https://twitter.com/frankkigozi99/status/68307...
2,Dividend Income,@PassDivInc,2016-01-01T23:47:46.000Z,"Dividend Income\n@PassDivInc\n·\nJan 1, 2016",,,,,,[],https://twitter.com/PassDivInc/status/68307215...
3,,,2016-01-01T23:47:25.000Z,,,,,,,['https://pbs.twimg.com/media/CXrCbAVUAAAyeZx?...,https://twitter.com/MacHashNews/status/6830720...
4,,,2016-01-01T23:35:42.000Z,,,,,,,['https://pbs.twimg.com/media/CXq_u68WMAUtU5q?...,https://twitter.com/georgemcgown/status/683069...


In [25]:
aapl_df.rename(columns={'Embedded_text':'tweet','Timestamp':'Date'},inplace=True)

In [26]:
aapl_df.head()

Unnamed: 0,UserScreenName,UserName,Date,Text,tweet,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
0,,,2016-01-01T23:58:53.000Z,,,,,1.0,1.0,[],https://twitter.com/thomashorrobin/status/6830...
1,fr@nk,fr@nk,2016-01-01T23:55:56.000Z,"fr@nk\n@frankkigozi99\n·\nJan 1, 2016",The Best Apple Inc. Headlines in 2015: Apple (...,,,,,[],https://twitter.com/frankkigozi99/status/68307...
2,Dividend Income,@PassDivInc,2016-01-01T23:47:46.000Z,"Dividend Income\n@PassDivInc\n·\nJan 1, 2016",,,,,,[],https://twitter.com/PassDivInc/status/68307215...
3,,,2016-01-01T23:47:25.000Z,,,,,,,['https://pbs.twimg.com/media/CXrCbAVUAAAyeZx?...,https://twitter.com/MacHashNews/status/6830720...
4,,,2016-01-01T23:35:42.000Z,,,,,,,['https://pbs.twimg.com/media/CXq_u68WMAUtU5q?...,https://twitter.com/georgemcgown/status/683069...


In [27]:
aapl_df = aapl_df[['tweet','Date']]
aapl_df.head()

Unnamed: 0,tweet,Date
0,,2016-01-01T23:58:53.000Z
1,The Best Apple Inc. Headlines in 2015: Apple (...,2016-01-01T23:55:56.000Z
2,,2016-01-01T23:47:46.000Z
3,,2016-01-01T23:47:25.000Z
4,,2016-01-01T23:35:42.000Z


In [28]:
aapl_df.dropna(inplace=True)

In [29]:
aapl_df['Date'] = pd.to_datetime(aapl_df['Date'], format='%Y-%m-%d', errors='coerce')
# aapl_df['Date'] = aapl_df['Date'].dt.strftime('%Y-%m-%d')

In [30]:
aapl_df.head()

Unnamed: 0,tweet,Date
1,The Best Apple Inc. Headlines in 2015: Apple (...,NaT
7,Trending Tech Stocks | AAPL | NEON | PLUG | BB...,NaT
8,"""iPhone fitness apps to help you keep that New...",NaT
9,$aapl selling 1.2 mill iphones in India in F16...,NaT
10,How to trade when a stock you are holding gaps...,NaT
