In [138]:
# simple tweet class

import pandas as pd
import numpy as np
import re
import emoji

class Tweet:
    # initialize lazy-by-default instance
    # id and fit on init optional
    def __init__(self, text, tweetid="Undefined", fit_it=False):
        self.tweetid_ = tweetid
        self.text_ = text
        if fit_it == True:
            self.fit()
        else:
            self.is_fit_ = False
        
    # fit calls all methods, setting all attributes
    def fit(self):    
        if self.is_fit_ == True: # avoid refitting 
            print("Already fit!")
        else:
            self.find_hashtags() # make a list of hashtags
            self.find_handles()  #...
            self.find_urls()
            self.find_emojis()
            self.clean()
            self.is_fit_ = True

        # self.is_retweet() # add later
    
    # make a list of hashtags
    def find_hashtags(self):
        hashtags = []
        try:
            hashtags.extend( re.findall(r"#\w+", self.text_) )
        except:
            pass
        self.hashtags_ = hashtags
    
    # list of handles
    def find_handles(self):
        handles = []
        try:
            handles.extend( re.findall(r"@\w+", self.text_) )
        except:
            pass
        self.handles_ = handles
    
    # list of urls
    def find_urls(self):
        # regex from https://www.geeksforgeeks.org/python-check-url-string/
        urls = []
        try:
            handles.extend( re.findall(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", self.text_) )
        except:
            pass
        self.urls_ = urls
        
    def find_emojis(self):
        emojis = []
        try:
            emojis.extend( re.findall( emoji.get_emoji_regexp(), self.text_) )
        except:
            pass
        self.emojis_ = emojis
        self.demojis_ = [emoji.demojize(e) for e in emojis]
    
    # remove special elements and provide as .text_clean_
    # indicate whether elements are woven into text (just in a word order sense for now)
    def clean(self):
        clean_text = {}
        split_text = dict(enumerate(self.text_.split()))
        non_text = self.hashtags_ + self.handles_ + self.urls_ + self.emojis_
        for index, term in split_text.items():
            if term not in non_text:
                clean_text.update({index:term})
        self.text_clean_ = " ".join(clean_text.values())
        
        # also check if any non-text elements are interwoven with text
        # not ideal to iterate twice but start/end are initially unknown (w/o rewrite of attr_ functions)
        self.is_complex_ = False
        clean_text_start = min(clean_text.keys())
        clean_text_end = max(clean_text.keys())
        for index, term in split_text.items():
            if term in non_text and clean_text_start > index < clean_text_end:
                self.is_complex_ = True                

In [139]:
# initialize some tweet instances
# for a touch of intrigue, we'll select an emoji at random
tweet_1 = Tweet("This tweet object has no tweetid.")
tweet_2 = Tweet("This tweet object has a tweetid", tweetid=1234)
tweet_3 = Tweet("#Thistweet starts with a hashgtag and ends with a https://t.co.qwerty")
tweet_4 = Tweet("This tweet has a #hashtagwithintext and @handle as well")
tweet_5 = Tweet(f"This tweet has a random emoji: {list(emoji.EMOJI_UNICODE.values())[np.random.randint(0,1000)]} ")

In [140]:
# drop them into a list
tweets = [tweet_1, tweet_2, tweet_3, tweet_4, tweet_5]

In [141]:
# check attributes on unfit tweet objects
# sort here just to compare text versions more easily
[sorted(vars(tweet).items()) for tweet in tweets]

[[('is_fit_', False),
  ('text_', 'This tweet object has no tweetid.'),
  ('tweetid_', 'Undefined')],
 [('is_fit_', False),
  ('text_', 'This tweet object has a tweetid'),
  ('tweetid_', 1234)],
 [('is_fit_', False),
  ('text_',
   '#Thistweet starts with a hashgtag and ends with a https://t.co.qwerty'),
  ('tweetid_', 'Undefined')],
 [('is_fit_', False),
  ('text_', 'This tweet has a #hashtagwithintext and @handle as well'),
  ('tweetid_', 'Undefined')],
 [('is_fit_', False),
  ('text_', 'This tweet has a random emoji: 👶🏽 '),
  ('tweetid_', 'Undefined')]]

In [142]:
# fit tweets
[tweet.fit() for tweet in tweets]

[None, None, None, None, None]

In [143]:
# check attributes on fitted tweets
[sorted(vars(tweet).items()) for tweet in tweets]

[[('demojis_', []),
  ('emojis_', []),
  ('handles_', []),
  ('hashtags_', []),
  ('is_complex_', False),
  ('is_fit_', True),
  ('text_', 'This tweet object has no tweetid.'),
  ('text_clean_', 'This tweet object has no tweetid.'),
  ('tweetid_', 'Undefined'),
  ('urls_', [])],
 [('demojis_', []),
  ('emojis_', []),
  ('handles_', []),
  ('hashtags_', []),
  ('is_complex_', False),
  ('is_fit_', True),
  ('text_', 'This tweet object has a tweetid'),
  ('text_clean_', 'This tweet object has a tweetid'),
  ('tweetid_', 1234),
  ('urls_', [])],
 [('demojis_', []),
  ('emojis_', []),
  ('handles_', []),
  ('hashtags_', ['#Thistweet']),
  ('is_complex_', True),
  ('is_fit_', True),
  ('text_',
   '#Thistweet starts with a hashgtag and ends with a https://t.co.qwerty'),
  ('text_clean_',
   'starts with a hashgtag and ends with a https://t.co.qwerty'),
  ('tweetid_', 'Undefined'),
  ('urls_', [])],
 [('demojis_', []),
  ('emojis_', []),
  ('handles_', ['@handle']),
  ('hashtags_', ['#hashta

In [144]:
[tweet.is_complex_ for tweet in tweets]

[False, False, True, False, False]

In [145]:
[tweet.urls_ for tweet in tweets]

[[], [], [], [], []]