In [77]:
# simple tweet class

import pandas as pd
import numpy as np
import re
import emoji

class Tweet:
    # initialize lazy-by-default instance
    # id and fit on init optional
    def __init__(self, text, tweetid="Undefined", fit_it=False):
        self.tweetid_ = tweetid
        self.text_ = text
        if fit_it == True:
            self.fit()
        else:
            self.is_fit_ = False

    # fit calls all methods, setting all attributes
    def fit(self):
        if self.is_fit_ == True:  # avoid refitting
            print(f"Tweet \'{self.tweetid_}\' already fit!")
        else:
            self.find_hashtags()  # make a list of hashtags
            self.find_handles()  # ...
            self.find_urls()
            self.find_emojis()
            self.clean()
            self.is_fit_ = True

    # make a list of hashtags
    def find_hashtags(self):
        hashtags = []
        try:
            hashtags.extend(re.findall(r"#\w+", self.text_))
        except:
            pass
        self.hashtags_ = hashtags

    # list of handles
    def find_handles(self):
        handles = []
        try:
            handles.extend(re.findall(r"@\w+", self.text_))
        except:
            pass
        self.handles_ = handles

    # list of urls
    def find_urls(self):
        urls = []
        # source: https://www.geeksforgeeks.org/python-check-url-string/
        urls = re.findall(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", self.text_)
        self.urls_ = urls

    def find_emojis(self):
        emojis = []
        try:
            emojis.extend(re.findall(emoji.get_emoji_regexp(), self.text_))
        except:
            pass
        self.emojis_ = emojis
        self.demojis_ = [emoji.demojize(e) for e in emojis]

    # remove special elements and provide as .text_clean_
    # indicate whether elements are woven into text (just in a word order sense for now)
    def clean(self):
        clean_text = {}
        split_text = dict(enumerate(self.text_.split()))
        special_elements = self.hashtags_ + self.handles_ + self.urls_ + self.emojis_
        for index, term in split_text.items():
            if term not in special_elements:
                clean_text.update({index: term})
        self.text_clean_ = " ".join(clean_text.values())

        # also check if any non-text elements are interwoven with text
        # not ideal to iterate twice but start/end are initially unknown (w/o rewrite of attr_ functions)
        self.is_complex_ = False
        clean_text_start = min(clean_text.keys())
        clean_text_end = max(clean_text.keys())
        for index, term in split_text.items():
            if term in special_elements and clean_text_start > index < clean_text_end:
                self.is_complex_ = True

    # simple status report
    def attributes(self):
        for k,v in sorted(vars(self).items()):
            print(f"{k}: {v}", end='\n')
        print()

In [78]:
# initialize some tweet instances
tweet_1 = Tweet("#This @tweetobject starts and ends with a hastag and handle and has #no @tweetid.")
tweet_2 = Tweet("This tweetobject has #complicatinginternalelements and a @tweetid. It also ends in a url https://t.co/example", tweetid=1234)

# for a touch of intrigue, we'll select an emoji at random
emojis = list(emoji.EMOJI_UNICODE.values())
randint = np.random.randint(0,len(emojis)-1)
tweet_3 = Tweet(f"This tweet has a random emoji: {emojis[randint]} ")

# drop them into a list
tweets = [tweet_1, tweet_2, tweet_3]

In [79]:
# check attributes on unfit tweet objects
# sort here just to compare text versions more easily
for tweet in tweets:
    tweet.attributes()

is_fit_: False
text_: #This @tweetobject starts and ends with a hastag and handle and has #no @tweetid.
tweetid_: Undefined

is_fit_: False
text_: This tweetobject has #complicatinginternalelements and a @tweetid. It also ends in a url https://t.co/example
tweetid_: 1234

is_fit_: False
text_: This tweet has a random emoji: 🗯 
tweetid_: Undefined



In [80]:
# fit tweets
[tweet.fit() for tweet in tweets];

In [81]:
# attempt to re-fit tweets
[tweet.fit() for tweet in tweets];

Tweet 'Undefined' already fit!
Tweet '1234' already fit!
Tweet 'Undefined' already fit!


In [82]:
for tweet in tweets:
    tweet.attributes()

demojis_: []
emojis_: []
handles_: ['@tweetobject', '@tweetid']
hashtags_: ['#This', '#no']
is_complex_: True
is_fit_: True
text_: #This @tweetobject starts and ends with a hastag and handle and has #no @tweetid.
text_clean_: starts and ends with a hastag and handle and has @tweetid.
tweetid_: Undefined
urls_: []

demojis_: []
emojis_: []
handles_: ['@tweetid']
hashtags_: ['#complicatinginternalelements']
is_complex_: False
is_fit_: True
text_: This tweetobject has #complicatinginternalelements and a @tweetid. It also ends in a url https://t.co/example
text_clean_: This tweetobject has and a @tweetid. It also ends in a url
tweetid_: 1234
urls_: ['https://t.co/example']

demojis_: [':right_anger_bubble:']
emojis_: ['🗯']
handles_: []
hashtags_: []
is_complex_: False
is_fit_: True
text_: This tweet has a random emoji: 🗯 
text_clean_: This tweet has a random emoji:
tweetid_: Undefined
urls_: []

