# 1.1 Imports

In [1176]:
import pandas as pd
import re
import ast
from string import punctuation
import emoji
from googletrans import Translator, LANGUAGES
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

In [1177]:
# Load the tokenizer and the sequence-classification model for the English
# sentiment checkpoint "siebert/sentiment-roberta-large-english".
tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")

In [1178]:
# Build an Arabic text-classification (sentiment) pipeline and try it on one sentence.
from transformers import pipeline
sa = pipeline('text-classification', model='CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment')
# The sample sentence is Arabic for "I am not well".
sentences = 'أنا لست بخير'
sa(sentences)


[{'label': 'negative', 'score': 0.613193690776825}]

In [1179]:
# English sentiment pipeline; it names the same checkpoint as the tokenizer/model loaded earlier in this notebook.
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")

# 1.2 Constants

In [1180]:
# Names of the placeholder tokens used by the preprocessing functions in this notebook,
# which substitute '[EMOJI]', '[MENTION]', '[HASHTAG]' and '[URL]' into the tweet text.
TOKENS = ['EMOJI', 'MENTION', 'HASHTAG', 'URL']

In [1181]:
# Contraction -> expansion pairs (all lower case). Spellings are listed without an
# apostrophe, with a straight apostrophe (') and with a typographic one (’).
# The list is converted to a dict below; keys are matched as whole, space-delimited words.
CONTRACTIONS = [
    ("dont", "do not"), ("didnt", "did not"), ("im", "i am "), ("arent", "are not"),
    ("isnt", "is not"), ("ive", "i have"), ("theyd", "they would"), ("id", "i would"),
    ("it's", 'it is'), ('i’m', 'i am'), ('don’t', 'do not'), ("cant", "cannot"),
    ("wont", "will not"), ("wouldnt", "would not"), ("hasnt", "has not"),
    ("havent", "have not"), ("hadnt", "had not"), ("don't", "do not"),
    ("didn’t", "did not"), ("i'm", "i am"), ("aren't", "are not"),
    ("aren’t", "are not"), ("isn't", "is not"), ("isn’t", "is not"), ("i’ve", "i have"),
    ("they'd", "they would"), ("they’d", "they would"), ("i'd", "i would"),
    ("i’d", "i would"), ('it’s', 'it is'), ("can't", "cannot"), ("can’t", "cannot"),
    ("won't", "will not"), ("won’t", "will not"), ("wouldn't", "would not"),
    ("wouldn’t", "would not"), ("hasn't", "has not"), ("hasn’t", "has not"),
    ("haven't", "have not"), ("haven’t", "have not"), ("hadn't", "had not"),
    ("hadn’t", "had not"), ('needn\'t', 'need not'),
    # Apostrophe variants that were missing although their counterpart was listed.
    ("didn't", "did not"), ("i've", "i have"), ("needn’t", "need not"),
]

# Whitespace and dash-like characters that are replaced by a single space.
NON_SPACED_CONTRACTIONS = [('\t', ' '), ('\n', ' '), ('-', ' '),
('–', ' '), ('…', ' '), ('—', ' ')]

NON_SPACED_CONTRACTIONS = dict(NON_SPACED_CONTRACTIONS)
CONTRACTIONS = dict(CONTRACTIONS)

In [1182]:
# Texting shorthand -> replacement text (keys are lower case).
# Ordinals and time units map to plain words; every other entry is wrapped in an
# '[ABBREVIATION: ...]' marker.
# NOTE(review): clean_text replaces '/' with a space and process_word collapses trailing
# repeated letters before this lookup runs, so '24/7' and 'bff' look unreachable - confirm.
TEXTING_LANGUAGE = {
    '1st': 'first',
    '2nd': 'second',
    '3rd': 'third',
    'sec': 'second',
    'secs': 'seconds',
    'hrs': 'hours',
    'hr': 'hour',
    'mins': 'minutes',
    'min': 'minute',
    '24/7': 'twenty four seven',
    'afaik': '[ABBREVIATION: as far as I know]',
    'b4': '[ABBREVIATION: before]',
    'bday': '[ABBREVIATION: birthday]',
    'bff': '[ABBREVIATION: best friends forever]',
    'brb': '[ABBREVIATION: be right back]',
    'btw': '[ABBREVIATION: by the way]',
    'cul8r': '[ABBREVIATION: see you later]',
    'fomo': '[ABBREVIATION: fear of missing out]',
    'gtg': '[ABBREVIATION: got to go]',
    'hmu': '[ABBREVIATION: hit me up]',
    'idr': '[ABBREVIATION: I do not remember]',
    'idk': '[ABBREVIATION: I do not know]',
    'idrk': '[ABBREVIATION: I do not really know]',
    'idc': '[ABBREVIATION: I do not care]',
    # Fixed: this marker used to be misspelled "[ABBERVIATION: ...]".
    'idts': '[ABBREVIATION: I do not think so]',
    'idt': '[ABBREVIATION: I do not think]',
    'ily': '[ABBREVIATION: I love you]',
    'ily2': '[ABBREVIATION: I love you too]',
    'ilysm': '[ABBREVIATION: I love you so much]',
    'imho': '[ABBREVIATION: in my humble opinion]',
    'imo': '[ABBREVIATION: in my opinion]',
    'irl': '[ABBREVIATION: in real life]',
    'jk': '[ABBREVIATION: just kidding]',
    'kms': '[ABBREVIATION: kill myself]',
    'kys': '[ABBREVIATION: kill yourself]',
    'lol': '[ABBREVIATION: laughing out loud]',
    'lmfao': '[ABBREVIATION: laughing my a** off]',
    'lmk': '[ABBREVIATION: let me know]',
    'mcm': '[ABBREVIATION: man crush Monday]',
    'nvm': '[ABBREVIATION: never mind]',
    'ok': '[ABBREVIATION: okay]',
    'omg': '[ABBREVIATION: oh my god]',
    'pls': '[ABBREVIATION: please]',
    'plz': '[ABBREVIATION: please]',
    'rofl': '[ABBREVIATION: rolling on the floor laughing]',
    'smh': '[ABBREVIATION: shaking my head]',
    'tbh': '[ABBREVIATION: to be honest]',
    'tbf': '[ABBREVIATION: to be fair]',
    'tmrrw': '[ABBREVIATION: tomorrow]',
    'tmrw': '[ABBREVIATION: tomorrow]',
    'tbt': '[ABBREVIATION: throwback Thursday]',
    'thx': '[ABBREVIATION: thanks]',
    'ttyl': '[ABBREVIATION: talk to you later]',
    'tmi': '[ABBREVIATION: too much information]',
    'tyt': '[ABBREVIATION: take your time]',
    'wtf': '[ABBREVIATION: what the f***]',
    'wth': '[ABBREVIATION: what the hell]',
    'wyd': '[ABBREVIATION: what are you doing]',
    'yw': '[ABBREVIATION: you are welcome]',
    '2moro': '[ABBREVIATION: tomorrow]',
    '2nite': '[ABBREVIATION: tonight]',
    'g2g': '[ABBREVIATION: got to go]',
    'hbd': '[ABBREVIATION: happy birthday]',
    'fr': '[ABBREVIATION: for real]',
    'uni': '[ABBREVIATION: university]',
    'wcw': '[ABBREVIATION: woman crush Wednesday]',
    'srsly': '[ABBREVIATION: seriously]',
    'nbd': '[ABBREVIATION: no big deal]',
    'roflmao': '[ABBREVIATION: rolling on the floor laughing my a** off]',
    'sry': '[ABBREVIATION: sorry]',
    'tgif': '[ABBREVIATION: thank goodness it\'s Friday]',
    'wbu': '[ABBREVIATION: what about you]',
}

In [1183]:
# Words removed from the cleaned text by clean_text, which deletes each entry when it
# appears as a whole, space-delimited word. The list mixes English function words,
# transliterated (Latin-script) words and a few Arabic conjunctions.
# Several entries appear more than once (e.g. 'she', 'he', 'to', 'for'); clean_text walks
# the list in order, so a repeated entry is simply applied a second time.
# NOTE(review): the entry ' a ' carries its own surrounding spaces, unlike every other
# entry (a plain 'a' is also listed further down) - confirm whether it is intentional.
STOPWORDS = [
    'like', 'she', 'he', 'him', 'her', 'the', 'this', 'that', 'those',
    'and', 'or', 'but', 'not', 'in', 'on', 'at', 'with', 'by', 'for', 'to',
    'of', 'an', 'is', 'was', 'were', 'am', 'are', 'be', 'been', 'being',
    'it', 'its', 'you', 'your', 'yours', 'they', 'them', 'their', 'theirs',
    'we', 'us', 'our', 'ours', 'me', 'my', 'mine', 'myself', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'itself', 'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself', 'they', 'them', 'their', 'theirs', 'themselves',
    'out', 'enta', 'enty', ' a ', 'zy', 'zay', 'keda', 'to', 'from', 'wallah',
    'wallahy', 'wallahi', 'wehyat', 'so', 'for', 'yet', 'tab', 'el', 'al', 'bas',
    'ashan', '3ashan', 'mashi', 'mashy', 'a', 'got', 'get', 'went', 'goes', 'go', 'i',
    'before', 'after', 'because', 'cause', 'cuz', 'cos', 'howa', 'heya', 'hwa', 'homa', 'hya',
    'tamam', 'tayeb', 'او', 'أو', 'و'
]

In [1184]:
# Character sequences that clean_text replaces with a space: every ASCII punctuation
# character from string.punctuation plus typographic and Arabic punctuation.
# The Arabic comma used to be listed only with a leading space (' ،'), which missed a
# comma written directly after a word; the bare '،' is now included as well.
UNNECESSARY_CHARACTERS = set(punctuation) | {
    '“', '‘', '”', '’', '...', '—', '–', '؟', '…',
    '؛', ' ،', '،', '≠', '≤', '≥', '«', '»', 'ـ', '٪',
}

# 2. Read Data

In [1185]:
# Load the three input tables from CSV files in the working directory.
circle_tweets_df = pd.read_csv('circle_tweet_content.csv')
likes_df = pd.read_csv('likes.csv')
tweets_df = pd.read_csv('tweet_content.csv')

# 3. Defining Functions

In [1186]:
def convert_string_to_list(input_string):
    '''
    Parse a bracketed, comma-separated string into a list of lower-cased items.
    ---
    Parameters:
        input_string: str
            Text such as "['Alice', 'Bob']"; square brackets and single quotes are discarded.
    ---
    Returns:
        output_list: list
            The pieces obtained by splitting on ', ', or an empty list when nothing is
            left once the brackets and quotes are gone.
    '''
    # Delete '[', ']' and "'" in a single pass, then normalise the case
    stripped = input_string.translate(str.maketrans('', '', "[]'")).lower()

    # An empty remainder means there were no items at all
    if stripped == '':
        return []

    return stripped.split(', ')

In [1187]:
def find_url_add_token(data):
    '''
    This function takes in a dataframe and replaces every URL in its 'content' column
    with the ' [URL] ' token. The dataframe is modified in place.
    ---
    Parameters:
        data: pandas dataframe
            Must have a 'content' column of strings. A 'url' column is added holding the
            first URL found in each row (None when the row has no URL).
    ---
    Returns:
        None
    '''
    url_pattern = re.compile(r'https?://\S+')

    data['url'] = None
    for i, content in data['content'].items():
        urls = url_pattern.findall(content)
        if not urls:
            continue
        data.at[i, 'url'] = urls[0]
        # Substitute by pattern so that every URL is tokenised, not only the first one
        data.at[i, 'content'] = url_pattern.sub(' [URL] ', content)

In [1188]:
def get_tweets(df):
    '''
    Select the rows whose 'tweet/retweet' flag equals 0 (tweets).
    ---
    Parameters:
        df: pandas dataframe
            Must have a 'tweet/retweet' column.
    ---
    Returns:
        filtered_df: pandas dataframe
            An independent copy of the matching rows.
    '''
    is_tweet = df['tweet/retweet'].eq(0)
    return df.loc[is_tweet].copy()

In [1189]:
def get_retweets(df):
    '''
    Select the rows whose 'tweet/retweet' flag equals 1 (retweets).
    ---
    Parameters:
        df: pandas dataframe
            Must have a 'tweet/retweet' column.
    ---
    Returns:
        filtered_df: pandas dataframe
            An independent copy of the matching rows.
    '''
    is_retweet = df['tweet/retweet'].eq(1)
    return df.loc[is_retweet].copy()

In [1190]:
def get_quotes(df):
    '''
    Select the rows whose 'is_quote' flag equals 1 (quotes).
    ---
    Parameters:
        df: pandas dataframe
            Must have an 'is_quote' column.
    ---
    Returns:
        filtered_df: pandas dataframe
            An independent copy of the matching rows.
    '''
    is_quote = df['is_quote'].eq(1)
    return df.loc[is_quote].copy()

In [1191]:
def convert_string_to_list_of_dicts(s):
    '''
    This function takes in the string representation of a list of dictionaries and returns that list.
    ---
    Parameters:
        s: str
            A Python literal such as "[{'text': 'ai'}]".
    ---
    Returns:
        list_of_dicts: list
            The parsed list, or an empty list when the input cannot be parsed or is not
            a list made up only of dictionaries.
    '''
    try:
        # ast.literal_eval only evaluates literals, so no code from the input is executed
        parsed = ast.literal_eval(s)
    except (SyntaxError, ValueError, TypeError):
        # TypeError covers literals that parse but cannot be built, e.g. an unhashable dict key
        return []

    if isinstance(parsed, list) and all(isinstance(item, dict) for item in parsed):
        return parsed
    return []

In [1192]:
def remove_hashtags(data):
    '''
    This function takes in a dataframe and replaces each listed hashtag in its 'content'
    column with the '[HASHTAG]' token. The dataframe is modified in place.
    ---
    Parameters:
        data: pandas dataframe
            Must have a 'content' column of strings and a 'hashtags' column holding, per
            row, a list of dictionaries with a 'text' key (the hashtag without the '#').
    ---
    Returns:
        None
    '''

    for i, hashtags in data['hashtags'].items():
        if len(hashtags) == 0:
            continue
        content = data.at[i, 'content']
        for hashtag in hashtags:
            # Require the hashtag to end here, so '#ai' does not eat the start of '#aiart'
            pattern = re.escape('#' + hashtag['text']) + r'(?!\w)'
            content = re.sub(pattern, '[HASHTAG]', content)
        data.at[i, 'content'] = content

In [1193]:
def count_mentions(data):
    '''
    This function takes in a dataframe and adds a 'mentions count' column holding the
    number of mentions in each row. The dataframe is modified in place.
    ---
    Parameters:
        data: pandas dataframe
            Must have a 'mentions' column holding a list per row.
    ---
    Returns:
        None
    '''

    # Vectorised length lookup; yields an integer column instead of an object one
    data['mentions count'] = data['mentions'].map(len)

In [1194]:
def count_hashtags(data):
    '''
    This function takes in a dataframe and adds a 'hashtags count' column holding the
    number of hashtags in each row. The dataframe is modified in place.
    ---
    Parameters:
        data: pandas dataframe
            Must have a 'hashtags' column holding a list per row.
    ---
    Returns:
        None
    '''

    # Vectorised length lookup; yields an integer column instead of an object one
    data['hashtags count'] = data['hashtags'].map(len)

In [1195]:
def remove_mentions(data):
    '''
    This function takes in a dataframe and replaces each listed mention in its 'content'
    column with the '[MENTION]' token. The dataframe is modified in place.
    ---
    Parameters:
        data: pandas dataframe
            Must have a 'content' column of strings and a 'mentions' column holding, per
            row, a list of user names (without the '@').
    ---
    Returns:
        None
    '''
    for i, mentions in data['mentions'].items():
        if len(mentions) == 0:
            continue
        content = data.at[i, 'content']
        for mention in mentions:
            # Require the handle to end here, so '@bob' does not eat the start of '@bobby'
            pattern = re.escape('@' + mention) + r'(?!\w)'
            content = re.sub(pattern, '[MENTION]', content)
        data.at[i, 'content'] = content

In [1196]:
def identify_replies(data):
    '''
    Flag replies in a dataframe, in place, through a new 'Reply' column.
    A row counts as a reply when its 'content' begins with the '[MENTION]' token:
    1 indicates that the tweet is a reply, 0 indicates that it is not.
    ---
    Parameters:
        data: pandas dataframe
            Must have a 'content' column of strings.
    '''
    data['Reply'] = None
    for label, text in data['content'].items():
        data.at[label, 'Reply'] = int(text.startswith('[MENTION]'))

In [1197]:
def process_word(word):
    '''
    This function collapses a run of repeated letters at the end of a word into one letter.
    ---
    Parameters:
        word: str
    ---
    Returns:
        modified_word: str
            " " when the word consists only of digits; otherwise the word with a trailing
            run such as "sss" reduced to "s" (unchanged when there is no such run).
    '''
    if word.strip().isdigit():
        return " "
    # Use regex to find consecutive double or more letters at the end of the word
    match = re.search(r'(\w*?)(\w)\2+$', word)

    if match is None:
        # If no consecutive double or more letters found, return the original word
        return word

    # The search may start after leading non-word characters (an emoji, for example),
    # so keep whatever precedes the match instead of discarding it
    return word[:match.start()] + match.group(1) + match.group(2)

In [1198]:
def remove_double_letters(sentence):
    '''
    This function runs process_word on every word of a sentence and replaces each emoji
    with the ' [EMOJI] ' token.
    ---
    Parameters:
        sentence: str
    ---
    Returns:
        modified_sentence: str
            The processed words joined by single spaces.
        emojis: list
            The emojis found, in order of appearance.
    '''
    # Split the sentence into words and process each of them
    modified_words = [process_word(word) for word in sentence.split()]

    emojis = []

    for i, word in enumerate(modified_words):
        occurrences = emoji.emoji_list(word)
        if not occurrences:
            continue
        emojis.extend(occurrence['emoji'] for occurrence in occurrences)
        # The match offsets refer to the word as it was scanned. Substituting from the
        # rightmost emoji backwards keeps the offsets of the remaining ones valid, so a
        # word holding several emojis is no longer garbled.
        for occurrence in sorted(occurrences, key=lambda item: item['match_start'], reverse=True):
            word = word[:occurrence['match_start']] + ' [EMOJI] ' + word[occurrence['match_end']:]
        modified_words[i] = word

    # Drop any word that is still a bare emoji
    modified_words = [word for word in modified_words if not emoji.is_emoji(word)]

    # Join the modified words back into a sentence
    modified_sentence = ' '.join(modified_words)

    return modified_sentence, emojis


In [1199]:
def clean_text(data: pd.DataFrame) -> pd.DataFrame:
    '''
    This function cleans the 'content' column of a dataframe in place and records the
    emojis found in each row.
    ---
    Parameters:
        data: pandas dataframe
            Must have a 'content' column of strings. The lookup tables used here are
            lower case, so the text is expected to be lower-cased already.
    ---
    Returns:
        data: pandas dataframe
            The same object, with 'content' replaced by the cleaned text and an 'emojis'
            column holding the list of emojis found in each row.
    '''
    def replace_everything(s):
        '''Return (cleaned_text, emojis) for a single tweet string.'''
        # Strip only a leading retweet marker. The previous str.replace('rt', ' ') removed
        # every "rt" in the tweet (turning "party" into "pa y") and also fired for any
        # text that merely began with those two letters.
        s = re.sub(r'^rt\b', ' ', s)

        # Turn tabs, newlines, dashes and ellipses into spaces
        for contraction, replacement in NON_SPACED_CONTRACTIONS.items():
            s = s.replace(contraction, replacement)

        # Expand contractions; the padding makes the first and last word matchable as
        # whole, space-delimited words
        for contraction, replacement in CONTRACTIONS.items():
            s = ' ' + s + ' '
            s = s.replace(' ' + contraction + ' ', ' ' + replacement + ' ')
            s = s.strip()

        for character in UNNECESSARY_CHARACTERS:
            s = s.replace(character, ' ')

        s, emojis = remove_double_letters(s)

        s = ' ' + s + ' '

        for abbreviation, expansion in TEXTING_LANGUAGE.items():
            s = s.replace(' ' + abbreviation + ' ', ' ' + expansion + ' ')

        for word in STOPWORDS:
            s = s.replace(' ' + word + ' ', ' ')

        # Collapse the runs of spaces left behind by the replacements above
        while '  ' in s:
            s = s.replace('  ', ' ')

        s = s.strip()

        return s, emojis

    # Clean every tweet once and split the (text, emojis) pairs into the two columns
    cleaned = [replace_everything(tweet) for tweet in data['content']]
    data['content'] = [text for text, _ in cleaned]
    data['emojis'] = [found for _, found in cleaned]

    return data


# 4.1 Cleaning

In [1201]:
# Apply the column normalisation shared by both tables.
for frame in (circle_tweets_df, tweets_df):
    # Lower-case the text so the lower-case lookup tables used later can match
    frame['content'] = frame['content'].str.lower()
    frame['mentions'] = frame['mentions'].str.lower().apply(convert_string_to_list)
    frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')
    frame['hour'] = frame['hour'].astype(int)

# Hashtags are parsed for the main table only.
tweets_df['hashtags'] = tweets_df['hashtags'].str.lower().apply(convert_string_to_list_of_dicts)

# The circle table's hashtags column is discarded, so it is no longer parsed beforehand.
circle_tweets_df.drop(columns=['hashtags'], inplace=True)

In [1202]:
# Add a 'url' column to both tables and put the ' [URL] ' token into 'content' (in place).
find_url_add_token(circle_tweets_df)
find_url_add_token(tweets_df)

In [1203]:
# Replace hashtags in the tweet text with the '[HASHTAG]' token (in place).
remove_hashtags(tweets_df)

In [1204]:
# Add per-row count columns. Hashtags are counted for tweets_df only; mentions for both tables.
count_hashtags(tweets_df)
count_mentions(tweets_df)
count_mentions(circle_tweets_df)

In [1205]:
# Mentions must be tokenised first: identify_replies looks for a leading '[MENTION]' token.
remove_mentions(data=tweets_df)
remove_mentions(data=circle_tweets_df)
identify_replies(data=tweets_df)
identify_replies(data=circle_tweets_df)

In [1206]:
# Clean the text of both tables; clean_text also adds the 'emojis' column.
tweets_df = clean_text(tweets_df)
circle_tweets_df = clean_text(circle_tweets_df)

In [1207]:
# Make sure 'content' holds plain strings, with missing values replaced by ''.
tweets_df['content'] = tweets_df['content'].fillna('').astype(str)
circle_tweets_df['content'] = circle_tweets_df['content'].fillna('').astype(str)

In [1218]:
# Frequency table of the emojis in each table: one row per emoji occurrence, then counted.
emojis_tweets_df = tweets_df['emojis'].explode().value_counts().reset_index()
emojis_circle_df = circle_tweets_df['emojis'].explode().value_counts().reset_index()

In [1222]:
# Save the emoji frequencies of tweets_df without the index column.
emojis_tweets_df.to_csv('emojis_tweets.csv', index=False)

In [1223]:
# Split the tables by type. Each helper returns an independent copy of the matching rows.
tweets_only = get_tweets(df=tweets_df)
retweets_only = get_retweets(df=tweets_df)
quotes_only = get_quotes(df=tweets_df)
circle_quotes = get_quotes(df=circle_tweets_df)
# Quotes among the rows already selected as tweets (tweets_only)
tweet_quotes = get_quotes(df=tweets_only)

In [1230]:
# Count the number of replies versus normal tweets.
# value_counts() orders its result by frequency, while the cells that follow address the
# rows by position (row 0 = not a reply, row 1 = reply). Pin the order to Reply == 0 then
# Reply == 1, filling in 0 for a class that does not occur. The object dtype keeps the
# 'Type' column able to receive text labels afterwards.
reply_counts = (
    tweets_only['Reply']
    .value_counts()
    .reindex(pd.Index([0, 1], dtype=object), fill_value=0)
    .reset_index()
)
reply_counts.columns = ['Type', 'Count']

In [1231]:
# Rows are addressed by position: row 0 is labelled as non-replies and row 1 as replies,
# which relies on the row order produced when reply_counts was built.
reply_counts.at[0, 'Type'] = 'Normal Tweets'
reply_counts.at[1, 'Type'] = 'Replies'

In [1233]:
# Save the reply / normal-tweet counts without the index column.
reply_counts.to_csv('reply_counts_tweets.csv', index=False)

In [1234]:
# Labels and counts are kept in two parallel lists, in the same order.
TWEET_TYPES = [
    'Original tweet',
    'Reply',
    'Quote',
    'Retweet',
    'Circle tweet',
    'Circle quote',
]
TWEET_COUNTS = [
    int(reply_counts.at[0, 'Count']),
    int(reply_counts.at[1, 'Count']),
    len(quotes_only),
    len(retweets_only),
    # Circle rows that are not quotes
    len(circle_tweets_df) - len(circle_quotes),
    len(circle_quotes),
]

tweet_counts_df = pd.DataFrame(dict(Type=TWEET_TYPES, Count=TWEET_COUNTS))

tweet_counts_df.to_csv('tweet_counts.csv', index=False)

In [1235]:
# Display the per-type counts table.
tweet_counts_df

Unnamed: 0,Type,Count
0,Original tweet,5821
1,Reply,4993
2,Quote,670
3,Retweet,2069
4,Circle tweet,1005
5,Circle quote,108
