In [None]:
import pandas as pd
import spacy
import string
import regex as re
import nltk
import ast
import copy
import glob
import advertools as adv
import plotly.graph_objects as go


pd.options.display.max_colwidth = 285



from textblob import TextBlob
from spacymoji import Emoji
from nltk.corpus import stopwords
from urllib.parse import urlparse
from textblob import Word
from sklearn.metrics.pairwise import cosine_similarity


# Load spaCy's "en_core_web_sm" pipeline and put the "emoji" component
# at the very front of it (first=True).
# NOTE(review): the `emoji` object built by Emoji(nlp) is not used by any
# code visible in this notebook; add_pipe registers the component by name.
# Confirm whether the explicit constructor call is still needed for the
# installed spacymoji version.
nlp = spacy.load("en_core_web_sm")
emoji = Emoji(nlp)
nlp.add_pipe("emoji", first=True)


# English Snowball stemmer, kept at module level so it is built only once.
stemmer = nltk.SnowballStemmer("english")


# Fetch the 'wordnet' and 'omw-1.4' NLTK resources.
# NOTE(review): clean_text also calls stopwords.words('english') and
# nltk.word_tokenize; presumably these need the 'stopwords' and 'punkt'
# resources to be installed as well -- confirm on a fresh environment.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load data as dataframe

In [None]:
def get_df(paths):
    """
    Read several csv files and stack them into a single dataframe.

    input:
        paths: iterable of csv locations (anything pd.read_csv accepts)
    output:
        dataframe holding the rows of every file, re-indexed from 0
    """
    frames = [pd.read_csv(csv_path) for csv_path in paths]
    return pd.concat(frames, ignore_index=True)

In [None]:
# TODO: Check what it returns again

def get_missing_replies(df_tweets, df_replies):
    """
    Find conversations that should have replies but have none collected.

    input:
        df_tweets: dataframe of all tweets
                   (needs 'reply_count' and 'conversation_id' columns)
        df_replies: dataframe of all replies
                    (needs a 'conversation_id' column)
    output:
        list of conversation ids whose tweet reports reply_count > 0 but
        which never appear in df_replies; the list order is arbitrary
    """
    has_replies = df_tweets['reply_count'] > 0
    expected_ids = set(df_tweets.loc[has_replies, 'conversation_id'].unique().tolist())
    collected_ids = set(df_replies['conversation_id'].unique().tolist())

    return list(expected_ids - collected_ids)


# Data cleaning / Extract information

In [None]:
def extract_urls(text_string):
    """
    Find every http/https url inside a piece of text.

    input:
        text_string: string to scan
    output:
        list of urls in order of appearance (empty list if there are none)
    """
    # Raw string: the pattern contains "\(" and "\)", which are invalid
    # escape sequences in a normal string literal and trigger a
    # SyntaxWarning on recent Python versions. The regex itself is unchanged.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    return re.findall(url_pattern, text_string)


extract_urls("this suppose to extract urls like https://www.bbc.com/travel/article/20220814-the-floating-homes-of-lake-titicaca")

In [None]:
def tweet_urls_removed(text_string):
    """
    Strip urls out of a piece of text.

    input:
        text_string: string that may contain urls
    output:
        the same string with every run of non-whitespace characters that
        starts with "http" deleted (surrounding spaces are kept)
    """
    url_like = re.compile(r'http\S+')
    stripped = url_like.sub('', text_string)
    return stripped


tweet_urls_removed("this suppose to extract urls like https://www.bbc.com/travel/article/20220814-the-floating-homes-of-lake-titicaca and retrun the text")

In [None]:
def extract_hashtags(text_string):
    """
    Collect the hashtags used in a piece of text.

    input:
        text_string: string to scan
    output:
        list of hashtags found (without the leading '#'), in order of
        appearance, or None when the text has no hashtag
    """
    # Raw string: "\w" is an invalid escape sequence in a normal string
    # literal and triggers a SyntaxWarning on recent Python versions.
    hashtag_pattern = r"#(\w+)"
    hashtag_list = re.findall(hashtag_pattern, text_string)

    # Callers get None (not an empty list) when nothing is found.
    return hashtag_list if hashtag_list else None
    
    
extract_hashtags("This suppose to #return all #Hashtags in a string")

In [None]:
def extract_mentions(text_string):
    """
    Collect the user mentions in a piece of text.

    input:
        text_string: string to scan
    output:
        list of mentioned names (without the leading '@'), in order of
        appearance; empty list when there are none
    """
    # Raw string: "\w" is an invalid escape sequence in a normal string
    # literal and triggers a SyntaxWarning on recent Python versions.
    mention_pattern = r"@(\w+)"
    return re.findall(mention_pattern, text_string)


extract_mentions('@Niloo try this function maybe @Nilo0 too')

In [None]:
def get_punctuations(text_string):
    """
    List which punctuation marks occur in a piece of text.

    input:
        text_string: string to scan
    output:
        list of the distinct marks present, in the order of
        string.punctuation followed by '...' and '/n'
    """
    # NOTE(review): '/n' is a literal slash followed by 'n'; it looks like
    # it may have been meant as a newline marker -- confirm before changing.
    candidates = list(string.punctuation)
    candidates.extend(['...', '/n'])

    found = []
    for mark in candidates:
        if mark in text_string:
            found.append(mark)
    return found


get_punctuations('this is stting!!!.....!?')

In [None]:
def exclamaintion_mark_count(text_string):
    """
    Count the exclamation marks in a piece of text.

    input:
        text_string: string to scan
    output:
        number of '!' characters in text_string
    """
    total = 0
    for char in text_string:
        if char == '!':
            total += 1
    return total
    
    
exclamaintion_mark_count('this is stting.!')

In [None]:
def question_mark_count(text_string):
    """
    Count the question marks in a piece of text.

    input:
        text_string: string to scan
    output:
        number of '?' characters in text_string
    """
    total = 0
    for char in text_string:
        if char == '?':
            total += 1
    return total


question_mark_count('this is stting.!')

In [None]:
def uppercase_words(text_string):
    """
    Pick out the all-caps words of a piece of text.

    input:
        text_string: string to scan
    output:
        list with, for every space-separated chunk that contains one, the
        first stand-alone run of capital letters A-Z found in that chunk
    """
    caps_pattern = r'\b[A-Z]+(?:\s+[A-Z]+)*\b'

    caps_words = []
    for chunk in text_string.split(" "):
        # Only the first match of each chunk is kept.
        hit = re.search(caps_pattern, chunk)
        if hit is not None:
            caps_words.append(hit.group(0))
    return caps_words


uppercase_words('this counts Number of ALL CAPS words HERE Too')

In [None]:
def count_uppercase_words(text_string):
    """
    Count the all-caps words of a piece of text.

    input:
        text_string: string to scan
    output:
        number of items returned by uppercase_words(text_string)
    """
    caps_words = uppercase_words(text_string)
    return len(caps_words)


count_uppercase_words('this counts Number of ALL CAPS words HERE Too')

In [None]:
def upper_case_pct(text_string):
    """
    Share of upper case characters in a piece of text.

    input:
        text_string: string to measure
    output:
        upper case characters as a rounded integer percentage of all
        non-space characters; 0 when the text is empty or only spaces
    """
    non_space_total = sum(1 for char in text_string if char != " ")
    # Guard: an empty / all-space tweet would otherwise divide by zero.
    if non_space_total == 0:
        return 0

    upper_total = sum(1 for char in text_string if char.isupper())
    return round(upper_total / non_space_total * 100)


upper_case_pct('this counts Number of ALL CAPS words HERE Too')

In [None]:
def clean_text(text_string):
    """
    Normalise a tweet for text analysis.

    input:
        text_string: raw tweet text
    output:
        clean string,
            literal "\\n" sequences replaced by a space
            urls removed
            punctuations removed
            converted to lower case characters
            stopwords removed
            remaining words lemmatized
    """
    # Swap the two-character sequence backslash + n for a space; deleting
    # it outright would glue the neighbouring words together.
    text_string = text_string.replace('\\n', ' ')

    # Sets give constant-time membership tests in the filter below.
    stopword_set = set(stopwords.words('english'))
    punct_set = set(string.punctuation) | {'...'}

    # Urls go first: the url pattern is case sensitive and relies on the
    # punctuation that is stripped in the next step.
    text_string = tweet_urls_removed(text_string)

    # Lower-case BEFORE filtering: the stopword list is lower case, so
    # capitalised stopwords ("This", "HOW") would otherwise slip through.
    sentence = re.sub(r'[^\w\s]', '', text_string).lower()

    tokens = [
        word for word in nltk.word_tokenize(sentence)
        if word not in punct_set and word not in stopword_set
    ]
    lemmas = [Word(word).lemmatize() for word in tokens]

    return " ".join(lemmas)


clean_text("This, is #! CleaRly sth https://www.bbc.com/travel, HOW to have B@d words")

In [None]:
def get_semantic_polarity(text_string):
    """
    Label a piece of text by the sign of its TextBlob polarity.

    input:
        text_string: string to analyse
    output:
        'positive', 'neutral' or 'negative' depending on whether the
        polarity score is above, equal to or below 0
    """
    polarity = TextBlob(text_string).polarity

    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    if polarity == 0:
        return 'neutral'
    
    
get_semantic_polarity('This is What I try here, amazingly boring')

In [None]:
def extract_emoji(text_string):
    """
    Extract the emoji of a single piece of text.

    input:
        text_string: string to scan
    output:
        first item of the 'emoji' entry of the summary that
        adv.extract_emoji builds for the one-element list [text_string]
    """
    summary = adv.extract_emoji([text_string])
    emoji_per_text = summary['emoji']
    return emoji_per_text[0]




In [None]:
def extract_emoji_group(text_string):
    """
    Summarise the emoji sub groups of a single piece of text.

    input:
        text_string: string to scan
    output:
        the 'top_emoji_sub_groups' entry of the summary that
        adv.extract_emoji builds for the one-element list [text_string]
    """
    summary = adv.extract_emoji([text_string])
    top_sub_groups = summary['top_emoji_sub_groups']
    return top_sub_groups



# Reply / Quotes

In [None]:
def get_conversation(cid, df):
    """
    Select the rows whose 'referenced_tweets' value mentions a given id.

    input:
        cid: id to look for (converted to its string form)
        df: dataframe with a string column 'referenced_tweets'
    output:
        dataframe with the rows of df whose 'referenced_tweets' text
        contains the id; rows with no 'referenced_tweets' value are excluded

    Note: this is a plain substring test, so an id that is a prefix or
    part of a longer id also matches.
    """
    # na=False: rows without referenced tweets would otherwise put NaN in
    # the mask and make df.loc raise.
    # regex=False: the id is matched literally instead of as a pattern.
    mentions_cid = df['referenced_tweets'].str.contains(str(cid), regex=False, na=False)
    return df.loc[mentions_cid]



In [None]:
def get_all_conversation(cid, df):
    """
    Select every row that belongs to one conversation.

    input:
        cid: conversation id to keep
        df: dataframe with a 'conversation_id' column
    output:
        dataframe with the rows of df whose 'conversation_id' equals cid
    """
    same_conversation = df['conversation_id'] == cid
    return df.loc[same_conversation]



In [None]:
def get_replies_referenced_tweet_id(input_string):
    """
    Read the id of the first referenced tweet from its string form.

    input:
        input_string: Python-literal text of a list of dicts,
                      each dict having an 'id' key
    output:
        the 'id' of the first list item, as an int
    """
    references = ast.literal_eval(input_string)
    first_reference = references[0]
    return int(first_reference['id'])



In [None]:
def get_count_reply_like_quote(df_row):
    """
    Add up the reaction counters of one tweet row.

    input:
        df_row: object exposing retweet_count, like_count, quote_count
                and all_replies_count attributes
    output:
        sum of the four counters, each converted to int first
    """
    counter_names = ('retweet_count', 'like_count', 'quote_count', 'all_replies_count')
    return sum(int(getattr(df_row, counter)) for counter in counter_names)



# Update dataframe

In [None]:
def add_hastags(df):
    """
    Extract the hashtags of every tweet.

    input:
        df: dataframe of tweets (needs a 'text' column)
    output:
        panda series with the result of extract_hashtags for each tweet
    """
    tweet_texts = df['text']
    return tweet_texts.apply(extract_hashtags)



In [None]:
def add_follower_count(df, users_df=None):
    """
    Look up the follower count of every tweet's author.

    input:
        df: dataframe of tweets (needs an 'author_id' column)
        users_df: dataframe of users with 'id' and 'followers_count'
                  columns; when omitted, the notebook-level `users`
                  dataframe is used
    output:
        panda series aligned with df, one follower count per tweet
        (NaN when the author is not in the users dataframe)
    """
    if users_df is None:
        # Falls back to the global the original implementation relied on.
        users_df = users

    # One lookup table instead of filtering the users frame once per tweet.
    # The previous per-row filter returned a Series for every tweet, which
    # Series.apply expands into a wide DataFrame rather than one value each.
    # Duplicate ids keep their first row so the index stays unique for map.
    followers_by_id = (
        users_df.drop_duplicates(subset='id')
        .set_index('id')['followers_count']
    )
    return df['author_id'].map(followers_by_id)



In [None]:
def add_uppercase_count(df):
    """
    Count the all-caps words of every tweet.

    input:
        df: dataframe of tweets (needs a 'text' column)
    output:
        panda series with the result of count_uppercase_words per tweet
    """
    tweet_texts = df['text']
    return tweet_texts.apply(count_uppercase_words)



In [None]:
def add_clean_text(df):
    """
    Clean the text of every tweet.

    input:
        df: dataframe of tweets (needs a 'text' column)
    output:
        panda series with the result of clean_text for each tweet
    """
    tweet_texts = df['text']
    return tweet_texts.apply(clean_text)



In [None]:
def add_engagement_score(df):
    """
    Compute an engagement score for every tweet.

    input:
        df: dataframe with 'total_raction' and 'followers_count' columns
    output:
        panda series of total_raction / (followers_count + 1) per row,
        or None (after printing an error) when a required column is missing
    """
    required = ['total_raction', 'followers_count']
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Only the missing-column case is reported; other failures are no
        # longer swallowed and mislabelled by a bare except.
        print(f'Error: Column(s) {missing} do not exist')
        return None

    # The +1 keeps accounts with zero followers from dividing by zero.
    return df['total_raction'] / (df['followers_count'] + 1)
    
    

In [None]:
def add_uppercase(df):
    """
    List the all-caps words of every tweet.

    input:
        df: dataframe of tweets (needs a 'text' column)
    output:
        panda series with the result of uppercase_words for each tweet
    """
    tweet_texts = df['text']
    return tweet_texts.apply(uppercase_words)



In [None]:
def add_polarity(df):
    """
    Label every cleaned tweet with its polarity.

    input:
        df: dataframe with a 'clean_text' column
    output:
        panda series with the result of get_semantic_polarity per row,
        or None (after printing an error) when 'clean_text' is missing
    """
    if 'clean_text' not in df.columns:
        print('Error: Column "clean_text" does not exist')
        return None

    # Errors raised while scoring now surface as themselves instead of
    # being reported as a missing column by a bare except.
    return df['clean_text'].apply(get_semantic_polarity)
    
    

In [None]:
def add_emoji_list(df):
    """
    Extract the emoji of every tweet.

    input:
        df: dataframe of tweets (needs a 'text' column)
    output:
        panda series with the result of extract_emoji for each tweet
    """
    tweet_texts = df['text']
    return tweet_texts.apply(extract_emoji)



In [None]:
def add_count_reaction(df):
    """
    Add up the reaction counters of every tweet.

    input:
        df: dataframe with 'like_count', 'quote_count', 'retweet_count'
            and 'all_reply_count' columns
    output:
        panda series with the row-wise sum of the four counters, or None
        (after printing an error) when a required column is missing
    """
    required = ['like_count', 'quote_count', 'retweet_count', 'all_reply_count']
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Name the columns that are really absent; the old bare except
        # blamed "all_reply_count" for every possible failure.
        print(f'Error: Column(s) {missing} do not exist')
        return None

    # Plain "+" (not DataFrame.sum) so a missing value in any counter still
    # yields a missing total, as the row-wise version did.
    return df['like_count'] + df['quote_count'] + df['retweet_count'] + df['all_reply_count']
    
    

In [None]:
def add_emoji_group(df):
    """
    Summarise the emoji sub groups of every tweet.

    input:
        df: dataframe of tweets (needs a 'text' column)
    output:
        panda series with the result of extract_emoji_group per tweet
    """
    tweet_texts = df['text']
    return tweet_texts.apply(extract_emoji_group)



In [None]:
def add_count_all_reply(df, df_replies):
    """
    Count the collected replies of every tweet's conversation.

    input:
        df: dataframe of tweets (needs a 'conversation_id' column)
        df_replies: dataframe of replies, passed on to get_all_reply
    output:
        panda series with len(get_all_reply(conversation_id, df_replies))
        for each row of df
    """
    # NOTE(review): get_all_reply is not defined in the part of the
    # notebook visible here; get_all_conversation(cid, df) has the same
    # call shape -- confirm which helper is intended.
    def reply_total(conversation_id):
        return len(get_all_reply(conversation_id, df_replies))

    return df['conversation_id'].apply(reply_total)

