In [None]:
import ast
import copy
import glob
import json
import string

import advertools as adv
import nltk
import pandas as pd
import plotly.graph_objects as go
import regex as re
import spacy


pd.options.display.max_colwidth = 285



from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


from spacymoji import Emoji
from nltk.corpus import stopwords
from urllib.parse import urlparse
from textblob import Word
from sklearn.metrics.pairwise import cosine_similarity



# Load the "en_core_web_sm" spaCy pipeline used by the rest of the notebook.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): this Emoji instance is not referenced again in the visible part
# of the notebook; the pipeline component is registered by name on the next line.
emoji = Emoji(nlp)
# Register the "emoji" component at the front of the pipeline.
nlp.add_pipe("emoji", first=True)


# stemmer = nltk.SnowballStemmer("english")

# Data cleaning / Extract information

In [None]:
def extract_urls(text_string):
    """
    Find every http/https URL in a piece of text.

    input:
        text_string: string to scan
    output:
        list of URL strings in order of appearance (empty list when none found)
    """
    # Raw string: the pattern contains `\(` and `\)`, which are invalid escape
    # sequences in a normal string literal and trigger a warning on recent
    # Python versions. The compiled pattern itself is unchanged.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.findall(url_pattern, text_string)


extract_urls("this suppose to extract urls like https://www.bbc.com/travel/article/20220814-the-floating-homes-of-lake-titicaca")

In [None]:
def tweet_urls_removed(text_string):
    """
    Strip URLs out of a tweet.

    input:
        text_string: raw text
    output:
        the same text with every run of non-whitespace characters that
        starts with "http" deleted (surrounding whitespace is left as is)
    """
    url_like = re.compile(r'http\S+')
    without_urls = url_like.sub('', text_string)
    return without_urls


tweet_urls_removed("this suppose to extract urls like https://www.bbc.com/travel/article/20220814-the-floating-homes-of-lake-titicaca and retrun the text")

In [9]:
def extract_hashtags(text_string):
    """
    Collect the hashtags that appear in a piece of text.

    input:
        text_string: string to scan
    output:
        list of hashtag names without the leading '#', in order of appearance
        (empty list when there are none)
    """
    # Raw string so that `\w` is not treated as an (invalid) string escape.
    hashtag_pattern = r"#(\w+)"
    return re.findall(hashtag_pattern, text_string)
    
    
extract_hashtags("This suppose to return all  in a string")

[]

In [None]:
def mention_count(text_string):
    """
    Count the @mentions in a piece of text.

    input:
        text_string: string to scan
    output:
        integer, number of '@' + word-character runs found
    """
    # Raw string so that `\w` is not treated as an (invalid) string escape.
    mention_pattern = r"@(\w+)"
    return len(re.findall(mention_pattern, text_string))


mention_count('@Niloo try this function maybe @Nilo0 too')

In [None]:
def tweet_remove_mention(text_string):
    """
    Delete @mentions from a piece of text.

    input:
        text_string: string to clean
    output:
        text with every '@' + word-character run removed; whitespace around
        the removed mention is left untouched
    """
    # Raw string so that `\w` is not treated as an (invalid) string escape.
    mention_pattern = r"@(\w+)"
    return re.sub(mention_pattern, '', text_string)



tweet_remove_mention('@Niloo try this function maybe @Nilo0 too')

In [None]:
def get_punctuations(text_string):
    """
    List which punctuation marks occur in a piece of text.

    input:
        text_string: string to scan
    output:
        list of the candidate marks that appear at least once, in candidate
        order: every character of string.punctuation, then '...' and '/n'
    """
    # NOTE(review): '/n' is a literal slash followed by 'n'; presumably a
    # newline marker was intended - confirm before changing.
    candidates = list(string.punctuation)
    candidates.extend(['...', '/n'])

    present = []
    for mark in candidates:
        if mark in text_string:
            present.append(mark)
    return present


get_punctuations('this is stting!!!.....!?')

In [None]:
def exclamaintion_mark_count(text_string):
    """
    Count exclamation marks.

    input:
        text_string: string to scan
    output:
        integer, number of '!' characters in text_string
    """
    total = 0
    for symbol in text_string:
        if symbol == '!':
            total += 1
    return total
    
    
exclamaintion_mark_count('this is stting.!')

In [None]:
def question_mark_count(text_string):
    """
    Count question marks.

    input:
        text_string: string to scan
    output:
        integer, number of '?' characters in text_string
    """
    total = 0
    for symbol in text_string:
        if symbol == '?':
            total += 1
    return total


question_mark_count('this is stting.!')

In [None]:
def uppercase_words(text_string):
    """
    Extract the all-caps words from a piece of text.

    input:
        text_string: string to scan
    output:
        list with, for each space-separated token that contains a standalone
        run of A-Z letters, the first such run in that token
    """
    # Compile once and search once per token; the previous version ran the
    # same findall twice for every token.
    all_caps = re.compile(r'\b[A-Z]+(?:\s+[A-Z]+)*\b')

    found = []
    for token in text_string.split(" "):
        hit = all_caps.search(token)
        if hit:
            found.append(hit.group(0))
    return found


uppercase_words('this counts Number of ALL CAPS words HERE Too')

In [None]:
def count_uppercase_words(text_string):
    """
    Count the all-caps words in a piece of text.

    input:
        text_string: string to scan
    output:
        integer, length of the list returned by uppercase_words(text_string)
    """
    caps_words = uppercase_words(text_string)
    return len(caps_words)


count_uppercase_words('this counts Number of ALL CAPS words HERE Too')

In [None]:
def upper_case_pct(text_string):
    """
    Percentage of characters that are upper case.

    input:
        text_string: string to measure
    output:
        integer percentage (rounded) of upper-case characters among all
        characters other than the space character; 0 when the text has no
        such characters (empty or spaces only)
    """
    non_space_total = sum(1 for char in text_string if char != " ")
    if non_space_total == 0:
        # Nothing to measure - avoid dividing by zero.
        return 0
    upper_total = sum(1 for char in text_string if char.isupper())
    return round(upper_total / non_space_total * 100)


upper_case_pct('this counts Number of ALL CAPS words HERE Too')

In [None]:
def count_emojis(text_string):
    """
    Count the emoji in a piece of text.

    input:
        text_string: string to scan
    output:
        the first entry of the 'emoji_counts' field that advertools reports
        for a one-element list holding text_string
    """
    return adv.extract_emoji([text_string])['emoji_counts'][0]

count_emojis('The Global Warming for ‚ù§Ô∏è‚Äçüî•‚ù§Ô∏è‚Äçüî• Warming  Fraud on society by Currupt Global Agencies. A multi part series of 3 minute explanations of the Net Zero Hoax.It stops ‚úãÔ∏è when we all say NO.@GBNEWS@PaulDuddridge @MarkSteynOnlinehttps://t.co/SFV2VLjypD')

In [None]:
def remove_emoji(text_string):
    """
    Drop every space-separated token that contains an emoji.

    input:
        text_string: string to clean
    output:
        the remaining tokens joined with single spaces; a token is dropped
        as a whole when advertools reports any emoji in it
    """
    kept_tokens = []
    for token in text_string.split(" "):
        emoji_names = adv.extract_emoji([token])['emoji_flat_text']
        if len(emoji_names) == 0:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

remove_emoji("This, is #!  ‚ù§Ô∏è‚Äçüî•‚ù§Ô∏è‚Äçüî•  CleaRly sth https://www.bbc.com/travel, HOW to have B@d words")

In [None]:
def get_followers_count(author_id, df_users):
    """
    Look up how many followers an author has.

    input:
        author_id: value to match against the 'id' column of df_users
        df_users: dataframe storing users information; read columns are
            'id' and 'followers_count'
    output:
        'followers_count' of the first row whose 'id' equals author_id, or
        the median of the 'followers_count' column when no row matches
    """
    try:
        matches = df_users.loc[df_users['id'] == author_id, 'followers_count']
        return matches.iloc[0]
    except (IndexError, KeyError):
        # IndexError: no row for this author. KeyError: lookup column missing.
        # Only these are handled, so unrelated failures and KeyboardInterrupt
        # are no longer swallowed the way a bare `except:` would.
        return df_users['followers_count'].median()

# get_followers_count(4704724720, users)

In [None]:
def clean_text(text_string):
    """
    Normalise a tweet into a space-separated string of lemmatised tokens.

    input:
        text_string: raw text
    output:
        clean string,
            literal backslash-n markers removed
            emoji tokens, urls and mentions removed
            punctuations removed
            converted to lower case characters
            stopwords removed (case-insensitively)
            remaining tokens lemmatised
    """
    # Sets give constant-time membership tests in the token filter below.
    stop_words = set(stopwords.words('english'))
    # After the regex strip below only '_' can survive as a punctuation token.
    leftover_punct = set(string.punctuation)

    # '\\n' is the two-character sequence backslash + 'n', not a real newline.
    # The original performed this same replacement twice; once is enough.
    text = text_string.replace('\\n', '')

    text = remove_emoji(text)
    text = tweet_urls_removed(text)
    text = tweet_remove_mention(text)
    text = re.sub(r'[^\w\s]', '', text)

    # Lower-case BEFORE filtering and lemmatising so that capitalised tokens
    # such as "This" are matched against the lower-case stop-word list.
    tokens = [token.lower() for token in nltk.word_tokenize(text)]
    tokens = [
        token for token in tokens
        if token not in leftover_punct and token not in stop_words
    ]

    return " ".join(Word(token).lemmatize() for token in tokens)


clean_text("This, is #!  ‚ù§Ô∏è‚Äçüî•‚ù§Ô∏è‚Äçüî•  CleaRly sth https://www.bbc.com/travel, HOW to have B@d words")

In [None]:
def get_id(text_string):
    """
    Pull the numeric id out of a stringified list of records.

    input:
        text_string: text shaped like "[{???, 'id':123456789}]" - a list whose
            first element is a dict with an 'id' entry
    output:
        integer id of the first record
    """
    try:
        # The documented input uses single-quoted keys, i.e. Python literal
        # syntax, so parse it as one. This also copes with apostrophes inside
        # values, which the quote-swapping JSON route below would break on.
        records = ast.literal_eval(text_string)
    except (ValueError, SyntaxError):
        # Fall back to the original strategy for JSON-style payloads
        # (for example ones containing true/false/null).
        records = json.loads(text_string.replace("'", '"'))
    return int(records[0]['id'])

