In [None]:
import pandas as pd
import spacy
import string
import regex as re
import nltk
import ast
import copy
import glob
import advertools as adv
import plotly.graph_objects as go


# Let pandas display up to 285 characters per column before truncating cell contents.
pd.options.display.max_colwidth = 285



from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from spacymoji import Emoji
from nltk.corpus import stopwords
from urllib.parse import urlparse
from textblob import Word
from sklearn.metrics.pairwise import cosine_similarity


# Load spaCy's small English pipeline and keep it as the module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): the add_pipe call below is given the factory name "emoji", not this
# object, so this instance looks redundant — confirm nothing else uses `emoji` before removing.
emoji = Emoji(nlp)
# Insert the "emoji" component at the very front of the pipeline (first=True).
nlp.add_pipe("emoji", first=True)


# English Snowball stemmer kept as a module-level object.
stemmer = nltk.SnowballStemmer("english")


# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Update dataframe

In [None]:
def add_hashtags(df):
    """
    Build the hashtag column for a dataframe of tweets.

    input:
        df: dataframe of tweets; only its 'text' column is read
    output:
        pandas Series aligned with df, each item being the result of
        extract_hashtags for that tweet's text (documented here as the
        list of hashtags found in the tweet)
    """
    tweet_texts = df['text']
    # extract_hashtags is applied element-wise, one call per tweet text.
    return tweet_texts.apply(extract_hashtags)


In [None]:
def add_nbr_hashtags(df):
    """
    Count the hashtags of every tweet.

    input:
        df: dataframe of tweets; only its 'text' column is read
    output:
        pandas Series aligned with df, each item being the length of
        what extract_hashtags returns for that tweet's text
    """
    def _hashtag_total(tweet_text):
        # Number of hashtags = size of the extracted collection.
        return len(extract_hashtags(tweet_text))

    return df['text'].apply(_hashtag_total)


In [None]:
def add_exclamation_mark_count(df):
    """
    Count exclamation marks per tweet.

    input:
        df: dataframe of tweets; only its 'text' column is read
    output:
        pandas Series aligned with df, each item being the value returned by
        exclamaintion_mark_count for that tweet's text
    """
    tweet_texts = df['text']
    # The helper's name is spelled exactly as it is defined elsewhere in this notebook.
    return tweet_texts.apply(exclamaintion_mark_count)


In [None]:
def add_question_mark_count(df):
    """
    Count question marks per tweet.

    input:
        df: dataframe of tweets; only its 'text' column is read
    output:
        pandas Series aligned with df, each item being the value returned by
        question_mark_count for that tweet's text
    """
    def _question_marks(tweet_text):
        return question_mark_count(tweet_text)

    return df['text'].apply(_question_marks)


In [None]:
def add_url_count(df):
    """
    Count the URLs of every tweet.

    input:
        df: dataframe of tweets; only its 'text' column is read
    output:
        pandas Series aligned with df, each item being the length of
        what extract_urls returns for that tweet's text
    """
    def _url_total(tweet_text):
        # Number of URLs = size of the extracted collection.
        return len(extract_urls(tweet_text))

    tweet_texts = df['text']
    return tweet_texts.apply(_url_total)


In [None]:
def add_upper_case_pct(df):
    """
    Compute the share of uppercase characters per tweet.

    input:
        df: dataframe of tweets; only its 'text' column is read
    output:
        pandas Series aligned with df, each item being the value returned by
        upper_case_pct for that tweet's text
    """
    # upper_case_pct is applied element-wise, one call per tweet text.
    return df['text'].apply(upper_case_pct)


In [None]:
def add_uppercase_count(df):
    """
    Count uppercase words per tweet.

    input:
        df: dataframe of tweets; only its 'text' column is read
    output:
        pandas Series aligned with df, each item being the value returned by
        count_uppercase_words for that tweet's text
    """
    def _uppercase_words(tweet_text):
        return count_uppercase_words(tweet_text)

    tweet_texts = df['text']
    return tweet_texts.apply(_uppercase_words)



In [None]:
def add_mention_count(df):
    """
    Count mentions per tweet.

    input:
        df: dataframe of tweets; only its 'text' column is read
    output:
        pandas Series aligned with df, each item being the value returned by
        mention_count for that tweet's text
    """
    tweet_texts = df['text']
    mentions_per_tweet = tweet_texts.apply(mention_count)
    return mentions_per_tweet


In [None]:
def add_emoji_count(df):
    """
    Count emojis per tweet.

    input:
        df: dataframe of tweets; only its 'text' column is read
    output:
        pandas Series aligned with df, each item being the value returned by
        count_emojies for that tweet's text
    """
    def _emoji_total(tweet_text):
        # Helper name is spelled exactly as it is defined elsewhere in this notebook.
        return count_emojies(tweet_text)

    return df['text'].apply(_emoji_total)



In [None]:
def add_clean_text(df):
    """
    Produce the cleaned version of every tweet.

    input:
        df: dataframe of tweets; only its 'text' column is read
    output:
        pandas Series aligned with df, each item being the value returned by
        clean_text for that tweet's text
    """
    raw_texts = df['text']
    # clean_text is applied element-wise; df itself is not modified here.
    return raw_texts.apply(clean_text)



In [None]:
def add_followers_count(df, df_users):
    """
    Look up the follower count of each tweet's author.

    input:
        df: dataframe of tweets; only its 'author_id' column is read
        df_users: dataframe storing users' information, handed unchanged to
                  get_followers_count on every call
    output:
        pandas Series aligned with df, each item being the value returned by
        get_followers_count(author_id, df_users)

    NOTE(review): the original notes say a missing follower count is replaced by
    the median follower count; that fallback would live in get_followers_count,
    not in this wrapper — confirm against the helper.
    """
    def _followers_of(author_id):
        return get_followers_count(author_id, df_users)

    author_ids = df['author_id']
    return author_ids.apply(_followers_of)

In [None]:
def add_engagement_score(df):
    """
    Compute an engagement score per row.

    input:
        df: tweets/replies/quotes dataframe with the columns 'retweet_count',
            'reply_count', 'like_count', 'quote_count' and 'followers_count'
    output:
        pandas Series of engagement scores:
        (retweets + replies + likes + quotes) / (followers + 1)
    """
    # Total interactions a tweet received.
    interactions = df['retweet_count'] + df['reply_count'] + df['like_count'] + df['quote_count']
    # The +1 keeps the denominator non-zero for accounts with 0 followers.
    audience = df['followers_count'] + 1
    return interactions / audience

In [None]:
def add_sentiment_score(df_input):
    """
    Compute a sentiment score per row.

    input:
        df_input: tweets/replies/quotes dataframe; only its 'text' column is read
    output:
        pandas Series aligned with df_input, each item being
        sentiment_vader(sentiment_preprocessing(text)) for that row's text
    """
    # The docstring and the return statement must share one indentation level;
    # a mismatch makes Python reject the whole cell with an IndentationError.
    return df_input['text'].apply(lambda x: sentiment_vader(sentiment_preprocessing(x)))