In [None]:
import pandas as pd
import spacy
import string
import regex as re
import nltk
import ast
import copy
import glob
import advertools as adv
import plotly.graph_objects as go


# Let displayed DataFrames show up to 285 characters per cell before pandas
# truncates the text.
pd.options.display.max_colwidth = 285



from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from spacymoji import Emoji
from nltk.corpus import stopwords
from urllib.parse import urlparse
from textblob import Word
from sklearn.metrics.pairwise import cosine_similarity


# Load spaCy's "en_core_web_sm" pipeline; the model has to be installed in the
# kernel's environment for this call to succeed.
nlp = spacy.load("en_core_web_sm")
# Put the spacymoji "emoji" component at the front of the pipeline.
# NOTE(review): add_pipe registers the component by name and does not use the
# `emoji` object built on the line above - confirm whether the manual
# construction is still required by the installed spacymoji version.
emoji = Emoji(nlp)
nlp.add_pipe("emoji", first=True)


# Snowball stemmer configured for English.
stemmer = nltk.SnowballStemmer("english")


# Download the 'wordnet' and 'omw-1.4' NLTK resources.
# NOTE(review): presumably needed by the textblob `Word` helpers imported
# above - confirm against the code that uses them.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load data as dataframe

In [None]:
def get_df(paths):
    """
    Read several csv files and stack them into one dataframe.

    input:
        paths: iterable of csv locations, each one readable by pd.read_csv
    output:
        dataframe holding the rows of every file, in the order of paths,
        with a fresh 0..n-1 index
    """
    frames = [pd.read_csv(csv_path) for csv_path in paths]
    return pd.concat(frames, ignore_index=True)

In [None]:
def get_sample_df(paths, pct, random_state=None):
    """
    Read several csv files and keep a random sample of the rows of each one.

    input:
        paths = iterable of csv locations, each one readable by pd.read_csv
        pct = fraction of rows to keep from every file, passed to
              DataFrame.sample(frac=...); e.g. 0.1 keeps 10% of the rows
        random_state = optional seed forwarded to DataFrame.sample so that the
              same rows are drawn on every run; the default None keeps the
              previous, non-reproducible behaviour
    output:
        sample dataframe: the per-file samples stacked in the order of paths,
        with a fresh 0..n-1 index
    """
    sampled_frames = [
        pd.read_csv(csv_path).sample(frac=pct, random_state=random_state)
        for csv_path in paths
    ]
    return pd.concat(sampled_frames, ignore_index=True)

In [None]:
def get_sample_reply_quote(sample_df, df_conversation):
    """
    Gather the replies or quotes that belong to a sample of tweets.

    input:
        sample_df: sampled tweets; its 'conversation_id' column is iterated
        df_conversation: replies or quotes dataframe searched for each id
    output:
        dataframe, the get_conversation results for every conversation id of
        sample_df stacked together with a fresh 0..n-1 index
    """
    matches_per_tweet = [
        get_conversation(conversation_id, df_conversation)
        for conversation_id in sample_df['conversation_id']
    ]
    return pd.concat(matches_per_tweet, ignore_index=True)

In [None]:
def clean_users(df_input):
    """
    Collapse a users dataframe to a single row per user id.

    input:
        df_input: dataframe storing users information; needs the columns
            'id' and 'tweet_count'. It is not modified.
    output:
        dataframe without duplicated users, i.e. one row per distinct 'id',
        ordered by first appearance of each id and with a fresh 0..n-1 index.
        For a user that occurs several times, the row with the largest
        'tweet_count' is kept (the first one on ties).
        The index of df_input is preserved in the extra column 'index1'.
        Rows whose 'id' is missing (NaN) are not part of the result.
    """
    # rename_axis/reset_index already return a new frame, so df_input stays
    # untouched without an explicit deep copy.
    df = df_input.rename_axis('index1').reset_index()

    selected_rows = []
    # sort=False keeps users in order of first appearance; groupby leaves out
    # rows with a missing id.
    for _, user_df in df.groupby('id', sort=False):
        if len(user_df) == 1:
            selected_rows.append(user_df)
        else:
            # idxmax yields a label of df's fresh 0..n-1 index, so the row has
            # to be picked by label. Matching it against 'index1' (the
            # caller's original index) is only right when that index happened
            # to be 0..n-1 as well.
            max_tweet_idx = user_df['tweet_count'].idxmax()
            selected_rows.append(df.loc[[max_tweet_idx]])

    if not selected_rows:
        # Nothing to keep: hand back an empty frame with the same columns
        # instead of letting pd.concat fail on an empty list.
        return df.iloc[0:0]
    return pd.concat(selected_rows, ignore_index=True)
            

In [None]:
def get_semantic_polarity(text_string):
    """
    Turn the TextBlob polarity of a text into a label.

    input:
        text_string: text to score
    output:
        'positive', 'neutral' or 'negative', depending on whether the
        TextBlob polarity is above, equal to or below zero
    """
    polarity = TextBlob(text_string).polarity
    if polarity < 0:
        return 'negative'
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    
    
# Ad-hoc check of the helper on a sample sentence; as the last expression of
# the cell, its return value is shown as the cell output.
get_semantic_polarity('This is What I try here, amazingly boring')

# Reply / Quotes

In [None]:
def get_conversation(cid, df):
    """
    Select the rows whose 'referenced_tweets' text mentions a tweet id.

    input:
        cid: conversation id to look for
        df: replies or quotes dataframe with a text column 'referenced_tweets'
    output:
        dataframe with the rows of df whose 'referenced_tweets' value contains
        cid; rows where 'referenced_tweets' is missing never match
    """
    # regex=False: the id is matched literally, not compiled as a pattern.
    # na=False: a missing 'referenced_tweets' counts as "no match"; without it
    # the mask contains NaN and df.loc refuses to index with it.
    # NOTE(review): this is a substring test, so an id that is contained in a
    # longer id would also match - confirm that ids cannot overlap like that.
    mentions_cid = df['referenced_tweets'].str.contains(str(cid), regex=False, na=False)
    return df.loc[mentions_cid]



In [None]:
def get_all_conversation(cid, df):
    """
    input:
        cid: conversation id
        df: dataframe with a 'conversation_id' column
    output:
        the rows of df whose 'conversation_id' equals cid
    """
    in_conversation = df['conversation_id'].eq(cid)
    return df[in_conversation]



In [None]:
# TODO: you may need to delete this function
def get_replies_referenced_tweet_id(input_string):
    """
    input:
        input_string: Python-literal text of a sequence of dicts, each with
            an 'id' entry
    output:
        integer, the 'id' of the first dict in that sequence
    """
    references = ast.literal_eval(input_string)
    first_reference = references[0]
    return int(first_reference['id'])



In [None]:
def get_count_reply_like_quote_retweet(df_row):
    """
    input:
        df_row: a row from tweets dataframe, exposing retweet_count,
            like_count, quote_count and reply_count as attributes
    output:
        integer, the sum of the four counts after converting each with int()
    """
    count_fields = ('retweet_count', 'like_count', 'quote_count', 'reply_count')
    return sum(int(getattr(df_row, field)) for field in count_fields)



In [None]:
# def second_level_conversations

In [None]:
def add_engagement_score(df):
    """
    Compute an engagement score per row: total_raction / (followers_count + 1).

    input:
        df: dataframe with the numeric columns 'total_raction' (column name
            spelled as used in the data) and 'followers_count'
    output:
        series aligned with df holding the score of every row; the +1 keeps
        the denominator non-zero for accounts with 0 followers.
        If a required column is missing, an error line is printed for it and
        None is returned. Any other problem (e.g. non-numeric values) is
        raised instead of being reported as a missing column.
    """
    missing = [
        column for column in ('total_raction', 'followers_count')
        if column not in df.columns
    ]
    if missing:
        for column in missing:
            print(f'Error: Column "{column}" does not exist')
        return None

    # Column arithmetic instead of a row-wise apply: same values, one pass.
    return df['total_raction'] / (df['followers_count'] + 1)
    
    

In [None]:
def add_polarity(df):
    """
    Label every cleaned text with its polarity.

    input:
        df: dataframe with a 'clean_text' column
    output:
        series aligned with df holding the get_semantic_polarity result for
        each 'clean_text' value.
        If the column is missing, an error line is printed and None is
        returned. Errors raised while scoring a text are no longer reported
        as a missing column; they propagate to the caller.
    """
    if 'clean_text' not in df.columns:
        print('Error: Column "clean_text" does not exist')
        return None
    return df['clean_text'].apply(get_semantic_polarity)
    
    

In [None]:
def add_count_reaction(df):
    """
    Sum the reaction counters of every row.

    input:
        df: dataframe with the numeric columns 'like_count', 'quote_count',
            'retweet_count' and 'all_reply_count'
    output:
        series aligned with df holding
        like_count + quote_count + retweet_count + all_reply_count per row.
        If a required column is missing, an error line is printed for it and
        None is returned. Any other problem is raised instead of being
        reported as a missing 'all_reply_count' column.
    """
    required = ('like_count', 'quote_count', 'retweet_count', 'all_reply_count')
    missing = [column for column in required if column not in df.columns]
    if missing:
        for column in missing:
            print(f'Error: Column "{column}" does not exist')
        return None

    # Column arithmetic instead of a row-wise apply: same values, one pass.
    return df['like_count'] + df['quote_count'] + df['retweet_count'] + df['all_reply_count']
    
    

In [None]:
def add_emoji_group(df):
    """
    input:
        df: dataframe with a 'text' column
    output:
        series aligned with df holding the extract_emoji_group result for
        every 'text' value
    """
    def emoji_group_of(text):
        return extract_emoji_group(text)

    texts = df['text']
    return texts.apply(emoji_group_of)



In [None]:
def add_count_all_reply(df, df_replies):
    """
    input:
        df: dataframe with a 'conversation_id' column
        df_replies: dataframe handed to get_all_reply for every lookup
    output:
        series aligned with df holding, per conversation id, the length of
        what get_all_reply returns for it
    """
    # NOTE(review): get_all_reply is not defined in the part of the notebook
    # shown here; get_all_conversation(cid, df) has the same call shape -
    # confirm which helper is meant.
    def reply_count(conversation_id):
        return len(get_all_reply(conversation_id, df_replies))

    conversation_ids = df['conversation_id']
    return conversation_ids.apply(reply_count)

