In [None]:
import pandas as pd
import spacy
import string
import regex as re
import nltk
import ast
import copy
import glob
import advertools as adv
import plotly.graph_objects as go


pd.options.display.max_colwidth = 285



from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from spacymoji import Emoji
from nltk.corpus import stopwords
from urllib.parse import urlparse
from textblob import Word
from sklearn.metrics.pairwise import cosine_similarity


# Load spaCy's small English pipeline; it must be installed in the kernel's environment.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): this direct Emoji(nlp) instance is not referenced anywhere in the
# visible part of the notebook; the pipeline component itself is registered by the
# add_pipe call below. Confirm whether this line is still needed.
emoji = Emoji(nlp)
# Insert the "emoji" component at the front of the pipeline.
nlp.add_pipe("emoji", first=True)


# English Snowball stemmer from NLTK.
stemmer = nltk.SnowballStemmer("english")


# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Load data as dataframe

In [None]:
def get_df(paths, dtype):
    """
    Read several csv files and stack them into a single dataframe.

    input:
        paths: global paths to csv files; they are read in sorted order
        dtype: forwarded unchanged to pd.read_csv as its dtype argument
    output:
        dataframe holding the rows of every file, with a fresh 0..n-1 index
    """
    frames = []
    for csv_path in sorted(paths):
        frames.append(pd.read_csv(csv_path, dtype=dtype))
    return pd.concat(frames, ignore_index=True)

In [None]:
def get_df_from_feather(paths):
    """
    Read several feather files and stack them into a single dataframe.

    input:
        paths: global paths to feather files; they are read in sorted order
    output:
        dataframe holding the rows of every file; each file's own index is kept,
        so index values can repeat across files
    """
    frames = []
    for feather_path in sorted(paths):
        frames.append(pd.read_feather(feather_path))
    return pd.concat(frames)

In [None]:
def split_dataframe(df, chunk_size = 10000):
    """
    Split a dataframe into consecutive row chunks.

    input:
        df: dataframe to split
        chunk_size: maximum number of rows per chunk, must be positive
    output:
        list of dataframes; concatenating them in order gives back df.
        Every chunk has chunk_size rows except possibly the last one.
        An empty df still yields a single (empty) chunk, so the result is
        never an empty list.
    raises:
        ValueError: if chunk_size is not positive
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Ceiling division: `len(df) // chunk_size + 1` produced a spurious empty
    # trailing chunk whenever len(df) was an exact multiple of chunk_size.
    num_chunks = max(1, -(-len(df) // chunk_size))

    # iloc makes the slicing explicitly positional, whatever the index type.
    return [df.iloc[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks)]

In [None]:
def get_in_between(df, start, end):
    """
    Keep the rows whose creation time lies strictly between two bounds.

    input:
        df: dataframe of tweets/replies/quotes; its 'created_at' column holds
            strings formatted like '2021-01-05T12:30:00.000Z'
        start: lower bound (exclusive), comparable with the parsed timestamps
        end: upper bound (exclusive), comparable with the parsed timestamps
    output:
        sub-dataframe of the matching rows:
          - start < end : rows with start < date < end
          - otherwise   : the window wraps around, i.e. rows with
                          date < end or date > start
    side effect:
        a parsed 'date' column is written onto df itself, so it also appears
        in the returned rows
    """
    # Parsed with pandas rather than datetime.strptime: `datetime` is never
    # imported in this notebook, so the previous per-row strptime call raised
    # NameError on a fresh kernel. The format string is unchanged.
    df['date'] = pd.to_datetime(df['created_at'], format='%Y-%m-%dT%H:%M:%S.%fZ')

    after_start = df['date'] > start
    before_end = df['date'] < end

    if start < end:
        return df.loc[before_end & after_start]
    return df.loc[before_end | after_start]





In [None]:
def get_sample_df(paths, pct, dtype):
    """
    Draw a random sample from each csv file and stack the samples.

    input:
        paths: global paths to csv files, read in the order given
        pct: share of rows to keep per file; it is passed to DataFrame.sample
             as `frac`, i.e. a fraction such as 0.1 rather than a percentage
        dtype: forwarded unchanged to pd.read_csv as its dtype argument
    output:
        sample dataframe with a fresh 0..n-1 index
    """
    sampled_parts = [
        pd.read_csv(csv_path, dtype=dtype).sample(frac=pct)
        for csv_path in paths
    ]
    return pd.concat(sampled_parts, ignore_index=True)

In [None]:
def get_sample_reply_quote(sample_df, df_conversation):
    """
    Collect the replies/quotes that belong to a sample of tweets.

    input:
        sample_df: sample tweets; one lookup is made per 'conversation_id' value
        df_conversation: replies or quotes dataframe searched by get_conversation
    output:
        dataframe, replies or quotes of the sample tweets, with a fresh index

    Prints the sample size first, then the completed fraction every 1000 tweets.
    """
    total = len(sample_df)
    print(total)

    matches = []
    for position, conversation_id in enumerate(sample_df['conversation_id']):
        matches.append(get_conversation(conversation_id, df_conversation))
        if position % 1000 == 0:
            print(position / total)

    return pd.concat(matches, ignore_index=True)

In [None]:
def clean_users(df_input):
    """
    Reduce a users dataframe to one row per user id.

    input:
        df_input: dataframe storing users information, with an 'id' column
    output:
        dataframe without duplicated ids; when an id occurs several times,
        only its last row is kept
    """
    is_older_duplicate = df_input.duplicated(subset="id", keep="last")
    return df_input[~is_older_duplicate]

In [None]:
def get_conversation(cid, df):
    """
    Select the rows that reference a given tweet.

    input:
        cid: conversation id
        df: dataframe with a 'reference_tweet_id' column
    output:
        subdataframe, the rows whose 'reference_tweet_id' equals cid
    """
    references_tweet = df['reference_tweet_id'] == cid
    return df.loc[references_tweet]


In [None]:
def get_tweet(ref_id, df_tweets):
    """
    Look up the cleaned text of the tweet with a given conversation id.

    input:
        ref_id: conversation id to look for
        df_tweets: dataframe of tweets with 'conversation_id' and 'clean_text' columns
    output:
        the 'clean_text' value when exactly one row matches,
        otherwise an empty list (no match, or an ambiguous one)
    """
    matches = df_tweets.loc[df_tweets['conversation_id'] == ref_id]
    if len(matches) == 1:
        return matches.iloc[0]['clean_text']
    # The fallback used to be a bare `[]` expression without `return`, so the
    # function silently returned None instead of the intended empty list.
    return []

In [None]:
# def remove_unknown_users_from_tweets(df_tweets, df_clean_users):
#     """
#     input:
#         df_tweets: dataframe of tweets
#         df_clean_users: dataframe of clean users,i.e, dataframe of users, each user one row.
#     output:
#         dataframe of tweets, the tweets with no author ID has been removed.
#     """
#     unwanted_author_ids = []
#     for author_id in df_tweets['author_id'].unique():
#         if len(df_clean_users.loc[df_clean_users['id']==author_id]) == 0:
#             unwanted_author_ids.append(df_tweets.loc[df_tweets['author_id']==author_id].index)
#     flatten_ids = [item for sublist in unwanted_author_ids for item in sublist]        
#     return df_tweets.drop(flatten_ids)
            

In [None]:
# def remove_unknown_users_conversation(df_tweets, df_conversation, df_users):
#     """
#     input:
#         df_tweets = loaded tweets, containing tweets of unknown users
#         df_users =  dataframe, containting users informations
#         df_conversation: dataframe of replies or quotes that contains replies/quotes to unknown user
#     output:
#         clean conversation dataframe, i.e replies and quotes of a tweet with known author
#     """
#     unknown_users = list(set(df_tweets['author_id'].unique()) - set(df_users['id'].unique()))
#     unvanted_idx = []
#     for user_id in unknown_users:
#          unvanted_idx.append(df_tweets.loc[df_tweets['author_id']==user_id].index)
#         for 

In [None]:
def get_semantic_polarity(text_string):
    """
    Label a text by the sign of its TextBlob polarity.

    input:
        text_string: text to analyse
    output:
        'positive' when the polarity is above zero,
        'neutral' when it is exactly zero,
        'negative' when it is below zero
    """
    polarity = TextBlob(text_string).polarity
    if polarity < 0:
        return 'negative'
    if polarity == 0:
        return 'neutral'
    if polarity > 0:
        return 'positive'
    # No comparison held (e.g. a NaN score): no label.
    return None
    
    
# Ad-hoc check of get_semantic_polarity on one sample sentence; as the last
# expression of the cell, the returned label is shown as the cell output.
get_semantic_polarity('This is What I try here, amazingly boring')

In [None]:
def get_all_conversation(cid, df):
    """
    Select every row that belongs to a conversation.

    input:
        cid: conversation id
        df: dataframe with a 'conversation_id' column
    output:
        subdataframe, the rows whose 'conversation_id' equals cid
    """
    in_conversation = df['conversation_id'] == cid
    return df.loc[in_conversation]



In [None]:
# TODO: you may need to delete this function
def get_replies_referenced_tweet_id(input_string):
    """
    Extract the id of the first referenced tweet from its string form.

    input:
        input_string: Python-literal text of a list of dicts, each with an 'id' key
    output:
        integer, the 'id' of the first list entry
    """
    references = ast.literal_eval(input_string)
    first_reference = references[0]
    return int(first_reference['id'])



In [None]:
def get_count_reply_like_quote_retweet(df_row):
    """
    Add up the four reaction counters of one tweet.

    input:
        df_row: a row from tweets dataframe, exposing retweet_count, like_count,
                quote_count and reply_count as attributes
    output:
        integer, the sum of the four counters, each converted with int() first
    """
    counter_names = ('retweet_count', 'like_count', 'quote_count', 'reply_count')
    total = 0
    for counter_name in counter_names:
        total += int(getattr(df_row, counter_name))
    return total



In [None]:
def add_polarity(df):
    """
    Compute the polarity label of every cleaned text.

    input:
        df: dataframe with a 'clean_text' column
    output:
        series of get_semantic_polarity labels, one per row;
        None (after printing an error message) when 'clean_text' is missing
    """
    # Only the column lookup is guarded. The previous bare `except:` also
    # swallowed failures raised while scoring the texts and reported them as a
    # missing column; those now propagate to the caller.
    try:
        clean_texts = df['clean_text']
    except KeyError:
        print('Error: Column "clean_text" does not exist')
        return None
    return clean_texts.apply(get_semantic_polarity)
    
    

In [None]:
def add_count_reaction(df):
    """
    Compute the total number of reactions per row.

    input:
        df: dataframe with 'like_count', 'quote_count', 'retweet_count' and
            'all_reply_count' columns
    output:
        series, row-wise sum of the four columns;
        None (after printing one error line per missing column) when any of
        the four columns is missing
    """
    required = ('like_count', 'quote_count', 'retweet_count', 'all_reply_count')

    # Explicit check instead of a bare `except:`: the old handler hid every
    # error and always blamed "all_reply_count", whichever column was absent.
    missing = [name for name in required if name not in df.columns]
    if missing:
        for name in missing:
            print(f'Error: Column "{name}" does not exist')
        return None

    # Column-wise addition gives the same totals as the row-wise apply without
    # a Python-level call per row.
    return df['like_count'] + df['quote_count'] + df['retweet_count'] + df['all_reply_count']
    
    

In [None]:
def add_emoji_group(df):
    """
    Apply extract_emoji_group to the raw text of every row.

    input:
        df: dataframe with a 'text' column
    output:
        series holding the extract_emoji_group result for each 'text' value
    """
    raw_texts = df['text']
    return raw_texts.apply(extract_emoji_group)



In [None]:
def add_count_all_reply(df, df_replies):
    """
    Count the replies found for every conversation in df.

    input:
        df: dataframe with a 'conversation_id' column
        df_replies: replies dataframe handed to get_all_reply
    output:
        series, for each 'conversation_id' the length of what get_all_reply returns

    NOTE(review): get_all_reply is not defined in the visible part of this
    notebook -- confirm it exists before relying on this function.
    """
    def count_replies(conversation_id):
        return len(get_all_reply(conversation_id, df_replies))

    return df['conversation_id'].apply(count_replies)

