In [None]:
import pandas as pd
import spacy
import string
import regex as re
import nltk
import ast
import copy
import glob
import advertools as adv
import plotly.graph_objects as go


pd.options.display.max_colwidth = 285



from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from spacymoji import Emoji
from nltk.corpus import stopwords
from urllib.parse import urlparse
from textblob import Word
from sklearn.metrics.pairwise import cosine_similarity


# Shared spaCy pipeline built from the small English model.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): this builds a standalone Emoji object; the pipeline component
# itself is registered by name on the next line. Presumably a leftover from an
# older spacymoji usage pattern -- confirm nothing later reads `emoji` before
# removing it.
emoji = Emoji(nlp)
# Insert the "emoji" component at the very front of the pipeline.
nlp.add_pipe("emoji", first=True)


# English Snowball stemmer shared by the notebook.
stemmer = nltk.SnowballStemmer("english")


# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Load data as dataframe

In [None]:
def get_df(paths, dtype):
    """
    Read several csv files and stack them into a single dataframe.

    input:
        paths: iterable of paths to csv files; they are read in sorted order
        dtype: passed unchanged to pd.read_csv as its dtype argument
    output:
        dataframe holding the rows of every file, with a fresh 0..n-1 index
    """
    frames = []
    for csv_path in sorted(paths):
        frames.append(pd.read_csv(csv_path, dtype=dtype))
    return pd.concat(frames, ignore_index=True)

In [None]:
def get_df_from_feather(paths):
    """
    Read several feather files and stack them into a single dataframe.

    input:
        paths: iterable of paths to feather files; they are read in sorted order
    output:
        dataframe holding the rows of every file. The index is not reset, so
        each file's own index labels are kept in the result.
    """
    frames = []
    for feather_path in sorted(paths):
        frames.append(pd.read_feather(feather_path))
    return pd.concat(frames)

In [None]:
def split_dataframe(df, chunk_size = 10000):
    """
    Split a dataframe into consecutive chunks of rows.

    input:
        df: dataframe to split (anything supporting len() and slicing works)
        chunk_size: maximum number of rows per chunk, must be positive
    output:
        list of chunks in the original row order; every chunk has chunk_size
        rows except possibly the last one. An empty df gives a list holding
        one empty chunk.
    raises:
        ValueError if chunk_size is not positive
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Ceiling division. The former `len(df) // chunk_size + 1` produced an
    # extra empty chunk whenever len(df) was an exact multiple of chunk_size.
    # max(1, ...) keeps the historical result for an empty input (one empty
    # chunk), so concatenating the returned list keeps working.
    num_chunks = max(1, -(-len(df) // chunk_size))
    return [df[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks)]

In [None]:
def get_in_between(df, start, end):
    """
    Keep the rows of df whose creation time lies between start and end.

    input:
        df: dataframe of tweets/replies/quotes with a 'created_at' column of
            strings formatted as '%Y-%m-%dT%H:%M:%S.%fZ'
        start: lower bound (exclusive), comparable with the parsed dates
        end: upper bound (exclusive), comparable with the parsed dates
    output:
        sub-dataframe of the matching rows:
            - if start < end: rows with start < date < end
            - otherwise: rows with date < end or date > start
    side effect:
        a 'date' column holding the parsed 'created_at' values is added to
        (or overwritten on) df itself
    """
    # `datetime` is not part of the notebook's import cell, so bring it into
    # scope here; without it the strptime call below raises NameError.
    from datetime import datetime

    created_at_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    df['date'] = df['created_at'].apply(lambda x: datetime.strptime(x, created_at_format))

    if start < end:
        return df.loc[(df['date'] < end) & (df['date'] > start)]
    else:
        # start is not before end: keep everything outside the [end, start] span
        return df.loc[(df['date'] < end) | (df['date'] > start)]





In [None]:
def get_sample_df(paths, pct, dtype, random_state=None):
    """
    Build a sample dataframe by sampling every csv file separately.

    input:
        paths: iterable of paths to csv files
        pct: fraction of rows to draw from each file (passed to
             DataFrame.sample as frac, e.g. 0.1 for 10% of the rows)
        dtype: passed unchanged to pd.read_csv as its dtype argument
        random_state: optional seed passed to DataFrame.sample for every
             file; leave as None for a different sample on each call
    output:
        sample dataframe with a fresh 0..n-1 index; for each csv file it
        holds the sampled share of that file's rows
    """
    sample_df_list = [
        pd.read_csv(path, dtype=dtype).sample(frac=pct, random_state=random_state)
        for path in paths
    ]

    return pd.concat(sample_df_list, ignore_index=True)

In [None]:
def get_sample_reply_quote(sample_df, df_conversation):
    """
    Gather the replies or quotes that belong to the sampled tweets.

    input:
        sample_df: sample of tweets, must have a 'conversation_id' column
        df_conversation: replies or quotes dataframe
    output:
        dataframe with a fresh index, made of the get_conversation result of
        every sampled conversation id, in sample order

    Prints the sample size first, then the processed share every 1000 tweets.
    """
    total = len(sample_df)
    print(total)

    matches = []
    for position, conversation_id in enumerate(sample_df['conversation_id']):
        matches.append(get_conversation(conversation_id, df_conversation))
        # progress report on every 1000th tweet (including the first one)
        if not position % 1000:
            print(position / total)

    return pd.concat(matches, ignore_index=True)

In [None]:
def clean_users(df_input):
    """
    Remove duplicated users.

    input:
        df_input: dataframe storing users information, with an 'id' column
    output:
        dataframe without duplicates, i.e. one row per user id; when an id
        appears several times its last row is the one kept
    """
    # True for every row whose id shows up again further down
    is_older_copy = df_input.duplicated(subset="id", keep="last")
    return df_input[~is_older_copy]

In [None]:
def get_conversation(cid, df):
    """
    Select the conversation attached to one tweet.

    input:
        cid: conversation id, compared against the 'reference_tweet_id' column
        df: dataframe with a 'reference_tweet_id' column
    output:
        sub-dataframe of the rows whose 'reference_tweet_id' equals cid
    """
    refers_to_cid = df['reference_tweet_id'] == cid
    return df.loc[refers_to_cid]


In [None]:
def get_tweet(ref_id, df_tweets):
    """
    Look up the cleaned text of the tweet that starts a conversation.

    input:
        ref_id: conversation id to look for
        df_tweets: dataframe with 'conversation_id' and 'clean_text' columns
    output:
        the 'clean_text' value when exactly one row has this conversation id,
        otherwise (no row or several rows) an empty list
    """
    matches = df_tweets.loc[df_tweets['conversation_id'] == ref_id]
    if len(matches) != 1:
        # The fallback used to be a bare `[]` expression, which returned
        # nothing (None); return the empty list explicitly.
        return []
    return matches.iloc[0]['clean_text']

In [None]:
def get_df_subset(df, conversation_list):
    """
    Restrict a dataframe to a set of conversations.

    input:
        df: dataframe with a 'conversation_id' column
        conversation_list: conversation ids to keep
    output:
        sub-dataframe of the rows whose 'conversation_id' is in conversation_list
    """
    is_selected = df['conversation_id'].isin(conversation_list)
    return df.loc[is_selected]

In [None]:
# def add_extended_features(df):
#     """
#     input:
#         df: dataframe of tweets, of sample of tweets, scored or unscored
        
#     output:
#         dataframe containing extended features
#     """ 
    
#     conversation_list = list(df['conversation_id'])
#     extended_tweets_cols = get_df_subset(EXTENDED_TWEETS, conversation_list)[EXTEXDED_COLS]
    
#     return pd.merge(df, extended_tweets_cols, on='conversation_id')

In [26]:
def add_extended_features(df):
    """
    Attach the extended tweet columns to a dataframe of tweets.

    input:
        df: dataframe of tweets, or of a sample of tweets, scored or unscored;
            must have a 'conversation_id' column whose values convert to int

    output:
        dataframe containing the extended features: df merged on
        'conversation_id' with the EXTEXDED_COLS columns of the matching
        EXTENDED_TWEETS rows
    """
    # ids are cast to int before the lookup in EXTENDED_TWEETS
    wanted_ids = list(map(int, df['conversation_id']))
    matching_rows = get_df_subset(EXTENDED_TWEETS, wanted_ids)
    extended_part = matching_rows[EXTEXDED_COLS]

    return pd.merge(df, extended_part, on='conversation_id')