In [42]:
# necessary imports
import os
import pandas as pd
import numpy as np
import glob
import plotly.express as px
from scipy import signal
from kolzur_filter import kz_filter, kzft
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings("ignore")

In [2]:
# set to *.csv to process all
path_to_csv = '../data/BTW17_Twitter/hashtags/*.csv'
file_list = glob.glob(path_to_csv)

df = pd.DataFrame()

# concatenate dataframes
for index in tqdm(range(len(file_list))):
    file = file_list[index]
    
    # read file to df if first file
    if index==0:
        df = pd.read_csv(file)
        df.drop("Unnamed: 0", axis=1, inplace=True)
    
    # append to df if not first file
    else:
        df2 = pd.read_csv(file)
        df2.drop("Unnamed: 0", axis=1, inplace=True)
        df = df.append(df2)

df = df.groupby(["date", "hashtag"], as_index=False).sum("count")
df.describe(include="all")

HBox(children=(FloatProgress(value=0.0, max=1308.0), HTML(value='')))




Unnamed: 0,date,hashtag,count
count,99167,99167,99167.0
unique,120,30595,
top,2017-09-24,berlin,
freq,1386,120,
mean,,,10.305565
std,,,84.4949
min,,,1.0
25%,,,1.0
50%,,,1.0
75%,,,4.0


In [86]:
# plot top 25 hashtags
top25 = df[["hashtag","count"]].groupby("hashtag", as_index=False).sum("count").nlargest(columns="count", n=25)
df.sort_values(by="date", inplace=True)
fig1 = px.line(df[df["hashtag"].isin(top25["hashtag"])], x="date", y="count", color="hashtag", title="top25 hashtags",
              template="simple_white", color_discrete_sequence=px.colors.qualitative.Antique)
fig1.show()

In [118]:
def plot_peak_detection(hashtag, k):
    wavelets = df[df["hashtag"]==hashtag][["date", "count"]]
    filtered_wavelets = [0] * len(wavelets)
    half_k = int(k/2)
    filtered_wavelets[half_k:-half_k] = kz_filter(wavelets["count"].to_numpy(), k, 1)
    wavelets["filtered_count"] = filtered_wavelets
    results_prom = []
    for i in range(1,11):
        peakind_loop = signal.find_peaks_cwt(filtered_wavelets, np.arange(1, len(wavelets)/i))
        prominences = signal.peak_prominences(filtered_wavelets, peakind_loop)
        results_prom.append(prominences[0].mean())
        
    id_max_prom = results_prom.index(max(results_prom)) + 1
    peakind = signal.find_peaks_cwt(filtered_wavelets, np.arange(1, len(wavelets)/id_max_prom))
    fig = px.line(wavelets, x="date", y=["count", "filtered_count"], title=hashtag,
                  template="simple_white", color_discrete_sequence=px.colors.qualitative.Antique)
    
    for item in peakind:
        peak = wavelets["date"].tolist()[item]
        fig.add_vrect(x0=str(datetime.strptime(peak, '%Y-%m-%d').date() - timedelta(days=3)),
                      x1=str(datetime.strptime(peak, '%Y-%m-%d').date() + timedelta(days=3)),
                      line_width=0,
                      fillcolor="grey",
                      opacity=0.2)
    print(f"\nmean peak prom: {prominences[0].mean()}")
    fig.show()

In [119]:
for hashtag in top25["hashtag"]:
    plot_peak_detection(hashtag, 7)


mean peak prom: 70.79591836734699



mean peak prom: 76.17142857142854



mean peak prom: 68.3035714285714



mean peak prom: 3.3214285714285694



mean peak prom: 42.10204081632653



mean peak prom: 16.66666666666666



mean peak prom: 0.0



mean peak prom: 428.5714285714285



mean peak prom: 16.142857142857146



mean peak prom: 272.39285714285717



mean peak prom: 5.163265306122449



mean peak prom: 0.0



mean peak prom: 1.2448979591836735



mean peak prom: 17.571428571428573



mean peak prom: 2.542857142857143



mean peak prom: 3.1190476190476186



mean peak prom: 6.3265306122448965



mean peak prom: 29.61904761904762



mean peak prom: 15.938775510204081



mean peak prom: 36.38775510204081



mean peak prom: 4.057142857142856



mean peak prom: 15.57142857142857



mean peak prom: 0.0



mean peak prom: 23.71428571428571



mean peak prom: 3.053571428571428


In [149]:
def peak_detection(hashtag, k):
    wavelets = df[df["hashtag"]==hashtag][["date", "count"]]
    filtered_wavelets = [0] * len(wavelets)
    half_k = int(k/2)
    filtered_wavelets[half_k:-half_k] = kz_filter(wavelets["count"].to_numpy(), k, 1)
    wavelets["filtered_count"] = filtered_wavelets
    results_prom = []
    for i in range(1,11):
        peakind_loop = signal.find_peaks_cwt(filtered_wavelets, np.arange(1, len(wavelets)/i))
        prominences = signal.peak_prominences(filtered_wavelets, peakind_loop)
        results_prom.append(prominences[0].mean())
        
    id_max_prom = results_prom.index(max(results_prom)) + 1
    peakind = signal.find_peaks_cwt(filtered_wavelets, np.arange(1, len(wavelets)/id_max_prom))
    
    return(peakind)

In [264]:
peak_df = pd.DataFrame(columns=["peak", "hashtag"])
hashtag_list = df["hashtag"].unique().tolist()

# remove incomplete time series
#num_days = df["date"].nunique()/2 # min 60 days
#complete_hashtags = []
#for index in tqdm(range(len(hashtag_list))):
#    hashtag = hashtag_list[index]
#    if df[df["hashtag"]==hashtag]["date"].nunique() >= num_days:
#        complete_hashtags.append(hashtag)
#    df_clean = df[df["hashtag"].isin(complete_hashtags)]
    
# get peak indices
for index in tqdm(range(len(complete_hashtags))):
    hashtag = complete_hashtags[index]
    results = peak_detection(hashtag, 7)
    if index == 1:
        peak_df["peak"] = pd.Series(results)        
        peak_df["hashtag"] = hashtag
    else:
        for item in results:
            peak_df = peak_df.append({"peak": item, "hashtag": hashtag}, ignore_index=True)
            
peak_df.dropna(inplace=True)

HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))




In [266]:
# save to csv
path_file = '../data/BTW17_Twitter/peaks/peaks.csv'
peak_df.to_csv(path_file)