# Reddit User Visualizer

In [None]:
# Searches Reddit for a single user and optionally keywords using the PushShift API via the python library psaw

# Designed to work in "Appmode": click the Appmode button in the toolbar, or replace 'notebooks' with 'apps' in this notebook's URL

# Appmode documentation: https://github.com/oschuett/appmode
# PSAW Documentation: https://github.com/dmarx/psaw
# PushShift API Reference: https://pushshift.io/api-parameters/

In [None]:
## Default Configuration
# Tunable defaults; the widgets and chart helpers defined in later cells read these values.
import datetime as dt

# Default date aggregation for the time-series charts: 'M' for month, 'D' for day
# (the aggregation dropdown also offers 'Y').
agg_level = 'M'
# Default height/width handed to the chart functions and used to seed the size widgets.
default_chart_height = 800
default_chart_width = 1200
# Default height of the bokeh score plot (reddit_score_timeseries_click_plot).
score_bubble_height=1600
# Passed to qgrid as the 'maxVisibleRows' grid option.
maxVisibleRows = 25
# Initial values of the Start Date / End Date pickers and defaults of get_author_history.
default_min_date = dt.date(2019, 1, 1)
default_max_date = dt.date(2020, 1, 1)
# Initial value of the 'Query Limit' widget.
default_limit = 10000
# Initial content of the author text box.
author=''

In [None]:

# Imports and options setting
from collections import OrderedDict
from urllib.parse import urlparse

# pandas, numpy
import numpy as np
import pandas as pd

# psaw
from psaw import PushshiftAPI
api = PushshiftAPI()

# plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import express as px

# qgrid
import qgrid
qgrid.enable()

# bokeh
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure,show
from bokeh.models import ColumnDataSource, OpenURL, TapTool, HoverTool, Jitter, Panel, Tabs
from bokeh.transform import factor_cmap, factor_mark, jitter
from bokeh.models.ranges import FactorRange
import bokeh.resources

output_notebook(bokeh.resources.INLINE,verbose=False, hide_banner=True)

# ipywidgets
import ipywidgets as widgets
from ipywidgets import interactive, interact, HBox, Layout,VBox, Select, interact_manual, Output, Tab
from IPython.display import FileLink, FileLinks
from IPython.core.display import display, HTML, clear_output


In [None]:
# Change default display
# Inject CSS so the notebook container spans the full width and the header/notebook padding is removed.
display(HTML("<style>.container { width:100% !important; } .header {padding-top:0px !important;} #notebook {padding-top:0px !important;} #notebook-container {padding:0px !important;}</style>"))
#display(HTML('<script> var x = document.getElementById("header");x.remove(x.selectedIndex);'))
# Raise the pandas display limits (rows, columns, total width) for any frame printed in the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
# Module-level holder for the fetched author history; on_fetch_data_button rebinds it via `global df`.
df = pd.DataFrame()

def score_by_subreddit_and_time(df,height=default_chart_height,width=default_chart_width):
    """Bubble chart of posts over time per subreddit, sized by absolute score.

    Parameters
    ----------
    df : DataFrame with 'score', 'created_utc', 'subreddit', 'post_type' and 'text' columns.
    height, width : chart size forwarded to plotly express.

    Returns
    -------
    The figure produced by ``px.scatter``; the marker symbol distinguishes
    negative from non-negative scores.
    """
    # Derive the helper columns on a new frame so the caller's data is left untouched.
    plot_data = df.assign(
        abs_score=np.abs(df['score']),
        is_negative_score=(df['score']<0),
    )
    # Size comes from the arguments, not from the module-level chart_width/chart_height widgets.
    return px.scatter(plot_data,x='created_utc',y='subreddit',color='post_type',size='abs_score',
                      hover_name='text',symbol='is_negative_score',width=width,height=height)

def link_domains_reporting_over_time (df,height=default_chart_height,width=default_chart_width):
    """Stacked daily bar chart of linked domains grouped by their 'reporting' label.

    Parameters
    ----------
    df : DataFrame with 'url' and 'created_utc' (datetime) columns.
    height, width : chart size forwarded to plotly express.

    Returns
    -------
    The figure produced by ``px.bar`` (one bar segment per day and reporting label).
    """
    # Copy first: the 'domain' column is overwritten below and the caller's frame must keep its own.
    df = df.copy()
    df['domain'] = None
    has_url = df['url'].notnull()
    df.loc[has_url,'domain'] = [urlparse(i).netloc for i in df.loc[has_url,'url'].values]
    # Explicit regex=True (the pandas default changed) and an escaped dot so only a literal "www." prefix is stripped.
    df['domain'] = df['domain'].str.replace(r'^www\.','',regex=True)
    # NOTE(review): bias_df is not defined in the visible notebook; it is expected to provide
    # 'domain' and 'reporting' columns - confirm where it is loaded before using this function.
    df = df.merge(bias_df,on='domain',how='left')
    df['reporting'] = df['reporting'].fillna('UNKNOWN')
    df['day'] = df['created_utc'].dt.floor('D')
    # Row count per (day, reporting) without going through an unrelated column.
    plot_data = df.groupby(['day','reporting']).size().reset_index(name='count')
    # Size comes from the arguments, not from the module-level chart_width/chart_height widgets.
    return px.bar(plot_data,x='day',y='count',color='reporting',width=width,height=height)

def plot_karma_by_subreddit(times, lg=False,height=default_chart_height,width=default_chart_width):
    """Violin plot of post scores for the ten subreddits with the most posts.

    Parameters
    ----------
    times : DataFrame with 'subreddit', 'score' and 'post_type' columns.
    lg : when true, plot a signed log2 of the score instead of the raw score.
    height, width : chart size forwarded to plotly express.

    Returns
    -------
    The figure produced by ``px.violin``.
    """
    top_subreddits = times.subreddit.value_counts().head(10).index.tolist()
    top_posts = times[times.subreddit.isin(top_subreddits)].copy()

    score_column = "score"
    if lg:
        # sign(score) * log2(|score| + 1): keeps the sign and stays defined at zero.
        top_posts['log_score'] = np.sign(top_posts.score)*np.log2(np.abs(top_posts.score)+1)
        score_column = "log_score"

    return px.violin(top_posts, y=score_column, x="subreddit", color='post_type', box=True, points="all",width=width,height=height)


def author_to_karma_chart(data,width=default_chart_width,height=default_chart_height):
    """Line chart of cumulative score over time, one line per post type.

    Parameters
    ----------
    data : DataFrame with 'post_type', 'created_utc' and 'score' columns.
    width, height : chart size forwarded to plotly express.

    Returns
    -------
    The figure produced by ``px.line`` with 'karma' on the y axis.
    """
    # Running totals are computed separately for comments and submissions, each in time order.
    comments = data[data['post_type']=='comment'].sort_values('created_utc').reset_index()
    submissions = data[data['post_type']=='submission'].sort_values('created_utc').reset_index()
    comments['comment_score'] = comments['score'].cumsum()
    submissions['submission_score'] = submissions['score'].cumsum()

    combined = pd.concat([comments,submissions],ignore_index=True,sort=False)
    combined = combined.sort_values('created_utc').reset_index()

    # Each row only carries the running total of its own type; the other one is NaN until filled with 0.
    combined = combined.assign(
        submission_karma=combined['submission_score'],
        comment_karma=combined['comment_score'],
    ).fillna(0)

    # 'karma' is the running total that matches the row's post type.
    is_submission = combined['post_type']=='submission'
    combined['karma'] = combined['comment_karma'].where(~is_submission, combined['submission_karma'])
    return px.line(combined,width=width,height=height,x='created_utc',y='karma',color='post_type')

def links_over_time (times,height=default_chart_height,width=default_chart_width, agg_level = agg_level):
    """Stacked bar chart of how many posts link to each domain per period.

    Parameters
    ----------
    times : DataFrame with 'created_utc' (datetime) and 'domain' columns;
        rows whose domain is the empty string are ignored.
    height, width : chart size forwarded to plotly express.
    agg_level : 'D', 'M' or 'Y' - the period used for the x axis.

    Returns
    -------
    The figure produced by ``px.bar``.
    """
    times = times.sort_values('created_utc').reset_index()
    # .copy() so the column assignments below do not write into a filtered slice.
    times = times[times.domain!=''].copy()
    # Drop a trailing "/r/<name>..." part from the domain. regex=True is explicit because
    # newer pandas treats the pattern as a literal string by default.
    times['domain'] = times['domain'].str.replace(r'/r/[a-zA-Z0-9_\-]+(.*)$','',regex=True)
    times['D'] = times['created_utc'].dt.floor('D')
    times['M'] = times['created_utc'].dt.to_period('M').dt.to_timestamp()
    times['Y'] = times['created_utc'].dt.to_period('Y').dt.to_timestamp()
    # Row count per (period, domain) without going through an unrelated column.
    plot_data = times.groupby([agg_level,'domain']).size().reset_index(name='count')
    return px.bar(plot_data,width=width,height=height,x=agg_level,y='count',color='domain')

def author_subreddits_over_time(times,width=default_chart_width,height=default_chart_height,agg_level = agg_level):
    """Bar chart of post counts per subreddit over time, one facet row per post type.

    Parameters
    ----------
    times : DataFrame with 'created_utc' (datetime), 'subreddit', 'post_type'
        and 'created' columns ('created' is only used to carry the row count).
    width, height : chart size forwarded to plotly express.
    agg_level : 'D', 'M' or 'Y' - the period used for the x axis.

    Returns
    -------
    The figure produced by ``px.bar``.
    """
    ordered = times.sort_values('created_utc').reset_index()
    stamps = ordered['created_utc']
    # One column per supported aggregation level; agg_level selects which one is grouped on.
    ordered['D'] = stamps.dt.floor('D')
    ordered['M'] = stamps.dt.to_period('M').dt.to_timestamp()
    ordered['Y'] = stamps.dt.to_period('Y').dt.to_timestamp()

    group_keys = [agg_level,'subreddit','post_type']
    # agg(len) puts the group size in every remaining column; 'created' is kept as the count.
    counts = ordered.groupby(group_keys).agg(len).reset_index()[group_keys + ['created']]
    counts.columns = group_keys + ['count']
    return px.bar(counts,width=width,height=height,x=agg_level,y='count',color='subreddit',facet_row='post_type')

def author_subreddits_bar(times,height=default_chart_height,width=default_chart_width):
    """Bar chart of post counts per post type, coloured by subreddit.

    Parameters
    ----------
    times : DataFrame with 'subreddit', 'post_type' and 'created' columns
        ('created' is only used to carry the row count).
    height, width : chart size forwarded to plotly express.

    Returns
    -------
    The figure produced by ``px.bar``.
    """
    grouped = times.groupby(['subreddit','post_type']).agg(len).reset_index()
    counts = grouped[['subreddit','post_type','created']]
    counts.columns = ['subreddit','post_type','count']
    # Largest groups first.
    counts = counts.sort_values('count', ascending=False)
    return px.bar(counts,y='count',color='subreddit',x='post_type',width=width,height=height)

def author_to_timeseries_chart(times,height=default_chart_height,width=default_chart_width):
    """Line chart of the daily number of comments and submissions.

    Days without any post are included with a count of 0 so the lines are continuous.

    Parameters
    ----------
    times : DataFrame with 'created_utc' (datetime) and 'post_type' columns.
    height, width : chart size forwarded to plotly express.

    Returns
    -------
    The figure produced by ``px.line``.
    """
    created = times['created_utc']
    day = created.dt.floor('D')
    # assign() builds a new frame, so the caller's data does not gain a 'day' column.
    plot_dat = times.assign(day=day).groupby(['post_type','day']).size().reset_index(name='count')

    # Every day in the covered range, once per post type.
    all_days = pd.date_range(day.min(),created.dt.ceil('D').max(),freq='D')
    all_dates = pd.concat(
        [pd.DataFrame({'day': all_days, 'post_type': post_type}) for post_type in ('submission','comment')],
        ignore_index=True,sort=False)

    # Right merge keeps every (day, post_type) pair; missing counts become 0.
    plot_dat = plot_dat.merge(all_dates,how='right',on=['day','post_type'])
    plot_dat = plot_dat.fillna(0).sort_values('day')
    return px.line(plot_dat,x='day',y='count',color='post_type',title='Number of Comments and Submissions Daily',width=width,height=height)

def author_time_of_day(times,width=default_chart_width,height=default_chart_height):
    """Bar chart of post counts per hour of day, one facet row per post type.

    The hour is taken from 'created_utc' and shifted by a fixed -5 hours (labelled EST).

    Parameters
    ----------
    times : DataFrame with 'created_utc' (datetime) and 'post_type' columns.
    width, height : chart size forwarded to plotly express.

    Returns
    -------
    The figure produced by ``px.bar``.
    """
    # Use the datetime accessor rather than slicing the string form of the timestamp,
    # which has no time part when pandas renders date-only strings.
    hr_est = (times['created_utc'].dt.hour - 5) % 24
    # assign() builds a new frame, so the caller's data does not gain an 'hr_est' column.
    plot_data = times.assign(hr_est=hr_est).groupby(['hr_est','post_type']).size().reset_index(name='count')
    plot_data = plot_data.rename(columns={'hr_est':'hr_EST'})
    return px.bar(plot_data,width=width,height=height,x='hr_EST',y='count',facet_row='post_type')

def author_day_of_week(times,width=default_chart_width,height=default_chart_height):
    """Bar chart of post counts per weekday, one facet row per post type.

    Parameters
    ----------
    times : DataFrame with 'created_utc' (datetime) and 'post_type' columns.
    width, height : chart size forwarded to plotly express.

    Returns
    -------
    The figure produced by ``px.bar`` with the weekdays ordered Monday to Sunday.
    """
    weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    # assign() builds a new frame, so the caller's data does not gain a 'weekday' column.
    with_weekday = times.assign(weekday=times['created_utc'].dt.day_name())
    plot_data = with_weekday.groupby(['weekday','post_type']).size().reset_index(name='count')
    # groupby returns the names alphabetically; category_orders restores calendar order on the axis.
    return px.bar(plot_data,width=width,height=height,x='weekday',y='count',facet_row='post_type',category_orders={'weekday':weekday_order})

def reddit_score_timeseries_click_plot(df,height=score_bubble_height,width=default_chart_width):
    """Bokeh scatter of posts over time per subreddit; tapping a point opens its permalink.

    Marker size grows with the log of the absolute score, colour is red for
    negative scores and blue otherwise, and the marker shape separates comments
    from submissions.

    Parameters
    ----------
    df : DataFrame with 'score', 'subreddit', 'created_utc', 'post_type',
        'permalink', 'title', 'text' and 'url' columns. 'permalink' may be a
        plain URL or the ``<a href="...">`` markup built for the data table.
    height, width : plot size.

    Returns
    -------
    The bokeh figure (not yet shown).
    """
    # Helper series are kept local so the caller's frame does not gain extra columns.
    # +1 keeps log2 defined for a score of 0; 4 is the minimum marker size.
    marker_size = 4+2*np.log2(np.abs(df['score'])+1)
    is_negative_score = (df['score']<0)

    yrange = FactorRange(factors = df['subreddit'].unique().tolist())
    MARKERS = ['circle','hex']
    SUBMISSION = ['comment', 'submission']

    p = figure(plot_width=width, plot_height=height,tools=['pan','tap','reset','box_zoom','hover'], active_drag="box_zoom",
         y_range=yrange, x_axis_type='datetime', title="Reddit Post Explorer")

    source = ColumnDataSource(data=dict(
        x = df['created_utc'],
        y = df['subreddit'],
        x_str = pd.to_datetime(df['created_utc']).astype('str'),
        subreddit = df['subreddit'],
        submission = df['post_type'],
        # Strip the anchor markup around the link so OpenURL receives a bare URL.
        # regex=True is explicit because newer pandas treats the pattern as a literal by default.
        permalink = df['permalink'].str.replace(r'^<a href="|" target.*$| &quot.*$','',regex=True),
        size=marker_size,
        title=df['title'],
        score=df['score'],
        text=df['text'],
        url = df['url'],
        color= is_negative_score.map({True:'red',False:'blue'})
        ))

    # Jitter spreads the points vertically inside each subreddit band.
    p.scatter(x='x', y=jitter('y',width=0.6, range = yrange),  marker=factor_mark('submission', MARKERS, SUBMISSION), color = 'color', alpha=0.2,size='size',source=source)

    hover = p.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ('subreddit', '@subreddit'),
        ('score', '@score'),
        ('url', '@url'),
        ('title', '@title'),
        ('text', '@text'),
        ('date', '@x_str'),
        ('post_type','@submission')
    ])

    taptool = p.select(type=TapTool)
    taptool.callback = OpenURL(url='@permalink')
    return(p)


def author_to_karma_pies(df,width=default_chart_width,height=default_chart_height):
    """Two pie charts of total score per subreddit: submissions (left) and comments (right).

    Parameters
    ----------
    df : DataFrame with 'post_type', 'subreddit' and 'score' columns.
    width, height : figure size.

    Returns
    -------
    A plotly figure with one pie trace per post type.
    """
    def _score_totals(post_type):
        # Sum only 'score': aggregating every column also tries to add up text and
        # datetime columns, which newer pandas rejects.
        subset = df[df.post_type==post_type]
        return subset.groupby('subreddit')['score'].sum().reset_index()

    submission_totals = _score_totals('submission')
    comment_totals = _score_totals('comment')

    # Create subplots: use 'domain' type for Pie subplot
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

    fig.add_trace(go.Pie(labels=submission_totals['subreddit'], values=submission_totals['score'], name = 'Submission Score'),1,1)
    fig.add_trace(go.Pie(labels=comment_totals['subreddit'], values=comment_totals['score'], name = 'Comment Score'),1,2)

    fig.update_layout(
        width=width,
        height=height,
        title_text="Karma Sources (Subreddits)",
        # Label each pie at its centre.
        annotations=[dict(text='Submissions', x=0.18, y=0.5, font_size=20, showarrow=False),
                     dict(text='Comments', x=0.82, y=0.5, font_size=20, showarrow=False)])
    return(fig)

def get_author_history(author,min_date = default_min_date,max_date = default_max_date):
    """Fetch an author's submissions and comments from Pushshift as one DataFrame.

    Besides its arguments the function reads two module-level widgets:
    ``text_search_widget.value`` (optional search text, lower-cased and sent as
    ``q``) and ``query_limit.value`` (result limit per query). Requests go
    through the module-level ``api`` client.

    Parameters
    ----------
    author : user name to search for.
    min_date, max_date : ``datetime.date`` (or ``datetime.datetime``) bounds,
        sent as the ``after`` / ``before`` epoch seconds.

    Returns
    -------
    DataFrame with one row per post. 'post_type' is 'submission' or 'comment',
    'text' holds the submission selftext or the comment body, 'created_utc' is
    converted to datetime and 'permalink' is prefixed with https://reddit.com.
    When nothing is found an empty frame with the columns 'created_utc',
    'permalink', 'score', 'subreddit', 'url', 'created', 'post_type', 'text'
    is returned.
    """
    def _to_epoch(day):
        # Local-midnight epoch seconds: same meaning as strftime("%s") but without
        # relying on that platform-specific format code.
        if not isinstance(day, dt.datetime):
            day = dt.datetime.combine(day, dt.time.min)
        return int(day.timestamp())

    after = _to_epoch(min_date)
    before = _to_epoch(max_date)
    search_text = text_search_widget.value

    def _search(search_fn, fields, post_type):
        # Run one Pushshift query and tag the rows; returns None when there are no results.
        query = dict(author=author, after=after, before=before, limit=query_limit.value, filter=fields)
        if len(search_text) > 0:
            query['q'] = search_text.lower()
        records = [i.d_ for i in search_fn(**query)]
        if not records:
            return None
        frame = pd.DataFrame(records)
        frame['post_type'] = post_type
        return frame

    common_fields = ['subreddit','created_utc','score','controversiality','url','permalink','title','domain','num_comments','num_crossposts','post_hint']
    # Separate lists per query: extending a shared list would leak fields from one query into the other.
    submissions = _search(api.search_submissions, common_fields + ['selftext'], 'submission')
    comments = _search(api.search_comments, common_fields + ['body'], 'comment')

    frames = [frame for frame in (submissions, comments) if frame is not None]
    if not frames:
        return(pd.DataFrame([],columns=['created_utc','permalink','score','subreddit','url','created','post_type','text']))

    # Works for submissions only, comments only, or both.
    df = pd.concat(frames,ignore_index=True,sort=False)

    # A result set of a single post type lacks the other type's text column (and comment-only
    # results lack 'url'); add them as empty object columns so the steps below and the
    # data-table code can rely on them.
    for column in ('selftext','body','url'):
        if column not in df.columns:
            df[column] = None
    df['text'] = df['selftext'].combine_first(df['body'])
    df = df.drop(columns=['body','selftext'])

    df['created_utc'] = pd.to_datetime(df['created_utc'],unit='s')
    df['permalink'] = 'https://reddit.com'+df['permalink']

    # Fill the submission-only fields for comment rows, creating the column if no row had it.
    fill_values = {'title':'','domain':'','post_hint':'none','num_crossposts':0,'num_comments':0}
    for column, fill_value in fill_values.items():
        df[column] = df[column].fillna(fill_value) if column in df.columns else fill_value
    return(df)

def interactive_manual(f, *args, **kwargs):
    """Build an ipywidgets ``interactive`` for ``f`` with the manual and auto_display options on.

    Keyword arguments are forwarded to ``interactive``; positional ``*args`` are
    accepted but not forwarded.
    """
    manual_options = {"manual":True, "auto_display": True}
    return interactive(f, manual_options, **kwargs)

# Integer inputs for the chart size; their .value is read when the charts are (re)drawn.
chart_height = widgets.IntText(value=default_chart_height,description='Chart Height')
chart_width = widgets.IntText(value=default_chart_width,description='Chart Width')


def update_charts(agg_level=agg_level,chart_height=chart_height.value,chart_width=chart_width.value):
    """Redraw every chart tab from the rows currently shown in the data grid.

    Reads the frame from the module-level ``mygrid`` (``get_changed_df()``) and
    renders each chart into its Output widget after clearing it.

    Parameters
    ----------
    agg_level : period passed to the time-series charts.
    chart_height, chart_width : size passed to every chart.
    """
    df = mygrid.get_changed_df()
    size = dict(width=chart_width, height=chart_height)

    # The bokeh plot is rendered with show(); all the others are plotly figures.
    with out:
        clear_output()
        show(reddit_score_timeseries_click_plot(df, **size))

    # (output widget, chart builder) in the original drawing order.
    plotly_panels = [
        (out2, lambda: author_subreddits_over_time(df, agg_level=agg_level, **size)),
        (out3, lambda: author_time_of_day(df, **size)),
        (out4, lambda: author_day_of_week(df, **size)),
        (out5, lambda: author_to_karma_pies(df, **size)),
        (out6, lambda: author_to_karma_chart(df, **size)),
        (out8, lambda: links_over_time(df, agg_level=agg_level, **size)),
    ]
    for panel, build_chart in plotly_panels:
        with panel:
            clear_output()
            display(build_chart())

        
# One Output widget per tab; update_charts / on_fetch_data_button render into them.
out = Output()   # bokeh post chart
out2 = Output()  # subreddit time series
out3 = Output()  # time of day
out4 = Output()  # days of week
out5 = Output()  # karma sources (pies)
out6 = Output()  # karma growth
out7 = Output()  # data table (qgrid)
out8 = Output()  # domain time series

# Holds the CSV download links created after each fetch.
file_link_output = Output()

t = Tab(layout = Layout(display='flex',
                    flex_flow='column',
                    width='80%'))

# Attach the children before naming the tabs: newer ipywidgets releases tie the titles
# to the existing children, so titles set on an empty Tab would be lost or rejected.
# The order here defines the tab index used by set_title below.
t.children = [out7, out2, out8, out, out3, out4, out5, out6]

t.set_title(0,'Data Table')
t.set_title(1,'Subreddit Timeseries')
t.set_title(2,'Domain Timeseries')
t.set_title(3,'Post Chart')
t.set_title(4,'Time of Day')
t.set_title(5,'Days of Week')
t.set_title(6,'Karma Sources')
t.set_title(7,'Karma Growth')

# --- Query controls -------------------------------------------------------
# Label and text box for the author name; the box starts with the configured `author`.
author_description = widgets.HTML("Enter author to search for:")
author_box = widgets.Text(value=author)

# Label shown above the 'Update Charts' button.
filter_button_Description = widgets.HTML("Apply table filters to charts")

# Buttons; their click handlers are registered at the end of the cell.
b_filter_data = widgets.Button(description='Update Charts')
b_fetch_data = widgets.Button(description='Fetch Data for Author')

# Aggregation level for the time-series charts (day / month / year).
selct_date_agg_level = widgets.Dropdown(options=['D','M','Y'],value=agg_level)

# Optional search text and result limit; both are read by get_author_history.
text_search_widget = widgets.Text('')
query_limit = widgets.IntText(value=default_limit,description='Query Limit')


# Date range of the query.
min_date = widgets.DatePicker(
    description='Start Date',
    value = default_min_date,
    disabled=False
)

max_date = widgets.DatePicker(
    description='End Date',
    value = default_max_date,
    disabled=False
)

# Left-hand control pane, top to bottom: query section, chart section, download links.
param_pane = [widgets.HTML('<h3>Query PushShift</h3>'),
              author_description,
              author_box,
              query_limit,
              widgets.HTML('Search Text'),
              text_search_widget,
              min_date,
              max_date,
              b_fetch_data,
              widgets.HTML('<hr>'),
              widgets.HTML('<h3>Update Charts</h3>'),
              widgets.HTML('Date Aggregation Level'),
              selct_date_agg_level,
              chart_width,
              chart_height,
              filter_button_Description,
              b_filter_data,
              widgets.HTML('<hr><h3>File Download Links</h3>'),
             file_link_output]

# Overall layout: control pane (20% width) next to the chart tabs `t`.
mycolumns = widgets.HBox([widgets.VBox(param_pane,layout=Layout(display='flex',flex_flow='column',width='20%')),t])

def on_button_clicked(b):
    """Click handler for 'Update Charts': redraw the charts with the current widget settings."""
    update_charts(
        agg_level=selct_date_agg_level.value,
        chart_height=chart_height.value,
        chart_width=chart_width.value,
    )
    
def on_fetch_data_button(b):
    """Click handler for 'Fetch Data for Author'.

    Fetches the author's history into the module-level ``df``, shows it in a
    qgrid table (module-level ``mygrid``) inside the data-table tab, redraws
    the charts, then writes the data to a CSV file and displays a download link.
    """
    import re  # local import: only needed to sanitise the export file name

    global df, mygrid
    df = get_author_history(author = author_box.value,min_date = min_date.value, max_date = max_date.value)

    # Column widths for the qgrid table.
    col_defs = {
        'text': {'width': 450},
        'index': {'width': 10},
        'permalink': {'width': 80},
        'score': {'width': 80},
        'url': {'width': 250},
        'post_type': {'width': 100},
    }
    with out7:
        clear_output()
        # Decorate a copy: the module-level df (and therefore the CSV export) keeps the raw links.
        df_to_show = df.copy()
        df_to_show['permalink'] = "<a href=\""+df_to_show['permalink']+"\" target=\"_blank\">link</a>"
        # Link text is the URL without its leading http(s)://www. prefix.
        df_to_show['url'] = "<a href=\""+df_to_show['url']+"\" target=\"_blank\">"+df_to_show['url'].str.replace(r'^https://www\.|^http://www\.','',regex=True)+"</a>"
        df_to_show = df_to_show.sort_values('created_utc')
        # Rows without a URL end up as NaN after the concatenation above.
        df_to_show['url'] = df_to_show['url'].fillna('')
        mygrid = qgrid.show_grid(df_to_show,column_definitions=col_defs,show_toolbar=False,grid_options={'forceFitColumns': False, 'editable':False, 'maxVisibleRows': maxVisibleRows, 'minVisibleRows': 8 })
        display(mygrid)
    update_charts(agg_level = selct_date_agg_level.value,chart_height=chart_height.value,chart_width=chart_width.value)
    with file_link_output:
        # The author text is free user input; keep only letters, digits, '_' and '-'
        # so it cannot introduce path separators into the file name.
        safe_author = re.sub(r'[^A-Za-z0-9_-]', '_', author_box.value)
        csv_name = 'reddit_user_data_for_'+safe_author+'.csv'
        df.to_csv(csv_name)
        display(FileLink(csv_name))
        
# Register the click handlers for the two buttons.
b_fetch_data.on_click(on_fetch_data_button)
    
b_filter_data.on_click(on_button_clicked)

# Last expression of the cell: renders the control pane and the chart tabs.
mycolumns
