In [1]:
import os
import time
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import praw
import seaborn as sns
from nltk.corpus import stopwords
from psaw import PushshiftAPI
from sklearn.feature_extraction.text import CountVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


## Helper Functions

In [127]:
def print_sentiment_scores(sentence, verbose=True, analyzer=SentimentIntensityAnalyzer()):
    """
        Scores a piece of text with ``analyzer.polarity_scores`` and hands
        back both the text and its scores.

        The default ``analyzer`` is created once, when this function is
        defined, and is then shared by every call that does not pass its own.
        --------
        sentence: text to score
        verbose:  when truthy, also print the text (left-aligned, padded with
                  '-' to 40 characters) followed by the scores
        analyzer: any object exposing ``polarity_scores(text)``

        Returns the tuple ``(sentence, scores)``.
    """
    scores = analyzer.polarity_scores(sentence)
    result = (sentence, scores)

    if not verbose:
        return result

    report_line = "{:-<40} {}".format(sentence, str(scores))
    print(report_line)
    return result


def top_tokens(df, vectorizer, independent_var, number_of_tokens=10):
    """
        Fits ``vectorizer`` on ``df[independent_var]``, draws a bar chart of
        the most frequent tokens on a new matplotlib figure, and returns the
        token counts.
        --------
        df:               DataFrame holding the text column
        vectorizer:       object with ``fit_transform`` and either
                          ``get_feature_names_out`` or ``get_feature_names``
                          (e.g. a scikit-learn CountVectorizer); it is
                          (re)fitted by this call
        independent_var:  name of the text column in ``df``
        number_of_tokens: how many of the most frequent tokens to keep

        Returns a DataFrame indexed by token with a single ``Token_Count``
        column, sorted from most to least frequent.
    """
    import numpy as np  # local import: the notebook's import cell does not load numpy

    term_matrix = vectorizer.fit_transform(df[independent_var])

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new name and fall back for old installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Sum each column directly on the matrix returned by fit_transform instead
    # of materialising a dense documents-x-vocabulary array just to add it up.
    totals = np.asarray(term_matrix.sum(axis=0)).ravel()

    token_counts = (pd.Series(totals, index=feature_names)
                    .sort_values(ascending=False)
                    .head(number_of_tokens)
                    .to_frame('Token_Count'))

    plt.figure()
    sns.barplot(x='Token_Count', y=token_counts.index, data=token_counts)

    return token_counts


## API Call

In [175]:
# Query configuration for the Pushshift search performed in the next cell.
#
# SECURITY: a Reddit client id / client secret used to be pasted here as
# comments. Never commit credentials, even commented out; read them from the
# environment (e.g. os.environ) if they are ever needed.
# NOTE(review): the previously committed secret should be treated as leaked
# and rotated.
# my_user_agent = 'BEG_Scraper_TEST'
sub_reddit = 'all'
kw_list = ["Self Driving Cars", "Artificial Intelligence",
           "IOT", "Big Data"]

search_term = kw_list[2]  # keyword searched for and embedded in cache file names
limit = 1000              # maximum submissions requested per API call

# Set Date Ranges
start_day = 1
start_month = 7
start_year = 2018

stop_day = 31
stop_month = 10
stop_year = 2018

# The datetimes below are naive, so .timestamp() interprets them in the
# machine's local timezone.
start_epoch = int(datetime(start_year, start_month, start_day).timestamp())
# Fixed: this line used the undefined name `end_year` (NameError on a fresh kernel).
end_epoch = int(datetime(stop_year, stop_month, stop_day).timestamp())

In [176]:
# Pull submissions matching `search_term` in windows that walk backwards from
# `end_epoch` towards `start_epoch`. Every window is cached twice under ./Data:
#   reddit_<term>_analysis_<window end>.pkl   raw titles/authors for the window
#   reddit_full_<term>_<oldest day>.pkl       per-day sentiment aggregates
# After the loop `final_df` holds the aggregates of the LAST window processed
# only; the other windows are available through their pickle files.
os.makedirs("./Data", exist_ok=True)  # to_pickle does not create missing directories

final_df = pd.DataFrame()  # keeps the closing .head() valid if nothing is fetched
pull_count = 1
while start_epoch < end_epoch:
    print("API CALL: {} at {}/{} ".format(pull_count, stop_month, stop_day))
    pull_count += 1
    window_end_epoch = end_epoch  # remembered to detect a stalled pagination

    raw_path = "./Data/reddit_%s_analysis_%s.pkl" % (search_term, end_epoch)
    try:
        print("ATTEMPTING TO RELOAD SAVED QUERY...")
        # NOTE: unpickling can execute arbitrary code; only load files this
        # notebook wrote itself.
        df = pd.read_pickle(raw_path)

    except FileNotFoundError:
        print("Querying Reddit API...")

        api = PushshiftAPI()
        df = pd.DataFrame(list(api.search_submissions(after=start_epoch,
                                                      before=end_epoch,
                                                      q=search_term,
                                                      filter=['title', 'author'],
                                                      limit=limit)))
        if not df.empty:
            # Epoch seconds -> 'YYYY-MM-DD' (UTC) strings, vectorised.
            df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.strftime('%Y-%m-%d')
            # Helper columns are dropped when present; do not fail if absent.
            df = df.drop(['created', 'd_'], axis=1, errors='ignore')
            df.to_pickle(raw_path)  # Save final df locally

    if df.empty:
        # No rows means no 'created_utc'/'title' columns to work with.
        print("No submissions returned for this window; stopping.")
        break

    # Workaround for pagination: the next window ends on the day of the last
    # row of this one.
    # NOTE(review): assumes rows come back newest-first so the last row is the
    # oldest - confirm against the API's default sort order.
    oldest_date = pd.to_datetime(df['created_utc'].iloc[-1])
    stop_day, stop_month, stop_year = oldest_date.day, oldest_date.month, oldest_date.year
    end_epoch = int(datetime(stop_year, stop_month, stop_day).timestamp())

    # Run Sentiment Analysis: one score dict per title, assembled in one go
    # instead of growing a DataFrame row by row.
    title_scores = [print_sentiment_scores(title, verbose=False)[1] for title in df['title']]
    sentiment_df = pd.DataFrame(title_scores, index=df.index,
                                columns=['compound', 'neg', 'neu', 'pos'])
    full_sentiment_df = df.join(sentiment_df)

    # Save full DF (keyed by the already-advanced end_epoch, as before, so
    # existing cache files keep matching).
    agg_path = "./Data/reddit_full_%s_%s.pkl" % (search_term, end_epoch)
    try:
        print('Fetching Final Reddit File...')
        final_df = pd.read_pickle(agg_path)

    except FileNotFoundError:
        print('File not found, recreating...')
        full_df = full_sentiment_df.copy()
        by_day = full_df.groupby('created_utc')

        # numeric_only=True: averaging the text columns raises on pandas >= 2.0.
        final_df = by_day.mean(numeric_only=True)
        final_df['count'] = by_day['title'].count()
        final_df['unique_authors'] = by_day['author'].nunique()
        final_df['posts_per_author'] = final_df['count'] / final_df['unique_authors']
        final_df.to_pickle(agg_path)

    if end_epoch >= window_end_epoch:
        # The window boundary did not move back, so the next iteration would
        # request the same window again, forever.
        print("Pagination made no progress; stopping to avoid an infinite loop.")
        break

final_df.head()

API CALL: 1 at 10/31 
ATTEMPTING TO RELOAD SAVED QUERY...
Fetching Final Reddit File...
API CALL: 2 at 10/16 
ATTEMPTING TO RELOAD SAVED QUERY...
Fetching Final Reddit File...
API CALL: 3 at 10/2 
ATTEMPTING TO RELOAD SAVED QUERY...
Fetching Final Reddit File...
API CALL: 4 at 9/17 
ATTEMPTING TO RELOAD SAVED QUERY...
Fetching Final Reddit File...
API CALL: 5 at 9/1 
ATTEMPTING TO RELOAD SAVED QUERY...
Fetching Final Reddit File...
API CALL: 6 at 8/17 
ATTEMPTING TO RELOAD SAVED QUERY...
Fetching Final Reddit File...
API CALL: 7 at 8/3 
ATTEMPTING TO RELOAD SAVED QUERY...
Fetching Final Reddit File...
API CALL: 8 at 7/21 
ATTEMPTING TO RELOAD SAVED QUERY...
Fetching Final Reddit File...
API CALL: 9 at 7/6 
ATTEMPTING TO RELOAD SAVED QUERY...
Fetching Final Reddit File...


Unnamed: 0_level_0,compound,neg,neu,pos,count,unique_authors,posts_per_author
created_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-07-01,0.099925,0.006188,0.920094,0.073688,32,28,1.142857
2018-07-02,0.211303,0.014134,0.85603,0.129836,67,54,1.240741
2018-07-03,0.210961,0.020611,0.840403,0.139,72,54,1.333333
2018-07-04,0.241032,0.014378,0.850134,0.135488,82,53,1.54717
2018-07-05,0.170155,0.032701,0.839194,0.128104,67,54,1.240741
