In [48]:
# notebook dependencies 
import os # for caching purposeses
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# regular expression import
import re

# JSON import
import json

# importing BeautifulSoup for parsing HTML/XTML
from bs4 import BeautifulSoup

# request module for connecting to APIs
from requests import get

# uni-code library
import unicodedata

# natural language toolkit library/modules
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from pprint import pprint

In [49]:
# obtain subreddit text data 

def get_reddit_stress():
    # cached filename to look for
    filename = "stress.csv"

    # search for filename in local/OS directory
    if os.path.isfile(filename):

        # if file and filename exists, then return csv as Pandas df
        # future iteration: consider using relative path 
        # this may help to prevent referencing deleted/incorrect files
        df = pd.read_csv(filename, index_col = "social_timestamp")

        # print df shape
        print(f'dataframe shape: {df.shape}')

        # return the dataframe
        return df

    # if file/csv can't be found in OS directory, then access the data and cache it as a csv locally
    else:

        url = "https://raw.githubusercontent.com/amankharwal/Website-data/master/stress.csv"
        # read text data as csv and convert to pandas dataframe
        df = pd.read_csv(url)

        # let's move forward with just the following columns/features
        df = df[[
            "label", 
            "post_id",
            "subreddit",                                 
            "sentence_range",              
            "text",                        
            "id",              
            "confidence",            
            "social_timestamp",           
            "social_karma",                
            "syntax_ari",
            "sentiment",
            "social_upvote_ratio",
            "social_num_comments"
            ]]

        # timestamp appears to be in "epoch seconds format"
        df["social_timestamp"] = pd.to_datetime(df['social_timestamp'], unit = 's')

        # sort and set data as index
        df.set_index('social_timestamp', inplace = True)

        # cache the data for easier/quicker reference
        df.to_csv("stress.csv")

        # print the shape
        print(f'dataframe shape: {df.shape}')

        # return the dataframe
        return df

In [50]:
# testing out the created function 

df = get_reddit_stress()
df.head()

dataframe shape: (2838, 12)


Unnamed: 0_level_0,label,post_id,subreddit,sentence_range,text,id,confidence,social_karma,syntax_ari,sentiment,social_upvote_ratio,social_num_comments
social_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-03-21 06:39:13,1,8601tu,ptsd,"(15, 20)","He said he had not felt that way before, sugge...",33181,0.8,5,1.806818,-0.002742,0.86,1
2018-05-22 17:23:37,0,8lbrx9,assistance,"(0, 5)","Hey there r/assistance, Not sure if this is th...",2606,1.0,4,9.429737,0.292857,0.65,2
2018-09-03 00:46:45,1,9ch1zh,ptsd,"(15, 20)",My mom then hit me with the newspaper and it s...,38816,0.8,2,7.769821,0.011894,0.67,0
2018-01-20 06:25:55,1,7rorpp,relationships,"[5, 10]","until i met my new boyfriend, he is amazing, h...",239,0.6,0,2.667798,0.141671,0.5,5
2018-10-17 20:43:25,1,9p2gbc,survivorsofabuse,"[0, 5]",October is Domestic Violence Awareness Month a...,1421,0.8,24,7.554238,-0.204167,1.0,1


In [51]:
# df info

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2838 entries, 2018-03-21 06:39:13 to 2017-03-07 17:58:36
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   label                2838 non-null   int64  
 1   post_id              2838 non-null   object 
 2   subreddit            2838 non-null   object 
 3   sentence_range       2838 non-null   object 
 4   text                 2838 non-null   object 
 5   id                   2838 non-null   int64  
 6   confidence           2838 non-null   float64
 7   social_karma         2838 non-null   int64  
 8   syntax_ari           2838 non-null   float64
 9   sentiment            2838 non-null   float64
 10  social_upvote_ratio  2838 non-null   float64
 11  social_num_comments  2838 non-null   int64  
dtypes: float64(4), int64(4), object(4)
memory usage: 288.2+ KB


In [52]:
# df shape

df.shape

(2838, 12)

In [53]:
# column names to list

df.columns.tolist()

['label',
 'post_id',
 'subreddit',
 'sentence_range',
 'text',
 'id',
 'confidence',
 'social_karma',
 'syntax_ari',
 'sentiment',
 'social_upvote_ratio',
 'social_num_comments']

In [54]:
# what is the percentage of missing values by ea. column/feature?

with pd.option_context("display.max_rows", False):
    print(df.isnull().mean()) # no features contain missing values

label                  0.0
post_id                0.0
subreddit              0.0
sentence_range         0.0
text                   0.0
id                     0.0
confidence             0.0
social_karma           0.0
syntax_ari             0.0
sentiment              0.0
social_upvote_ratio    0.0
social_num_comments    0.0
dtype: float64


In [55]:
# # let's move forward with the following columns/features

# df = df[[
#     "label", 
#     "post_id",
#     "subreddit",                                 
#     "sentence_range",              
#     "text",                        
#     "id",              
#     "confidence",            
#     "social_timestamp",           
#     "social_karma",                
#     "syntax_ari",
#     "sentiment",
#     "social_upvote_ratio",
#     "social_num_comments"
#     ]]

# df.shape 

In [56]:
# cache text data as csv for easier accessing

# df.to_csv("stress.csv")

In [57]:
# check the head

# df = pd.read_csv("stress.csv", index_col = True)

In [58]:
# let's look at the "social_timestamp" feature 
# timestamp appears to be in "epoch seconds format"

# df["social_timestamp"] = pd.to_datetime(df['social_timestamp'], unit = 's')
# df["social_timestamp"].head() # checks out!

In [59]:
# check the df info

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2838 entries, 2018-03-21 06:39:13 to 2017-03-07 17:58:36
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   label                2838 non-null   int64  
 1   post_id              2838 non-null   object 
 2   subreddit            2838 non-null   object 
 3   sentence_range       2838 non-null   object 
 4   text                 2838 non-null   object 
 5   id                   2838 non-null   int64  
 6   confidence           2838 non-null   float64
 7   social_karma         2838 non-null   int64  
 8   syntax_ari           2838 non-null   float64
 9   sentiment            2838 non-null   float64
 10  social_upvote_ratio  2838 non-null   float64
 11  social_num_comments  2838 non-null   int64  
dtypes: float64(4), int64(4), object(4)
memory usage: 288.2+ KB


In [60]:
# set timestamp as index

# df = df.set_index(pd.DatetimeIndex(df['social_timestamp']))
# df.head()

In [61]:
# check the head

df.head()

Unnamed: 0_level_0,label,post_id,subreddit,sentence_range,text,id,confidence,social_karma,syntax_ari,sentiment,social_upvote_ratio,social_num_comments
social_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-03-21 06:39:13,1,8601tu,ptsd,"(15, 20)","He said he had not felt that way before, sugge...",33181,0.8,5,1.806818,-0.002742,0.86,1
2018-05-22 17:23:37,0,8lbrx9,assistance,"(0, 5)","Hey there r/assistance, Not sure if this is th...",2606,1.0,4,9.429737,0.292857,0.65,2
2018-09-03 00:46:45,1,9ch1zh,ptsd,"(15, 20)",My mom then hit me with the newspaper and it s...,38816,0.8,2,7.769821,0.011894,0.67,0
2018-01-20 06:25:55,1,7rorpp,relationships,"[5, 10]","until i met my new boyfriend, he is amazing, h...",239,0.6,0,2.667798,0.141671,0.5,5
2018-10-17 20:43:25,1,9p2gbc,survivorsofabuse,"[0, 5]",October is Domestic Violence Awareness Month a...,1421,0.8,24,7.554238,-0.204167,1.0,1


In [None]:
# data familiarization/exploration 
# let's look at ea. individual feature/column and their respective values

