Let's see what we can discover about the overall sentiment of our Facebook posts.

In [None]:
import pandas as pd
import json
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import string

# quick and dirty sentiment analysis; more complex to come
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#from nltk.classify import NaiveBayesClassifier
#from nltk.corpus import subjectivity
#from nltk.sentiment import SentimentAnalyzer
#from nltk.sentiment.util import *
#from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer
#from nltk.stem.wordnet import wordnet, WordNetLemmatizer
#import re

In [None]:
def timestamp_sec_to_datetime(ts):
    """Convert a Unix timestamp to a naive ``datetime`` expressed in UTC.

    Only the first 10 characters of ``str(ts)`` are used, so a 13-digit
    millisecond timestamp is truncated to whole seconds. This relies on the
    seconds value having 10 digits (i.e. dates from 2001-09-09 onwards).

    Parameters
    ----------
    ts : int or str
        Epoch timestamp in seconds (or milliseconds, see above). Must
        convert to a strictly positive integer.

    Returns
    -------
    datetime.datetime
        Naive datetime (``tzinfo is None``) holding the UTC wall-clock time.

    Raises
    ------
    AssertionError
        If ``int(ts)`` is not greater than zero.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    # The check is strictly positive, so the message says so.
    assert int(ts) > 0, "must be a positive int"
    sts = str(ts)
    # `datetime.utcfromtimestamp` is deprecated since Python 3.12. Build an
    # aware UTC datetime and drop tzinfo to keep the naive-UTC return value.
    return datetime.fromtimestamp(int(sts[:10]), tz=timezone.utc).replace(tzinfo=None)

In [None]:
# If you need to download the vader_lexicon

# nltk.download('vader_lexicon')

In [None]:
# Root folder of the Facebook data export; replace YOURNAMEHERE with the
# folder name of your own export.
# Keep the trailing slash: the loading cell builds the file path by plain
# string concatenation onto this value.
facebook_dir = "./facebook-YOURNAMEHERE/"

In [None]:
# Load the exported posts into `posts`. The rest of the notebook treats it
# as a list of dicts, one per post.
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default (JSON is UTF-8 by specification).
with open(facebook_dir + "posts/your_posts_1.json", "r", encoding="utf-8") as f:
    posts = json.load(f)

First, we'll attempt a simple sentiment analysis.

Let's get all of one's Facebook posts from the list of JSON objects into a DataFrame.

In [None]:
# Rudimentary EDA: tally how many posts carry each top-level key.
posts_meta = defaultdict(int)
for post in posts:
    for key in post:
        posts_meta[key] += 1

How many memories did you share? What is your memory-share-post-ratio?

In [None]:
total_posts = len(posts)
# ASSUMPTION: This is always the form FB used.
memory_sentence = "shared a memory"

# Count the posts whose title contains the memory-share phrase.
# Posts without a 'title' key are skipped.
memories_shared = sum(
    1 for post in posts
    if 'title' in post and memory_sentence in post['title']
)

# Scale to a percentage *before* rounding: multiplying an already-rounded
# fraction by 100 re-introduces float noise (e.g. 7.010000000000001).
# Guard against an empty export so we never divide by zero.
if total_posts:
    memory_post_ratio = round(100 * memories_shared / total_posts, 2)
else:
    memory_post_ratio = 0.0

print(f"Memories shared: {memories_shared} / {total_posts} = {memory_post_ratio}% of all posts")

In [None]:
# Quick & dirty collection of every piece of post text.
# Each post may have a 'data' list; an entry of that list can hold the text
# under 'post' and/or 'description'. Every value found is kept, paired with
# the timestamp of the post it belongs to.
# TODO: also look at the text inside a post's 'attachments'.

text_keys = ('post', 'description')
text_posts = []
text_timestamps = []
for entry in posts:
    if 'data' not in entry:
        continue
    for item in entry['data']:
        for key in text_keys:
            if key not in item:
                continue
            text_posts.append(item[key])
            text_timestamps.append(entry['timestamp'])

We'll do a rudimentary sentiment analysis on these text posts with VADER, a lexicon- and rule-based sentiment tool that is specifically attuned to social media text: https://github.com/cjhutto/vaderSentiment.

We don't remove stopwords, lowercase, or otherwise clean the posts (besides replacing each \n with a space), as each idiosyncrasy may carry sentiment.

In [None]:
# Create a single VADER analyzer (NLTK's SentimentIntensityAnalyzer) that is
# reused for every post below.
# It needs the 'vader_lexicon' resource; see the nltk.download cell above.
sid = SentimentIntensityAnalyzer()

In [None]:
# One datetime per collected text, in the same order as text_timestamps.
text_datetimes = list(map(timestamp_sec_to_datetime, text_timestamps))

In [None]:
# The only cleaning step: turn every newline into a single space.
text_posts_cleaned = [raw_text.replace('\n', ' ') for raw_text in text_posts]

In [None]:
# Score every cleaned post and keep the score dict together with the post's
# datetime and text, one record per post.
ss_dict_list = []
for idx, post_text in enumerate(text_posts_cleaned):
    scores = sid.polarity_scores(post_text)
    scores['datetime'] = text_datetimes[idx]
    scores['post'] = post_text
    ss_dict_list.append(scores)

Collect average sentiment per month in each category and plot them all.

In [None]:
# Pivot the list of per-post records into one list per field, which is the
# column-oriented shape the DataFrame is built from.
# The field names are taken from the first record.
ss_dict_to_df = {field: [] for field in ss_dict_list[0]}

for record in ss_dict_list:
    for field, value in record.items():
        ss_dict_to_df[field].append(value)

In [None]:
# One row per post; one column per field collected in ss_dict_to_df.
sentiment_df = pd.DataFrame(data=ss_dict_to_df)

In [None]:
# Add calendar year and month columns so scores can be averaged per month.
# TODO: group with a MultiIndex instead of two helper columns.
sentiment_df['year'] = [stamp.year for stamp in sentiment_df['datetime']]
sentiment_df['month'] = [stamp.month for stamp in sentiment_df['datetime']]

In [None]:
# The score columns to aggregate, and every year that has at least one post
# (ascending).
sentiment_scores = ['neg', 'neu', 'pos', 'compound']
fb_years = sorted(sentiment_df['year'].unique())

In [None]:
# Monthly mean of each score, in chronological order. Months without any
# post produce a NaN mean and are left out of every list.
fb_post_sentiment_evolution = {score: [] for score in sentiment_scores}
times = []
for year in fb_years:
    for month in range(1, 13):
        in_month = (sentiment_df['year'] == year) & (sentiment_df['month'] == month)
        monthly_posts = sentiment_df[in_month]
        for score in sentiment_scores:
            monthly_mean = monthly_posts[score].mean()
            if np.isnan(monthly_mean):  # only list actual numbers
                continue
            fb_post_sentiment_evolution[score].append(monthly_mean)
            if score == 'pos':
                # Record the x value once per month (piggy-backing on 'pos'):
                # an approximate decimal year for the end of that month.
                times.append(round(year + month / 12, 3))

In [None]:
# Plot the monthly average positive and negative scores over time.
# Each line is labelled and the figure gets a title, axis labels and a
# legend so it can be read on its own, like the box plots further down.
plt.plot(times, fb_post_sentiment_evolution['pos'], label='pos')
plt.plot(times, fb_post_sentiment_evolution['neg'], label='neg')
plt.xlabel("Year")
plt.ylabel("Mean score per month")
plt.title("Facebook post sentiment by month")
plt.legend()
plt.show()

In [None]:
# Data for the per-year box plots: for every score, a list holding one list
# of post scores per year, in the same order as fb_years.
fb_post_sentiment_boxplots = {
    score: [
        list(sentiment_df[sentiment_df['year'] == year][score])
        for year in fb_years
    ]
    for score in sentiment_scores
}

In [None]:
# Distribution of the positive score, one box per year.
fig, ax = plt.subplots()
ax.boxplot(fb_post_sentiment_boxplots['pos'], labels=fb_years)
ax.set_title("Facebook post positive sentiment by year")
plt.show()

In [None]:
# Distribution of the negative score, one box per year.
fig, ax = plt.subplots()
ax.boxplot(fb_post_sentiment_boxplots['neg'], labels=fb_years)
ax.set_title("Facebook post negative sentiment by year")
plt.show()

In [None]:
# Distribution of the neutral score, one box per year.
fig, ax = plt.subplots()
ax.boxplot(fb_post_sentiment_boxplots['neu'], labels=fb_years)
ax.set_title("Facebook post neutral sentiment by year")
plt.show()

In [None]:
# Distribution of the compound score, one box per year.
fig, ax = plt.subplots()
ax.boxplot(fb_post_sentiment_boxplots['compound'], labels=fb_years)
ax.set_title("Facebook post compound sentiment by year")
plt.show()

Ideas for future analysis: 
    
    * Year breakdown of memories shared
    * Sentiment analysis on the text of the post, and any text on an attachment to the post.
    * Sentiment analysis on any image attached to the post.
    * Sentiment analysis on key frames of a video attached to the post.
    * Build overall counts for the types of titles on the posts:
        * regular post
        * post share
        * image share
        * video share
        * memory share
        * Twitter share
        * Instagram share
        * other share?
        * other kinds of posts?
    * Unsupervised topic modeling
    * Correlate / otherwise associate likes/reactions to posts to sentiment/topic?    