See the README for an explanation of how this code runs and functions.

Contact michaeldezube at gmail dot com with questions.

# Imports

In [None]:
import json
import re
import string

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn  # To improve the chart styling.
import wordtree

from IPython.display import display
from IPython.display import HTML
from IPython.display import Javascript
from wordcloud import STOPWORDS
import ipywidgets as widgets
from wordcloud import WordCloud

import table_connector

# Load the data from disk and set up the dataframes

In [None]:
%matplotlib inline
matplotlib.style.use('ggplot')
pd.set_option('display.max_colwidth', 1000)
table_connector.initialize()

fully_merged_messages_df, address_book_df = table_connector.get_cleaned_fully_merged_messages()
full_names = set(address_book_df.full_name)  # Handy set to check for misspellings later on.
fully_merged_messages_df.full_name.replace('nan nan nan', 'Unknown', inplace=True)

In [None]:
# Display all messages with a fresh 0..N-1 index. The result is not assigned, so
# fully_merged_messages_df itself is left unchanged.
fully_merged_messages_df.reset_index(drop=True)

In [None]:
# Display the address book dataframe loaded above.
address_book_df

### Use `fully_merged_messages_df` and `address_book_df` for analysis: the former contains all messages (with columns describing the sender), and the latter contains all contacts.

# Visualize a word tree of texts exchanged with a specific contact

In [None]:
CONTACT_NAME = 'Mom'  # Freely change this value.
ROOT_WORD = 'feel'  # Freely change this value.

# print(...) with a single parenthesized argument behaves identically on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.

# Every text exchanged with the contact, in either direction.
filtered_texts = fully_merged_messages_df[fully_merged_messages_df.full_name == CONTACT_NAME]
print('I exchanged {0:,} texts with {1}'.format(filtered_texts.shape[0], CONTACT_NAME))

# Only the texts I sent (is_from_me == 1).
filtered_texts_i_sent = filtered_texts[filtered_texts.is_from_me == 1]
print('I sent {0:,} texts to {1}'.format(filtered_texts_i_sent.shape[0], CONTACT_NAME))

# Only the texts I received (is_from_me == 0).
filtered_texts_i_received = filtered_texts[filtered_texts.is_from_me == 0]
print('I received {0:,} texts from {1}'.format(filtered_texts_i_received.shape[0], CONTACT_NAME))

def get_google_json_for_dataframe(df):
    """Return the texts in `df` as JSON in the format needed for word trees.

    The result is a JSON array of single-element rows: a ['Phrases'] header row
    followed by one [text] row per entry of `df.text`, in order.

    Args:
        df: Dataframe with a `text` column.

    Returns:
        A JSON string such as '[["Phrases"], ["first text"], ["second text"]]'.
    """
    # The header is a single-column row just like the data rows; wrapping it in an
    # extra list would make the column label itself a list.
    rows = [['Phrases']]
    # Iterating a Series yields its values directly, so no (index, value) pairs are needed.
    rows.extend([text] for text in df.text)
    return json.dumps(rows)

# Render the word tree for ROOT_WORD. Loading Google's JS library requires an internet connection.
#
# tree_type may be 'double', 'suffix', or 'prefix'; the options are described at
# https://developers.google.com/chart/interactive/docs/gallery/wordtree#configuration-options
#
# To look at one direction only, swap "filtered_texts" for "filtered_texts_i_sent" or
# "filtered_texts_i_received".
HTML(
    wordtree.get_word_tree_html(
        get_google_json_for_dataframe(filtered_texts),
        ROOT_WORD.lower(),
        tree_type='double',
        lowercase=True
    )
)

# Table and graph of who you text the most

In [None]:
# Note "Unknown" means the number was not found in your address book.

def get_message_counts(dataframe):
    """Count the texts in `dataframe` that were sent, received, and exchanged in total.

    Rows with is_from_me == 1 count as sent and rows with is_from_me == 0 as received;
    the exchanged total is simply the number of rows.

    Returns:
        A pandas Series labelled 'Texts sent', 'Texts received' and 'Texts exchanged'.
    """
    sent_rows = dataframe[dataframe.is_from_me == 1]
    received_rows = dataframe[dataframe.is_from_me == 0]
    counts = {'Texts sent': len(sent_rows),
              'Texts received': len(received_rows),
              'Texts exchanged': len(dataframe)}
    return pd.Series(counts)
# One row of sent/received/exchanged counts per contact, busiest contacts first.
messages_grouped = (fully_merged_messages_df
                    .groupby('full_name')
                    .apply(get_message_counts)
                    .sort_values(by='Texts exchanged', ascending=False))

# Interactive table: the slider value is passed to messages_grouped.head() as n.
widgets.interact(
    messages_grouped.head,
    n=widgets.IntSlider(value=5, min=5, max=50, step=1,
                        description='Number of people to show:',
                        continuous_update=False))

In [None]:
# Thin wrapper around the plotting call so that interact() can drive it.
def _plot_most_common_text(top_n=10):
    """Draw a bar chart of the first `top_n` rows of `messages_grouped`."""
    busiest_contacts = messages_grouped.head(top_n)
    busiest_contacts.plot(kind='bar', figsize=(20, 10))
   
# Redraw the bar chart whenever the slider is released; the slider value is passed as top_n.
widgets.interact(
    _plot_most_common_text,
    top_n=widgets.IntSlider(
        value=5, min=5, max=100, step=1,
        description='Number of people to show:',
        continuous_update=False))

# Steamgraph

### Dump the necessary data to JS

In [None]:
# Restrict to the top N people you text the most so the steamgraph is legible.
TOP_N = 10  # Freely change this value.

sliced_df = fully_merged_messages_df[fully_merged_messages_df.full_name.isin(messages_grouped.head(TOP_N).index)]
# Count texts per (year/month, contact). The 'YYYY/MM' key is computed by mapping over the date column
# only, which avoids materializing every row as a Series the way a row-wise apply(axis=1) does.
grouped_by_month = sliced_df.groupby([
    sliced_df['date'].map(lambda date: date.strftime('%Y/%m')),
    'full_name']
)['text'].count().to_frame()

grouped_by_month = grouped_by_month.sort_index()
# We create a dense dataframe for every year/month combination so even if a person didn't text in a specific
# year/month, we have a 0 so the steamgraph can properly graph the value.
grouped_by_month_dense = grouped_by_month.unstack().fillna(0).stack()

# Reshape to the three columns the JS code reads: date (index), key (contact name), value (text count).
formatted_for_steamgraph = grouped_by_month_dense.reset_index(level=1)
formatted_for_steamgraph.index.name = 'date'
formatted_for_steamgraph.columns = ['key', 'value']

# Dump the dataframe to a global JS variable so we can access it in our JS code.
# json.dumps() emits a correctly escaped string literal, so quotes, backslashes and line breaks in
# contact names cannot terminate or corrupt the JS string (hand-escaping only '\n' inside single
# quotes broke on names such as O'Brien).
# TODO(mdezube): Dump out as JSON instead.
Javascript('window.csvAsString={}'.format(
    json.dumps(formatted_for_steamgraph.to_csv(index_label='date'))))

### Draw the graph!

In [None]:
%%javascript
// Draw the streamgraph using d3.
// NOTE(review): `element` is not defined in this cell; presumably it is the cell's output element
// supplied by the notebook -- confirm.

// Container the chart is drawn into, plus the styling for the chart's axis lines.
element.append('<div class="chart" style="height:600px; width:100%"></div>')
element.append('<style>.axis path, .axis line' + 
               '{fill: none; stroke: #000;stroke-width: 2px; shape-rendering: crispEdges;}' + 
               '</style>')

// Load the JS dependencies via relative paths; createSteamgraph() below is presumably defined in
// steamgraph.js -- confirm.
element.append("<script src='d3.min.js'></script>")
element.append("<script src='colorbrewer.min.js'></script>")
element.append("<script src='steamgraph.js'></script>")

// Choose your favorite from https://bl.ocks.org/mbostock/5577023
var colorBrewerPalette = "Spectral";

// Set a timeout to let the JS scripts actually load into memory, this is a bit of a hack but works reliably.
// csvAsString is the global (window.csvAsString) set by the preceding Python cell.
setTimeout(function(){createSteamgraph(csvAsString, colorBrewerPalette)}, 200);

# Wordcloud

### Define the helper method

In [None]:
def generate_cloud(texts, max_words=30):
    """Draw a word cloud of the most frequent words in `texts`.

    The texts are joined, lower-cased and rendered with the default stopwords plus a
    few extra filler words removed. The number of source texts is printed above the
    chart.

    Args:
        texts: Sized iterable of strings, e.g. a pandas Series of message texts.
        max_words: Maximum number of words to include in the cloud.

    Returns:
        None. The cloud is drawn with matplotlib as a side effect.
    """
    # WordCloud cannot lay out a cloud from zero words (it raises ValueError), so report
    # the empty case instead of crashing the interactive widget. len() is used rather
    # than truthiness because a pandas Series has no unambiguous truth value.
    if len(texts) == 0:
        print('No texts to build a word cloud from')
        return

    # Add more words here if you want to ignore them:
    my_stopwords = STOPWORDS.copy()
    my_stopwords.update(['go', 'ya', 'come', 'back', 'good', 'sound'])
    words = ' '.join(texts).lower()
    wordcloud = WordCloud(font_path='CabinSketch-Bold.ttf',
                          stopwords=my_stopwords,
                          background_color='black',
                          width=800,
                          height=600,
                          relative_scaling=1,
                          max_words=max_words
                         ).generate_from_text(words)
    # print(...) with a single argument works on both Python 2 and Python 3.
    print('Based on {0:,} texts'.format(len(texts)))

    fig, ax = plt.subplots(figsize=(15, 10))
    ax.imshow(wordcloud)
    ax.axis('off')
    plt.show()

### Texts you've sent

In [None]:
# Word cloud of the words I use most, built from the last 30,000 texts I sent (the most recent
# ones, assuming the dataframe is in chronological order). The slider sets how many words to show.

texts_from_me = fully_merged_messages_df.loc[fully_merged_messages_df.is_from_me == 1, 'text'][-30000:]
widgets.interact(
    generate_cloud,
    texts=widgets.fixed(texts_from_me),
    max_words=widgets.IntSlider(value=10, min=5, max=50, step=1,
                                description='Max words to include:',
                                continuous_update=False))

### Texts to/from a specific contact

In [None]:
# Helper method so we can wrap it with interact().
def _word_cloud_specific_contact(max_words, from_me, contact):
    """Draw a word cloud of the texts exchanged with `contact` in one direction.

    Args:
        max_words: Maximum number of words to include in the cloud.
        from_me: True to use texts I sent, False to use texts I received.
        contact: Full name of the contact; must be present in `full_names`.

    Returns:
        None. Prints '<contact> not found' and draws nothing when the name is unknown.
    """
    if contact not in full_names:
        # print(...) with a single argument works on both Python 2 and Python 3.
        print(contact + ' not found')
        return
    # Named distinctly from the notebook-level `sliced_df` to avoid confusion with it.
    contact_texts = fully_merged_messages_df[(fully_merged_messages_df.full_name == contact) &
                                             (fully_merged_messages_df.is_from_me == from_me)].text
    generate_cloud(contact_texts, max_words)

# Interactive controls: a word-count slider, a direction toggle, and a free-text contact name.
widgets.interact(
    _word_cloud_specific_contact,
    max_words=widgets.IntSlider(value=10, min=5, max=50, step=1,
                                description='Max words to show:',
                                continuous_update=False),
    from_me=widgets.RadioButtons(description=' ',
                                 options={'Show messages FROM me': True,
                                          'Show messages TO me': False}),
    contact=widgets.Text(description='Contact name:', value='Mom'))