In [90]:
# Widen notebook

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Import libraries necessary
import numpy as np
import sys
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd
from time import time
import datetime
from IPython.display import display
import re
%matplotlib inline
import psycopg2
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.options.display.float_format = '{:.3f}'.format

In [2]:
# Load the messages dataset

messages = pd.read_csv(r'/Users/lisafan/Desktop/BBT/bbt_messages_2017_2020.csv', low_memory = False)

Note that this table only includes outbound messages, and does not include inbound messages.

# Initial Data Exploration and Cleaning

In [7]:
messages['outbound_message_created_at'] = pd.to_datetime(messages['outbound_message_created_at']) # Change message timestamp to datetime format
messages['subscriber_zip_code'] = messages['subscriber_zip_code'].fillna(-1) # Fill NAs in order to convert to integers
messages['subscriber_zip_code'] = messages['subscriber_zip_code'].astype(int).astype(str).str.zfill(5) # Pad zip codes with 0s
messages['subscriber_zip_code'] = messages['subscriber_zip_code'].replace('000-1', np.nan) # Fill NAs again
messages['year'] = messages['outbound_message_created_at'].dt.year

In [4]:
messages.info(null_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19130777 entries, 0 to 19130776
Data columns (total 25 columns):
message_id                        19130771 non-null object
outbound_message_created_at       19130777 non-null datetime64[ns]
outbound_message_body             19130777 non-null object
outbound_message_partner_id       19040574 non-null object
message_type                      19130777 non-null object
subscriber_id                     19130777 non-null object
created_at                        19130777 non-null object
signedup_at                       19091472 non-null object
deactivated_at                    1795821 non-null object
carrier_name                      19128550 non-null object
subscriber_zip_code               18436810 non-null object
subscriber_city                   18436810 non-null object
timezone_default_offset           18436810 non-null float64
timezone_name                     18436810 non-null object
subscriber_state                  18436810 non-null

In [8]:
messages.groupby('year')['message_type'].value_counts(normalize = True)

year  message_type
2017  normal         0.720
      signup         0.159
      scheduled      0.113
      national       0.009
2018  normal         0.834
      scheduled      0.081
      signup         0.057
      national       0.019
      survey         0.009
2019  normal         0.558
      scheduled      0.375
      national       0.036
      signup         0.024
      survey         0.007
2020  normal         0.537
      scheduled      0.361
      national       0.061
      survey         0.025
      signup         0.016
Name: message_type, dtype: float64

Of BBT's outgoing messages in 2020:
- 54% are Normal (content messages sent by the system based on DOB) - I think these are also referred to as Program messages
- 36% are Scheduled (messages sent by partners with info about local events, based on age and zip code)
- 6% are National (scheduled messages sent to subscribers by BBT, including those who did not provide their zip code)
- 3% are Signup (messages sent to subscribers during signup)
- 2% are Survey (links to surveys sent to subscribers)

# Timing of Messages

## Adjust message timestamp data to reflect local time zones

In [10]:
min_date = messages['outbound_message_created_at'].dt.date.min()
max_date = messages['outbound_message_created_at'].dt.date.max()

print('The outbound messages dataset includes text messages from {} to {}.'.format(min_date, max_date))

The outbound messages dataset includes text messages from 2017-04-10 to 2020-02-29.


In [11]:
messages['timezone_default_offset'].value_counts()

-7.000     7852330
-5.000     5142388
-6.000     3990853
-8.000     1429026
-9.000       18811
-10.000       3304
-4.000          98
Name: timezone_default_offset, dtype: int64

In [19]:
# Cities in Arizona
pd.DataFrame(messages[(messages['subscriber_state']) == 'Arizona']['subscriber_city'].value_counts().sort_index())

Unnamed: 0,subscriber_city
Ajo,182
Amado,184
Apache Junction,6837
Arizona City,1597
Ash Fork,220
Avondale,34788
Bagdad,1018
Bapchule,501
Black Canyon City,587
Blue Gap,385


In [25]:
navajo = ['Cameron', 'Chinle', 'Fort Defiance', 'Kayenta', 'Leupp', 'Many Farms', 'Pinon', 'Tonalea', 'Tuba City'] # Cities in Navajo Nation

In [33]:
# Label time zones

conditions = [messages['timezone_default_offset'] == -4,
              messages['timezone_default_offset'] == -5, 
              messages['timezone_default_offset'] == -6,  
              (messages['timezone_default_offset'] == -7) & (messages['subscriber_state'] != 'Arizona'),
              (messages['timezone_default_offset'] == -7) & (messages['subscriber_state'] == 'Arizona') & (messages['subscriber_city'].isin(navajo)),
              (messages['timezone_default_offset'] == -7) & (messages['subscriber_state'] == 'Arizona') & (messages['subscriber_city'].isin(navajo) == False),
              messages['timezone_default_offset'] == -8,
              messages['timezone_default_offset'] == -9,
              messages['timezone_default_offset'] == -10]

choices = ['Atlantic', 'Eastern', 'Central', 'Mountain', 'Mountain', 'Arizona', 'Pacific', 'Alaska', 'Hawaii']

messages['timezone'] = np.select(conditions, choices, default ='')
messages

Unnamed: 0,message_id,outbound_message_created_at,outbound_message_body,outbound_message_partner_id,message_type,subscriber_id,created_at,signedup_at,deactivated_at,carrier_name,subscriber_zip_code,subscriber_city,timezone_default_offset,timezone_name,subscriber_state,subscriber_source,subscriber_status,subscriber_language,subscriber_deactivation_method,partner_name,partner_created_date,partner_is_active,partner_state,children_count,scheduled_message_tag,year,timezone
0,5ec45471-2146-456a-acb4-da0d41e6c5bf,2017-05-01 14:40:45.723794,Is this zipcode correct? If yes reply with 'Y...,00000000-0000-0000-0000-000000000002,signup,42cace4c-0d50-407f-8276-609079e560e9,2017-05-01 14:04:57.337456,2017-05-01 14:04:57.337456,,Verizon Wireless,07860,Newton,-5.000,America/New_York,New Jersey,Text SignUp,activated,English,,"WNET - New York, NY",2018-03-25 21:32:21.001262,True,New York,2.000,,2017,Eastern
1,00f97ed1-d95f-47a7-8738-93d2ec8be5f1,2017-05-01 18:22:11.956767,Is this zipcode correct? If yes reply with 'Y...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,47764722-ac32-4362-9a1a-ce4503f4cf84,2017-05-01 18:16:25.565976,2017-05-01 18:16:25.565976,,Verizon Wireless,78245,San Antonio,-6.000,America/Chicago,Texas,Text SignUp,activated,English,,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,1.000,,2017,Central
2,ffb5de23-d828-4e3a-becf-8b10b80cefba,2017-05-01 18:34:33.615578,Unrecognized date of birth format. Please res...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,8f619c33-e411-460b-8bf8-ea3b4e668ea2,2017-05-01 18:31:47.699809,2017-05-01 18:31:47.699809,2018-04-30 14:46:49.920313,"T-Mobile USA, Inc.",,,,,,Text SignUp,deactivated,English,STOP,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,,,2017,
3,00f97ed1-d95f-47a7-8738-93d2ec8be5f1,2017-05-02 21:13:27.420256,Is this zipcode correct? If yes reply with 'Y...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,1003849f-4d8e-430b-af34-e5e1c8d8b4f6,2017-05-02 21:10:55.406804,2017-05-02 21:10:55.406804,,AT&T Wireless,78253,San Antonio,-6.000,America/Chicago,Texas,Text SignUp,activated,English,,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,1.000,,2017,Central
4,68c00142-67b6-4c14-93a3-948b425ccc25,2017-05-03 11:38:58.633927,Unrecognized date of birth format. Please res...,00000000-0000-0000-0000-000000000002,signup,d184f7d8-764c-4792-86ba-de17563b3a0b,2017-05-03 10:30:20.168807,2017-05-03 10:30:20.168807,2018-04-30 14:46:53.934041,Verizon Wireless,,,,,,Text SignUp,deactivated,English,STOP,"WNET - New York, NY",2018-03-25 21:32:21.001262,True,New York,,,2017,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19130772,144f86ab-0398-42bb-81cb-e0420a4d6ba0,2020-02-29 02:21:44.804800,BBT tips and resources are tailored to the age...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain
19130773,a0e70d54-86a7-4c55-ac39-69fed5584dc0,2020-02-29 02:22:06.855964,"BBT provides research-based, quality info crea...",c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain
19130774,144f86ab-0398-42bb-81cb-e0420a4d6ba0,2020-02-29 02:22:37.326561,BBT tips and resources are tailored to the age...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain
19130775,9ef2e268-1264-4836-a4c9-20a0997d0eb4,2020-02-29 02:23:30.233406,Thank you for signing up for Bright by Text. (...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain


In [36]:
dst_dict = {
             'Eastern': [pd.Timestamp('2017-11-05T06'), pd.Timestamp('2018-03-11T07'), pd.Timestamp('2018-11-04T06'), 
                         pd.Timestamp('2019-03-10T07'), pd.Timestamp('2019-11-03T06'), pd.Timestamp('2020-03-08T07')],
             'Central': [pd.Timestamp('2017-11-05T07'), pd.Timestamp('2018-03-11T08'), pd.Timestamp('2018-11-04T07'), 
                         pd.Timestamp('2019-03-10T08'), pd.Timestamp('2019-11-03T07'), pd.Timestamp('2020-03-08T08')],
             'Mountain': [pd.Timestamp('2017-11-05T08'), pd.Timestamp('2018-03-11T09'), pd.Timestamp('2018-11-04T08'), 
                         pd.Timestamp('2019-03-10T09'), pd.Timestamp('2019-11-03T08'), pd.Timestamp('2020-03-08T09')],
             'Pacific': [pd.Timestamp('2017-11-05T09'), pd.Timestamp('2018-03-11T10'), pd.Timestamp('2018-11-04T09'), 
                         pd.Timestamp('2019-03-10T10'), pd.Timestamp('2019-11-03T09'), pd.Timestamp('2020-03-08T10')],
             'Alaska': [pd.Timestamp('2017-11-05T10'), pd.Timestamp('2018-03-11T11'), pd.Timestamp('2018-11-04T10'), 
                         pd.Timestamp('2019-03-10T11'), pd.Timestamp('2019-11-03T10'), pd.Timestamp('2020-03-08T11')],
            }

In [68]:
# Puerto Rico (timezone_default_offset = -4) and Hawaii (timezone_default_offset = -10) do not change their clocks for Daylight Savings. 
# Arizona also does not change their clocks for Daylight Savings, with the exception of the Navajo Nation.

def dst(row):
    if row['timezone'] in ['Atlantic', 'Arizona', 'Hawaii', '']:
        return row['timezone_default_offset']
    elif ((row['outbound_message_created_at'] < dst_dict[row['timezone']][0]) | \
          (dst_dict[row['timezone']][1] <= row['outbound_message_created_at'] < dst_dict[row['timezone']][2]) | \
          (dst_dict[row['timezone']][3] <= row['outbound_message_created_at'] < dst_dict[row['timezone']][4]) | \
          (row['outbound_message_created_at'] >= dst_dict[row['timezone']][5])):
        return row['timezone_default_offset'] + 1
    else: 
        return row['timezone_default_offset']

In [69]:
messages['timezone_default_offset_adj'] = messages.apply(dst, axis=1)

In [67]:
messages

Unnamed: 0,message_id,outbound_message_created_at,outbound_message_body,outbound_message_partner_id,message_type,subscriber_id,created_at,signedup_at,deactivated_at,carrier_name,subscriber_zip_code,subscriber_city,timezone_default_offset,timezone_name,subscriber_state,subscriber_source,subscriber_status,subscriber_language,subscriber_deactivation_method,partner_name,partner_created_date,partner_is_active,partner_state,children_count,scheduled_message_tag,year,timezone,timezone_default_offset_adj
0,5ec45471-2146-456a-acb4-da0d41e6c5bf,2017-05-01 14:40:45.723794,Is this zipcode correct? If yes reply with 'Y...,00000000-0000-0000-0000-000000000002,signup,42cace4c-0d50-407f-8276-609079e560e9,2017-05-01 14:04:57.337456,2017-05-01 14:04:57.337456,,Verizon Wireless,07860,Newton,-5.000,America/New_York,New Jersey,Text SignUp,activated,English,,"WNET - New York, NY",2018-03-25 21:32:21.001262,True,New York,2.000,,2017,Eastern,-5.000
1,00f97ed1-d95f-47a7-8738-93d2ec8be5f1,2017-05-01 18:22:11.956767,Is this zipcode correct? If yes reply with 'Y...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,47764722-ac32-4362-9a1a-ce4503f4cf84,2017-05-01 18:16:25.565976,2017-05-01 18:16:25.565976,,Verizon Wireless,78245,San Antonio,-6.000,America/Chicago,Texas,Text SignUp,activated,English,,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,1.000,,2017,Central,-6.000
2,ffb5de23-d828-4e3a-becf-8b10b80cefba,2017-05-01 18:34:33.615578,Unrecognized date of birth format. Please res...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,8f619c33-e411-460b-8bf8-ea3b4e668ea2,2017-05-01 18:31:47.699809,2017-05-01 18:31:47.699809,2018-04-30 14:46:49.920313,"T-Mobile USA, Inc.",,,,,,Text SignUp,deactivated,English,STOP,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,,,2017,,
3,00f97ed1-d95f-47a7-8738-93d2ec8be5f1,2017-05-02 21:13:27.420256,Is this zipcode correct? If yes reply with 'Y...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,1003849f-4d8e-430b-af34-e5e1c8d8b4f6,2017-05-02 21:10:55.406804,2017-05-02 21:10:55.406804,,AT&T Wireless,78253,San Antonio,-6.000,America/Chicago,Texas,Text SignUp,activated,English,,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,1.000,,2017,Central,-6.000
4,68c00142-67b6-4c14-93a3-948b425ccc25,2017-05-03 11:38:58.633927,Unrecognized date of birth format. Please res...,00000000-0000-0000-0000-000000000002,signup,d184f7d8-764c-4792-86ba-de17563b3a0b,2017-05-03 10:30:20.168807,2017-05-03 10:30:20.168807,2018-04-30 14:46:53.934041,Verizon Wireless,,,,,,Text SignUp,deactivated,English,STOP,"WNET - New York, NY",2018-03-25 21:32:21.001262,True,New York,,,2017,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19130772,144f86ab-0398-42bb-81cb-e0420a4d6ba0,2020-02-29 02:21:44.804800,BBT tips and resources are tailored to the age...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-7.000
19130773,a0e70d54-86a7-4c55-ac39-69fed5584dc0,2020-02-29 02:22:06.855964,"BBT provides research-based, quality info crea...",c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-7.000
19130774,144f86ab-0398-42bb-81cb-e0420a4d6ba0,2020-02-29 02:22:37.326561,BBT tips and resources are tailored to the age...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-7.000
19130775,9ef2e268-1264-4836-a4c9-20a0997d0eb4,2020-02-29 02:23:30.233406,Thank you for signing up for Bright by Text. (...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-7.000


In [70]:
# Download intermediate dataset

messages.to_csv(r'/Users/lisafan/Desktop/BBT/bbt_messages_staging.csv', index=False)  

In [191]:
# Load intermediate dataset

messages = pd.read_csv(r'/Users/lisafan/Desktop/BBT/bbt_messages_staging.csv', low_memory = False)

In [192]:
# Data cleaning

messages['outbound_message_created_at'] = pd.to_datetime(messages['outbound_message_created_at']) # Change message timestamp to datetime format
messages['signedup_at'] = pd.to_datetime(messages['signedup_at']) # Change signup timestamp to datetime format
messages['deactivated_at'] = pd.to_datetime(messages['deactivated_at']) # Change deactivation timestamp to datetime format
messages['created_at'] = pd.to_datetime(messages['created_at']) # Change created at timestamp to datetime format
messages['subscriber_zip_code'] = messages['subscriber_zip_code'].fillna(-1) # Fill NAs in order to convert to integers
messages['subscriber_zip_code'] = messages['subscriber_zip_code'].astype(int).astype(str).str.zfill(5) # Pad zip codes with 0s
messages['subscriber_zip_code'] = messages['subscriber_zip_code'].replace('000-1', np.nan) # Fill NAs again

In [193]:
# Adjust message time stamps for timezones

messages['timezone_default_offset_adj'] = pd.to_timedelta(messages['timezone_default_offset_adj'],'h')
messages['outbound_message_created_at_adj'] = messages['outbound_message_created_at'] + messages['timezone_default_offset_adj']
messages

Unnamed: 0,message_id,outbound_message_created_at,outbound_message_body,outbound_message_partner_id,message_type,subscriber_id,created_at,signedup_at,deactivated_at,carrier_name,subscriber_zip_code,subscriber_city,timezone_default_offset,timezone_name,subscriber_state,subscriber_source,subscriber_status,subscriber_language,subscriber_deactivation_method,partner_name,partner_created_date,partner_is_active,partner_state,children_count,scheduled_message_tag,year,timezone,timezone_default_offset_adj,outbound_message_created_at_adj
0,5ec45471-2146-456a-acb4-da0d41e6c5bf,2017-05-01 14:40:45.723794,Is this zipcode correct? If yes reply with 'Y...,00000000-0000-0000-0000-000000000002,signup,42cace4c-0d50-407f-8276-609079e560e9,2017-05-01 14:04:57.337456,2017-05-01 14:04:57.337456,NaT,Verizon Wireless,07860,Newton,-5.000,America/New_York,New Jersey,Text SignUp,activated,English,,"WNET - New York, NY",2018-03-25 21:32:21.001262,True,New York,2.000,,2017,Eastern,-1 days +20:00:00,2017-05-01 10:40:45.723794
1,00f97ed1-d95f-47a7-8738-93d2ec8be5f1,2017-05-01 18:22:11.956767,Is this zipcode correct? If yes reply with 'Y...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,47764722-ac32-4362-9a1a-ce4503f4cf84,2017-05-01 18:16:25.565976,2017-05-01 18:16:25.565976,NaT,Verizon Wireless,78245,San Antonio,-6.000,America/Chicago,Texas,Text SignUp,activated,English,,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,1.000,,2017,Central,-1 days +19:00:00,2017-05-01 13:22:11.956767
2,ffb5de23-d828-4e3a-becf-8b10b80cefba,2017-05-01 18:34:33.615578,Unrecognized date of birth format. Please res...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,8f619c33-e411-460b-8bf8-ea3b4e668ea2,2017-05-01 18:31:47.699809,2017-05-01 18:31:47.699809,2018-04-30 14:46:49.920313,"T-Mobile USA, Inc.",,,,,,Text SignUp,deactivated,English,STOP,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,,,2017,,NaT,NaT
3,00f97ed1-d95f-47a7-8738-93d2ec8be5f1,2017-05-02 21:13:27.420256,Is this zipcode correct? If yes reply with 'Y...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,1003849f-4d8e-430b-af34-e5e1c8d8b4f6,2017-05-02 21:10:55.406804,2017-05-02 21:10:55.406804,NaT,AT&T Wireless,78253,San Antonio,-6.000,America/Chicago,Texas,Text SignUp,activated,English,,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,1.000,,2017,Central,-1 days +19:00:00,2017-05-02 16:13:27.420256
4,68c00142-67b6-4c14-93a3-948b425ccc25,2017-05-03 11:38:58.633927,Unrecognized date of birth format. Please res...,00000000-0000-0000-0000-000000000002,signup,d184f7d8-764c-4792-86ba-de17563b3a0b,2017-05-03 10:30:20.168807,2017-05-03 10:30:20.168807,2018-04-30 14:46:53.934041,Verizon Wireless,,,,,,Text SignUp,deactivated,English,STOP,"WNET - New York, NY",2018-03-25 21:32:21.001262,True,New York,,,2017,,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19130772,144f86ab-0398-42bb-81cb-e0420a4d6ba0,2020-02-29 02:21:44.804800,BBT tips and resources are tailored to the age...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:21:44.804800
19130773,a0e70d54-86a7-4c55-ac39-69fed5584dc0,2020-02-29 02:22:06.855964,"BBT provides research-based, quality info crea...",c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:22:06.855964
19130774,144f86ab-0398-42bb-81cb-e0420a4d6ba0,2020-02-29 02:22:37.326561,BBT tips and resources are tailored to the age...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:22:37.326561
19130775,9ef2e268-1264-4836-a4c9-20a0997d0eb4,2020-02-29 02:23:30.233406,Thank you for signing up for Bright by Text. (...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:23:30.233406


In [194]:
messages['outbound_message_created_at_adj'] = pd.to_datetime(messages['outbound_message_created_at_adj']) 
day_dict = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
messages['outbound_message_created_at_day'] = messages['outbound_message_created_at_adj'].dt.dayofweek.map(day_dict)
messages

Unnamed: 0,message_id,outbound_message_created_at,outbound_message_body,outbound_message_partner_id,message_type,subscriber_id,created_at,signedup_at,deactivated_at,carrier_name,subscriber_zip_code,subscriber_city,timezone_default_offset,timezone_name,subscriber_state,subscriber_source,subscriber_status,subscriber_language,subscriber_deactivation_method,partner_name,partner_created_date,partner_is_active,partner_state,children_count,scheduled_message_tag,year,timezone,timezone_default_offset_adj,outbound_message_created_at_adj,outbound_message_created_at_day
0,5ec45471-2146-456a-acb4-da0d41e6c5bf,2017-05-01 14:40:45.723794,Is this zipcode correct? If yes reply with 'Y...,00000000-0000-0000-0000-000000000002,signup,42cace4c-0d50-407f-8276-609079e560e9,2017-05-01 14:04:57.337456,2017-05-01 14:04:57.337456,NaT,Verizon Wireless,07860,Newton,-5.000,America/New_York,New Jersey,Text SignUp,activated,English,,"WNET - New York, NY",2018-03-25 21:32:21.001262,True,New York,2.000,,2017,Eastern,-1 days +20:00:00,2017-05-01 10:40:45.723794,Monday
1,00f97ed1-d95f-47a7-8738-93d2ec8be5f1,2017-05-01 18:22:11.956767,Is this zipcode correct? If yes reply with 'Y...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,47764722-ac32-4362-9a1a-ce4503f4cf84,2017-05-01 18:16:25.565976,2017-05-01 18:16:25.565976,NaT,Verizon Wireless,78245,San Antonio,-6.000,America/Chicago,Texas,Text SignUp,activated,English,,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,1.000,,2017,Central,-1 days +19:00:00,2017-05-01 13:22:11.956767,Monday
2,ffb5de23-d828-4e3a-becf-8b10b80cefba,2017-05-01 18:34:33.615578,Unrecognized date of birth format. Please res...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,8f619c33-e411-460b-8bf8-ea3b4e668ea2,2017-05-01 18:31:47.699809,2017-05-01 18:31:47.699809,2018-04-30 14:46:49.920313,"T-Mobile USA, Inc.",,,,,,Text SignUp,deactivated,English,STOP,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,,,2017,,NaT,NaT,
3,00f97ed1-d95f-47a7-8738-93d2ec8be5f1,2017-05-02 21:13:27.420256,Is this zipcode correct? If yes reply with 'Y...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,1003849f-4d8e-430b-af34-e5e1c8d8b4f6,2017-05-02 21:10:55.406804,2017-05-02 21:10:55.406804,NaT,AT&T Wireless,78253,San Antonio,-6.000,America/Chicago,Texas,Text SignUp,activated,English,,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,1.000,,2017,Central,-1 days +19:00:00,2017-05-02 16:13:27.420256,Tuesday
4,68c00142-67b6-4c14-93a3-948b425ccc25,2017-05-03 11:38:58.633927,Unrecognized date of birth format. Please res...,00000000-0000-0000-0000-000000000002,signup,d184f7d8-764c-4792-86ba-de17563b3a0b,2017-05-03 10:30:20.168807,2017-05-03 10:30:20.168807,2018-04-30 14:46:53.934041,Verizon Wireless,,,,,,Text SignUp,deactivated,English,STOP,"WNET - New York, NY",2018-03-25 21:32:21.001262,True,New York,,,2017,,NaT,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19130772,144f86ab-0398-42bb-81cb-e0420a4d6ba0,2020-02-29 02:21:44.804800,BBT tips and resources are tailored to the age...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:21:44.804800,Friday
19130773,a0e70d54-86a7-4c55-ac39-69fed5584dc0,2020-02-29 02:22:06.855964,"BBT provides research-based, quality info crea...",c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:22:06.855964,Friday
19130774,144f86ab-0398-42bb-81cb-e0420a4d6ba0,2020-02-29 02:22:37.326561,BBT tips and resources are tailored to the age...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:22:37.326561,Friday
19130775,9ef2e268-1264-4836-a4c9-20a0997d0eb4,2020-02-29 02:23:30.233406,Thank you for signing up for Bright by Text. (...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:23:30.233406,Friday


### Day of Week Sent

In [195]:
messages.groupby('year')['outbound_message_created_at_day'].value_counts(normalize = True)

year  outbound_message_created_at_day
2017  Tuesday                           0.163
      Friday                            0.157
      Wednesday                         0.148
      Monday                            0.143
      Thursday                          0.136
      Saturday                          0.133
      Sunday                            0.121
2018  Friday                            0.153
      Tuesday                           0.148
      Thursday                          0.147
      Wednesday                         0.147
      Monday                            0.144
      Saturday                          0.135
      Sunday                            0.126
2019  Friday                            0.170
      Thursday                          0.165
      Wednesday                         0.160
      Monday                            0.154
      Tuesday                           0.130
      Saturday                          0.126
      Sunday                            0.

The most common day messages are sent seems to vary over time. Messages are sent more frequently during the weekdays.

In [196]:
messages.groupby('message_type')['outbound_message_created_at_day'].value_counts(normalize = True)

message_type  outbound_message_created_at_day
national      Tuesday                           0.280
              Friday                            0.235
              Saturday                          0.222
              Thursday                          0.167
              Wednesday                         0.096
normal        Wednesday                         0.145
              Friday                            0.144
              Monday                            0.143
              Tuesday                           0.143
              Saturday                          0.143
              Thursday                          0.142
              Sunday                            0.140
scheduled     Friday                            0.201
              Wednesday                         0.192
              Thursday                          0.188
              Monday                            0.173
              Tuesday                           0.113
              Saturday              

In [197]:
# 2020 only

messages[messages['year'] == 2020].groupby('message_type')['outbound_message_created_at_day'].value_counts(normalize = True)

message_type  outbound_message_created_at_day
national      Thursday                          0.501
              Tuesday                           0.499
normal        Friday                            0.153
              Wednesday                         0.153
              Thursday                          0.153
              Sunday                            0.136
              Saturday                          0.136
              Monday                            0.135
              Tuesday                           0.134
scheduled     Wednesday                         0.231
              Friday                            0.181
              Tuesday                           0.159
              Monday                            0.125
              Sunday                            0.124
              Thursday                          0.115
              Saturday                          0.065
signup        Thursday                          0.177
              Tuesday               

- National messages: Overall, most messages were sent out Tuesday, Friday, or Saturday. In 2020, half of national messages were sent out on Thursday and half were sent out on Tuesday, with no messages sent out on any other day.
- Normal messages: Messages are sent pretty evenly across the seven days of the week. 
- Scheduled messages: The most messages are sent out on Friday and Wednesday. The least number of messages are sent out on Tuesday, Saturday, and Sunday. In 2020, the most messages were sent out on Wednesday, followed by Friday, and the least number of messages were sent out on Saturday.
- Signup messages: Most messages are sent out Monday through Thursday. The least number of messages are sent out on Saturday and Sunday. 
- Survey messages: The majority of messages are sent on Tuesday, followed by Thursday, with barely any messages sent on any of the other days. In 2020, survey messages have almost always been sent out on Tuesday.

### Time of Day Sent

In [198]:
messages['outbound_message_created_at_hour'] = messages['outbound_message_created_at_adj'].dt.hour
messages['outbound_message_created_at_hour'].value_counts(normalize = True)

11.000   0.379
15.000   0.178
14.000   0.106
10.000   0.058
17.000   0.052
12.000   0.044
13.000   0.044
9.000    0.040
16.000   0.034
18.000   0.019
8.000    0.016
7.000    0.013
19.000   0.010
6.000    0.002
20.000   0.002
21.000   0.001
22.000   0.001
23.000   0.000
5.000    0.000
0.000    0.000
1.000    0.000
4.000    0.000
3.000    0.000
2.000    0.000
Name: outbound_message_created_at_hour, dtype: float64

In [199]:
messages[messages['year'] == 2020]['outbound_message_created_at_hour'].value_counts(normalize = True)

11.000   0.578
15.000   0.061
13.000   0.050
17.000   0.044
12.000   0.043
9.000    0.041
14.000   0.037
10.000   0.034
8.000    0.031
7.000    0.030
16.000   0.020
18.000   0.015
19.000   0.013
6.000    0.001
20.000   0.001
21.000   0.000
22.000   0.000
23.000   0.000
0.000    0.000
5.000    0.000
2.000    0.000
1.000    0.000
3.000    0.000
4.000    0.000
Name: outbound_message_created_at_hour, dtype: float64

The majority of messages are sent at 11am local time.

In [200]:
pd.set_option('display.max_rows', 500)
pd.options.display.float_format = '{:.3f}'.format
pd.DataFrame(messages.groupby('message_type')['outbound_message_created_at_hour'].value_counts(normalize = True))

Unnamed: 0_level_0,Unnamed: 1_level_0,outbound_message_created_at_hour
message_type,outbound_message_created_at_hour,Unnamed: 2_level_1
national,12.0,0.204
national,13.0,0.177
national,11.0,0.149
national,14.0,0.143
national,10.0,0.095
national,15.0,0.092
national,16.0,0.044
national,17.0,0.036
national,9.0,0.032
national,18.0,0.008


In [201]:
pd.DataFrame(messages[messages['year'] == 2020].groupby('message_type')['outbound_message_created_at_hour'].value_counts(normalize = True))

Unnamed: 0_level_0,Unnamed: 1_level_0,outbound_message_created_at_hour
message_type,outbound_message_created_at_hour,Unnamed: 2_level_1
national,15.0,0.161
national,12.0,0.148
national,17.0,0.145
national,13.0,0.142
national,16.0,0.136
national,11.0,0.128
national,14.0,0.092
national,10.0,0.027
national,18.0,0.02
national,9.0,0.001


- National messages: 20% of national messages are sent at noon local time. 97% of all national messages are sent between 9am-5pm local time. In 2020, national messages were most frequently sent at 3pm local time. Essentially all national messages were sent between 10am-6pm local time in 2020.
- Normal messages: 50% of all normal messages are all sent at 11am local time. All normal messages were sent at 11am local time in 2020.
- Scheduled messages: Scheduled messages are sent most frequently at 10am and 11am. 79% are sent between 9am-3pm local time. In 2020, scheduled messages were sent most frequently at 11am.
- Signup messages: Signup messages are sent at a variety of times, most often sent between 9am-11am. 92% of messages are sent between 7am-5pm. In 2020, signup messages were sent most frequently between 11am-1pm.
- Survey messages: Survey messages are mostly sent at 3pm local time. In 2020, they were all sent at 3pm local time.

## Frequency of Messages

In [202]:
messages['days_since_deactivated'] = messages['deactivated_at'] - messages['created_at']
messages['days_since_signup'] = pd.Timestamp('2020-02-29T08') - messages['created_at']
messages['subscriber_days'] = messages[['days_since_deactivated', 'days_since_signup']].min(axis = 1)
messages['subscriber_days'] = messages['subscriber_days'] / datetime.timedelta(days = 1)
messages['subscriber_weeks'] = messages['subscriber_days'] / 7
messages

Unnamed: 0,message_id,outbound_message_created_at,outbound_message_body,outbound_message_partner_id,message_type,subscriber_id,created_at,signedup_at,deactivated_at,carrier_name,subscriber_zip_code,subscriber_city,timezone_default_offset,timezone_name,subscriber_state,subscriber_source,subscriber_status,subscriber_language,subscriber_deactivation_method,partner_name,partner_created_date,partner_is_active,partner_state,children_count,scheduled_message_tag,year,timezone,timezone_default_offset_adj,outbound_message_created_at_adj,outbound_message_created_at_day,outbound_message_created_at_hour,days_since_deactivated,days_since_signup,subscriber_days,subscriber_weeks
0,5ec45471-2146-456a-acb4-da0d41e6c5bf,2017-05-01 14:40:45.723794,Is this zipcode correct? If yes reply with 'Y...,00000000-0000-0000-0000-000000000002,signup,42cace4c-0d50-407f-8276-609079e560e9,2017-05-01 14:04:57.337456,2017-05-01 14:04:57.337456,NaT,Verizon Wireless,07860,Newton,-5.000,America/New_York,New Jersey,Text SignUp,activated,English,,"WNET - New York, NY",2018-03-25 21:32:21.001262,True,New York,2.000,,2017,Eastern,-1 days +20:00:00,2017-05-01 10:40:45.723794,Monday,10.000,NaT,1033 days 17:55:02.662544,1033.747,147.678
1,00f97ed1-d95f-47a7-8738-93d2ec8be5f1,2017-05-01 18:22:11.956767,Is this zipcode correct? If yes reply with 'Y...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,47764722-ac32-4362-9a1a-ce4503f4cf84,2017-05-01 18:16:25.565976,2017-05-01 18:16:25.565976,NaT,Verizon Wireless,78245,San Antonio,-6.000,America/Chicago,Texas,Text SignUp,activated,English,,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,1.000,,2017,Central,-1 days +19:00:00,2017-05-01 13:22:11.956767,Monday,13.000,NaT,1033 days 13:43:34.434024,1033.572,147.653
2,ffb5de23-d828-4e3a-becf-8b10b80cefba,2017-05-01 18:34:33.615578,Unrecognized date of birth format. Please res...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,8f619c33-e411-460b-8bf8-ea3b4e668ea2,2017-05-01 18:31:47.699809,2017-05-01 18:31:47.699809,2018-04-30 14:46:49.920313,"T-Mobile USA, Inc.",,,,,,Text SignUp,deactivated,English,STOP,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,,,2017,,NaT,NaT,,,363 days 20:15:02.220504,1033 days 13:28:12.300191,363.844,51.978
3,00f97ed1-d95f-47a7-8738-93d2ec8be5f1,2017-05-02 21:13:27.420256,Is this zipcode correct? If yes reply with 'Y...,f92494a8-bcce-473f-a56e-0ae2515c827e,signup,1003849f-4d8e-430b-af34-e5e1c8d8b4f6,2017-05-02 21:10:55.406804,2017-05-02 21:10:55.406804,NaT,AT&T Wireless,78253,San Antonio,-6.000,America/Chicago,Texas,Text SignUp,activated,English,,KLRN - San Antonio,2018-08-28 20:58:04.700242,True,Texas,1.000,,2017,Central,-1 days +19:00:00,2017-05-02 16:13:27.420256,Tuesday,16.000,NaT,1032 days 10:49:04.593196,1032.451,147.493
4,68c00142-67b6-4c14-93a3-948b425ccc25,2017-05-03 11:38:58.633927,Unrecognized date of birth format. Please res...,00000000-0000-0000-0000-000000000002,signup,d184f7d8-764c-4792-86ba-de17563b3a0b,2017-05-03 10:30:20.168807,2017-05-03 10:30:20.168807,2018-04-30 14:46:53.934041,Verizon Wireless,,,,,,Text SignUp,deactivated,English,STOP,"WNET - New York, NY",2018-03-25 21:32:21.001262,True,New York,,,2017,,NaT,NaT,,,362 days 04:16:33.765234,1031 days 21:29:39.831193,362.178,51.740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19130772,144f86ab-0398-42bb-81cb-e0420a4d6ba0,2020-02-29 02:21:44.804800,BBT tips and resources are tailored to the age...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:21:44.804800,Friday,19.000,NaT,0 days 05:38:39.546064,0.235,0.034
19130773,a0e70d54-86a7-4c55-ac39-69fed5584dc0,2020-02-29 02:22:06.855964,"BBT provides research-based, quality info crea...",c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:22:06.855964,Friday,19.000,NaT,0 days 05:38:39.546064,0.235,0.034
19130774,144f86ab-0398-42bb-81cb-e0420a4d6ba0,2020-02-29 02:22:37.326561,BBT tips and resources are tailored to the age...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:22:37.326561,Friday,19.000,NaT,0 days 05:38:39.546064,0.235,0.034
19130775,9ef2e268-1264-4836-a4c9-20a0997d0eb4,2020-02-29 02:23:30.233406,Thank you for signing up for Bright by Text. (...,c67af927-93ee-42ff-be90-db2dab4db333,signup,18877b1e-4058-4179-8644-3ad77cf43bd8,2020-02-29 02:21:20.453936,2020-02-29 02:23:29.972976,NaT,"T-Mobile USA, Inc.",80239,Denver,-7.000,America/Denver,Colorado,SMS/Web,activated,English,,CO Best Start Program,2018-11-01 16:29:02.722421,True,Colorado,1.000,,2020,Mountain,-1 days +17:00:00,2020-02-28 19:23:30.233406,Friday,19.000,NaT,0 days 05:38:39.546064,0.235,0.034


In [203]:
messages_per_subscriber_zip_code = messages.groupby('subscriber_zip_code').agg({'subscriber_id':'nunique', 'message_id':'count', 'subscriber_weeks': 'sum'}).reset_index()
messages_per_subscriber_zip_code.columns = ['subscriber_zip_code', 'subscriber_count', 'message_count', 'subscriber_weeks']
messages_per_subscriber_zip_code['messages_per_subscriber_per_week'] = (messages_per_subscriber_zip_code['message_count'] / messages_per_subscriber_zip_code['subscriber_weeks'] ) / messages_per_subscriber_zip_code['subscriber_count']
messages_per_subscriber_zip_code

Unnamed: 0,subscriber_zip_code,subscriber_count,message_count,subscriber_weeks,messages_per_subscriber_per_week
0,00501,1,53,4890.497,0.011
1,00544,1,39,3488.164,0.011
2,00960,1,98,3930.427,0.025
3,01011,1,33,422.470,0.078
4,01060,1,326,27080.735,0.012
...,...,...,...,...,...
8175,99801,34,1388,26405.217,0.002
8176,99803,3,118,1876.232,0.021
8177,99824,1,4,1.676,2.386
8178,99833,1,15,41.919,0.358


In [204]:
print("There is an average of {:,.3f} messages per subscriber sent out per week per zip code.".format(messages_per_subscriber_zip_code['messages_per_subscriber_per_week'].mean()))
print("There is a median of {:,.3f} messages per subscriber sent out per week per zip code.".format(messages_per_subscriber_zip_code['messages_per_subscriber_per_week'].median()))

There is an average of 29.154 messages per subscriber sent out per week per zip code.
There is a median of 0.008 messages per subscriber sent out per week per zip code.


In [205]:
messages['scheduled_message_tag'].value_counts()

event                                           882372
social connections                              852841
knowledge of parenting and child development    803146
education resource                              760587
social and emotional competence of children     540470
parental resilience                             226535
concrete support in times of need               213746
health resource                                 163022
other community referral resource               129360
public service announcement                      56018
Name: scheduled_message_tag, dtype: int64

In [206]:
# Add columns based on scheduled_message_tags

tags = ['event', 'social connections', 'knowledge of parenting and child development', 'education resource', 'social and emotional competence of children', 'parental resilience', 'concrete support in times of need', 'health resource', 'other community referral resource', 'public service announcement']

for tag in tags:
    messages['tag_' + tag] = np.where(messages['scheduled_message_tag'] == tag, 1, 0)

messages.columns = messages.columns.str.replace(' ', '_')

In [207]:
# Messages per subscriber

messages_per_subscriber = messages.groupby('subscriber_id').agg({'message_id':'count', 'subscriber_weeks': 'sum'}).reset_index()
messages_per_subscriber.columns = ['subscriber_id', 'message_count', 'subscriber_weeks']
messages_per_subscriber['messages_per_subscriber_per_week'] = messages_per_subscriber['message_count'] / messages_per_subscriber['subscriber_weeks']
messages_per_subscriber

Unnamed: 0,subscriber_id,message_count,subscriber_weeks,messages_per_subscriber_per_week
0,000018d6-c7fe-4b3c-8bf7-8502e4ae2d9b,293,25539.841,0.011
1,00005a8a-477e-46c6-82f9-e4678760599c,48,959.152,0.050
2,0000661e-1c9c-4dde-a9f6-b2a8291142fd,119,3511.674,0.034
3,00007706-12cc-48e8-a732-649de7ec8b50,114,2210.440,0.052
4,000128d7-701e-4919-ad93-fd2ef3a5f69d,251,13069.596,0.019
...,...,...,...,...
106780,fffc68c4-e8b7-4060-bce9-e6b6ca74b00c,322,32409.833,0.010
106781,fffd402d-4bbd-4322-8389-a159f1dfa038,18,25.098,0.717
106782,fffdbcab-63d2-4158-9743-bbb138036518,6,1.910,3.141
106783,fffde26d-0880-4ca0-a67e-46dcd77f4dab,508,71466.710,0.007


## Linking Texts to Landing Page URL in Google Analytics

In [None]:
messages['bitly_link'] = messages['outbound_message_body'].str.extract(r'.*(bit.ly\/.*)', expand=True)

# Analyze correlation of various factors to deactivation

In [208]:
# Message tags

columns = ['subscriber_id'] + [col for col in messages if col.startswith('tag')]

tags_per_subscriber = messages[columns].groupby('subscriber_id').agg('max').reset_index()
tags_per_subscriber.columns = columns
tags_per_subscriber

Unnamed: 0,subscriber_id,tag_event,tag_social_connections,tag_knowledge_of_parenting_and_child_development,tag_education_resource,tag_social_and_emotional_competence_of_children,tag_parental_resilience,tag_concrete_support_in_times_of_need,tag_health_resource,tag_other_community_referral_resource,tag_public_service_announcement
0,000018d6-c7fe-4b3c-8bf7-8502e4ae2d9b,0,0,0,0,0,0,0,0,0,0
1,00005a8a-477e-46c6-82f9-e4678760599c,0,0,0,0,0,0,0,0,0,0
2,0000661e-1c9c-4dde-a9f6-b2a8291142fd,1,1,1,1,1,1,1,1,1,1
3,00007706-12cc-48e8-a732-649de7ec8b50,0,0,0,0,0,0,0,0,0,0
4,000128d7-701e-4919-ad93-fd2ef3a5f69d,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
106780,fffc68c4-e8b7-4060-bce9-e6b6ca74b00c,1,1,1,1,1,1,1,0,1,1
106781,fffd402d-4bbd-4322-8389-a159f1dfa038,1,1,1,1,1,0,0,0,0,0
106782,fffdbcab-63d2-4158-9743-bbb138036518,0,0,0,0,0,0,0,0,0,0
106783,fffde26d-0880-4ca0-a67e-46dcd77f4dab,1,1,1,1,1,1,1,1,1,1


In [209]:
messages['stop'] = np.where(messages['subscriber_deactivation_method'] == 'STOP', 1, 0)
stop = messages.groupby('subscriber_id')['stop'].max().reset_index()
stop.columns = ['subscriber_id', 'stop']
stop

Unnamed: 0,subscriber_id,stop
0,000018d6-c7fe-4b3c-8bf7-8502e4ae2d9b,1
1,00005a8a-477e-46c6-82f9-e4678760599c,1
2,0000661e-1c9c-4dde-a9f6-b2a8291142fd,0
3,00007706-12cc-48e8-a732-649de7ec8b50,0
4,000128d7-701e-4919-ad93-fd2ef3a5f69d,0
...,...,...
106780,fffc68c4-e8b7-4060-bce9-e6b6ca74b00c,0
106781,fffd402d-4bbd-4322-8389-a159f1dfa038,0
106782,fffdbcab-63d2-4158-9743-bbb138036518,0
106783,fffde26d-0880-4ca0-a67e-46dcd77f4dab,0


In [210]:
a = messages_per_subscriber.merge(tags_per_subscriber, left_on = 'subscriber_id', right_on = 'subscriber_id', left_index = True, right_index = False)
message_corr = a.merge(stop, left_on = 'subscriber_id', right_on = 'subscriber_id', left_index = True, right_index = False)
message_corr

Unnamed: 0,subscriber_id,message_count,subscriber_weeks,messages_per_subscriber_per_week,tag_event,tag_social_connections,tag_knowledge_of_parenting_and_child_development,tag_education_resource,tag_social_and_emotional_competence_of_children,tag_parental_resilience,tag_concrete_support_in_times_of_need,tag_health_resource,tag_other_community_referral_resource,tag_public_service_announcement,stop
0,000018d6-c7fe-4b3c-8bf7-8502e4ae2d9b,293,25539.841,0.011,0,0,0,0,0,0,0,0,0,0,1
1,00005a8a-477e-46c6-82f9-e4678760599c,48,959.152,0.050,0,0,0,0,0,0,0,0,0,0,1
2,0000661e-1c9c-4dde-a9f6-b2a8291142fd,119,3511.674,0.034,1,1,1,1,1,1,1,1,1,1,0
3,00007706-12cc-48e8-a732-649de7ec8b50,114,2210.440,0.052,0,0,0,0,0,0,0,0,0,0,0
4,000128d7-701e-4919-ad93-fd2ef3a5f69d,251,13069.596,0.019,1,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106780,fffc68c4-e8b7-4060-bce9-e6b6ca74b00c,322,32409.833,0.010,1,1,1,1,1,1,1,0,1,1,0
106781,fffd402d-4bbd-4322-8389-a159f1dfa038,18,25.098,0.717,1,1,1,1,1,0,0,0,0,0,0
106782,fffdbcab-63d2-4158-9743-bbb138036518,6,1.910,3.141,0,0,0,0,0,0,0,0,0,0,0
106783,fffde26d-0880-4ca0-a67e-46dcd77f4dab,508,71466.710,0.007,1,1,1,1,1,1,1,1,1,1,0


In [211]:
message_corr = message_corr.drop(columns = ['subscriber_id', 'message_count', 'subscriber_weeks'])
message_corr.corr()

Unnamed: 0,messages_per_subscriber_per_week,tag_event,tag_social_connections,tag_knowledge_of_parenting_and_child_development,tag_education_resource,tag_social_and_emotional_competence_of_children,tag_parental_resilience,tag_concrete_support_in_times_of_need,tag_health_resource,tag_other_community_referral_resource,tag_public_service_announcement,stop
messages_per_subscriber_per_week,1.0,-0.064,-0.063,-0.07,-0.07,-0.062,-0.048,-0.048,-0.049,-0.05,-0.033,0.065
tag_event,-0.064,1.0,0.98,0.871,0.872,0.908,0.722,0.721,0.678,0.732,0.484,-0.359
tag_social_connections,-0.063,0.98,1.0,0.855,0.857,0.911,0.719,0.719,0.668,0.732,0.485,-0.356
tag_knowledge_of_parenting_and_child_development,-0.07,0.871,0.855,1.0,0.975,0.869,0.684,0.677,0.691,0.692,0.46,-0.402
tag_education_resource,-0.07,0.872,0.857,0.975,1.0,0.875,0.689,0.668,0.685,0.693,0.449,-0.4
tag_social_and_emotional_competence_of_children,-0.062,0.908,0.911,0.869,0.875,1.0,0.73,0.715,0.697,0.739,0.484,-0.363
tag_parental_resilience,-0.048,0.722,0.719,0.684,0.689,0.73,1.0,0.724,0.812,0.672,0.402,-0.302
tag_concrete_support_in_times_of_need,-0.048,0.721,0.719,0.677,0.668,0.715,0.724,1.0,0.634,0.8,0.652,-0.301
tag_health_resource,-0.049,0.678,0.668,0.691,0.685,0.697,0.812,0.634,1.0,0.598,0.325,-0.337
tag_other_community_referral_resource,-0.05,0.732,0.732,0.692,0.693,0.739,0.672,0.8,0.598,1.0,0.569,-0.324


It looks like STOP is slightly positively correlated with messages per subscriber per week, and negatively correlated with all tags.

## Add in number of children as a variable

In [217]:
children = messages[messages['children_count'] >= 1].groupby('subscriber_id')['children_count'].agg('max').reset_index()
children.columns = ['subscriber_id', 'children_count']
children

Unnamed: 0,subscriber_id,children_count
0,000018d6-c7fe-4b3c-8bf7-8502e4ae2d9b,1.000
1,00005a8a-477e-46c6-82f9-e4678760599c,1.000
2,0000661e-1c9c-4dde-a9f6-b2a8291142fd,1.000
3,00007706-12cc-48e8-a732-649de7ec8b50,2.000
4,000128d7-701e-4919-ad93-fd2ef3a5f69d,1.000
...,...,...
92985,fffb736f-0273-4f16-8151-8ded8a8b1aa5,2.000
92986,fffc68c4-e8b7-4060-bce9-e6b6ca74b00c,1.000
92987,fffd402d-4bbd-4322-8389-a159f1dfa038,1.000
92988,fffde26d-0880-4ca0-a67e-46dcd77f4dab,1.000


In [218]:
a = children.merge(messages_per_subscriber, how = 'left', left_on = 'subscriber_id', right_on = 'subscriber_id', left_index = True, right_index = False)
b = a.merge(tags_per_subscriber, left_on = 'subscriber_id', right_on = 'subscriber_id', left_index = True, right_index = False)
message_corr = b.merge(stop, left_on = 'subscriber_id', right_on = 'subscriber_id', left_index = True, right_index = False)
message_corr

Unnamed: 0,subscriber_id,children_count,message_count,subscriber_weeks,messages_per_subscriber_per_week,tag_event,tag_social_connections,tag_knowledge_of_parenting_and_child_development,tag_education_resource,tag_social_and_emotional_competence_of_children,tag_parental_resilience,tag_concrete_support_in_times_of_need,tag_health_resource,tag_other_community_referral_resource,tag_public_service_announcement,stop
0,000018d6-c7fe-4b3c-8bf7-8502e4ae2d9b,1.000,293,25539.841,0.011,0,0,0,0,0,0,0,0,0,0,1
1,00005a8a-477e-46c6-82f9-e4678760599c,1.000,48,959.152,0.050,0,0,0,0,0,0,0,0,0,0,1
2,0000661e-1c9c-4dde-a9f6-b2a8291142fd,1.000,119,3511.674,0.034,1,1,1,1,1,1,1,1,1,1,0
3,00007706-12cc-48e8-a732-649de7ec8b50,2.000,114,2210.440,0.052,0,0,0,0,0,0,0,0,0,0,0
4,000128d7-701e-4919-ad93-fd2ef3a5f69d,1.000,251,13069.596,0.019,1,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106779,fffb736f-0273-4f16-8151-8ded8a8b1aa5,2.000,195,4971.629,0.039,1,1,1,1,1,1,0,1,1,0,0
106780,fffc68c4-e8b7-4060-bce9-e6b6ca74b00c,1.000,322,32409.833,0.010,1,1,1,1,1,1,1,0,1,1,0
106781,fffd402d-4bbd-4322-8389-a159f1dfa038,1.000,18,25.098,0.717,1,1,1,1,1,0,0,0,0,0,0
106783,fffde26d-0880-4ca0-a67e-46dcd77f4dab,1.000,508,71466.710,0.007,1,1,1,1,1,1,1,1,1,1,0


In [219]:
message_corr = message_corr.drop(columns = ['subscriber_id', 'message_count', 'subscriber_weeks'])
message_corr.corr()

Unnamed: 0,children_count,messages_per_subscriber_per_week,tag_event,tag_social_connections,tag_knowledge_of_parenting_and_child_development,tag_education_resource,tag_social_and_emotional_competence_of_children,tag_parental_resilience,tag_concrete_support_in_times_of_need,tag_health_resource,tag_other_community_referral_resource,tag_public_service_announcement,stop
children_count,1.0,0.009,-0.005,-0.006,-0.005,-0.006,-0.005,0.021,-0.006,0.019,-0.024,-0.024,0.012
messages_per_subscriber_per_week,0.009,1.0,-0.096,-0.094,-0.108,-0.107,-0.091,-0.068,-0.067,-0.068,-0.07,-0.044,0.125
tag_event,-0.005,-0.096,1.0,0.974,0.826,0.828,0.881,0.668,0.667,0.613,0.676,0.43,-0.419
tag_social_connections,-0.006,-0.094,0.974,1.0,0.807,0.809,0.885,0.665,0.666,0.602,0.677,0.431,-0.413
tag_knowledge_of_parenting_and_child_development,-0.005,-0.108,0.826,0.807,1.0,0.965,0.829,0.62,0.611,0.627,0.626,0.401,-0.485
tag_education_resource,-0.006,-0.107,0.828,0.809,0.965,1.0,0.837,0.627,0.601,0.62,0.626,0.387,-0.481
tag_social_and_emotional_competence_of_children,-0.005,-0.091,0.881,0.885,0.829,0.837,1.0,0.679,0.661,0.638,0.686,0.43,-0.419
tag_parental_resilience,0.021,-0.068,0.668,0.665,0.62,0.627,0.679,1.0,0.681,0.783,0.619,0.345,-0.333
tag_concrete_support_in_times_of_need,-0.006,-0.067,0.667,0.666,0.611,0.601,0.661,0.681,1.0,0.576,0.767,0.623,-0.332
tag_health_resource,0.019,-0.068,0.613,0.602,0.627,0.62,0.638,0.783,0.576,1.0,0.531,0.259,-0.374


Number of children is slightly positively correlated with STOP, as well as messages per subscribers per week (which makes sense). 

# Next Steps

- Turn days/times messages are sent into visualizations
- Analyze frequency of messages based on age of children
- Find distribution of amount of time before deactivation
- Create visualizations indicating frequency of messages and types of messages received before deactivation 
- Create visualization showing messages/week for non-deactivated vs. messages/week for deactivated
- Analyze inbound messages
- Analyze effect of day/time of messages on STOP rates
- Nim mentioned interest in doing some analysis on inbound messages to see what people are texting in, to potentially help them implement a chatbot. Maybe we can do some simple content analysis here.