In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd

In [2]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [3]:
def subtract_df(df1, df2, on=None):
    """
    Anti-join: return the rows of df1 whose key does not appear in df2.

    Parameters:
    df1 (pandas.DataFrame): The DataFrame to subtract from.
    df2 (pandas.DataFrame): The DataFrame whose keys are removed from df1.
    on (str or list of str, optional): The column(s) to use as the join key(s).
        If not specified, all columns common to both DataFrames are used.

    Returns:
    pandas.DataFrame: The rows of df1 that have no match in df2. Every column of
        df1 is kept under its original name. Columns that exist only in df2 are
        carried over by the merge and hold missing values. Row labels are the
        positions assigned by the merge, so they are not renumbered.
    """
    # Resolve the join keys the same way DataFrame.merge does when `on` is None
    if on is None:
        key_cols = [col for col in df1.columns if col in df2.columns]
    elif isinstance(on, (list, tuple)):
        key_cols = list(on)
    else:
        key_cols = [on]

    # Only non-key columns present in BOTH frames receive the right-hand suffix.
    # Record exactly those names instead of matching on the substring '_y',
    # which would also discard legitimate df1 columns such as 'birth_year'.
    right_suffix = '_y'
    suffixed_cols = [
        f'{col}{right_suffix}'
        for col in df2.columns
        if col in df1.columns and col not in key_cols
    ]

    # Left merge with an indicator column marking where each row was found
    merged = df1.merge(df2, on=on, how='left', suffixes=('', right_suffix), indicator=True)

    # Keep only the rows that exist in df1 but not in df2
    left_only = merged[merged['_merge'] == 'left_only']

    # Drop the indicator and the suffixed duplicates that came from df2
    return left_only.drop(columns=['_merge'] + suffixed_cols)

In [4]:
# Read all the required source files.
# The shared location is defined once so that pointing the notebook at a
# different data drop only requires changing a single line.
DATA_BASE_URL = 'https://als-hiring.s3.amazonaws.com/fake_data/2020-07-01_17%3A11%3A00'
cons_information = pd.read_csv(f'{DATA_BASE_URL}/cons.csv')
cons_email = pd.read_csv(f'{DATA_BASE_URL}/cons_email.csv')
cons_subscription = pd.read_csv(f'{DATA_BASE_URL}/cons_email_chapter_subscription.csv')

In [5]:
# Inspect the column names and dtypes of the constituent information table
cons_information.dtypes

cons_id                         int64
prefix                         object
firstname                      object
middlename                     object
lastname                       object
suffix                         object
salutation                     object
gender                         object
birth_dt                       object
title                          object
employer                       object
occupation                     object
income                        float64
source                         object
subsource                      object
userid                          int64
password                       object
is_validated                    int64
is_banned                       int64
change_password_next_login      int64
consent_type_id                 int64
create_dt                      object
create_app                      int64
create_user                     int64
modified_dt                    object
modified_app                    int64
modified_use

In [6]:
# Inspect the column names and dtypes of the constituent email table
cons_email.dtypes

cons_email_id            int64
cons_id                  int64
cons_email_type_id       int64
is_primary               int64
email                   object
canonical_local_part    object
domain                  object
double_validation       object
create_dt               object
create_app               int64
create_user              int64
modified_dt             object
modified_app             int64
modified_user            int64
status                   int64
note                    object
dtype: object

In [7]:
# Inspect the column names and dtypes of the chapter subscription table
cons_subscription.dtypes

cons_email_chapter_subscription_id     int64
cons_email_id                          int64
chapter_id                             int64
isunsub                                int64
unsub_dt                              object
modified_dt                           object
dtype: object

### Question 1


In [8]:
# Keep only the constituent columns that are needed downstream
cons_information_subset = cons_information.loc[:, ['cons_id', 'source', 'create_dt', 'modified_dt']]

In [9]:
# Keep only the email columns that are needed downstream
cons_email_subset = cons_email.loc[:, ['cons_email_id', 'cons_id', 'is_primary', 'email']]

In [10]:
# Keep only the subscription columns that are needed downstream
cons_subscription_subset = cons_subscription.loc[:, ['cons_email_id', 'chapter_id', 'isunsub']]

In [11]:
# Count how many email rows are flagged as primary (1) versus not primary (0)
cons_email_subset['is_primary'].value_counts()

0    794361
1    605639
Name: is_primary, dtype: int64

1 indicates True and 0 indicates False in all the boolean-valued columns.
The above statistics indicate that, in the email dataset, 794361 email ids are not primary and 605639 email ids are primary.

In [12]:
# Number of subscription rows whose cons_email_id is missing from the email data
len(subtract_df(cons_subscription_subset, cons_email_subset, on='cons_email_id'))

0

##### Hypothesis Check
In the above cell we check whether any email ids present in the subscription dataset are missing from the email dataset.

There are no such emails

In [13]:
# Keep only the rows flagged as primary email addresses, renumbering rows from 0
is_primary_email = cons_email_subset['is_primary'] == 1
cons_email_primary = cons_email_subset[is_primary_email].reset_index(drop=True)

Since we are only concerned with primary email addresses, the cell above keeps only those rows from our source data.

In [14]:
# Subscription records for chapter 1 only; the previous row labels are kept in an 'index' column
cons_subscription_chap1 = cons_subscription_subset.query('chapter_id == 1').reset_index()

In [15]:
# Number of email ids that have more than one chapter 1 record
chap1_rows_per_email = cons_subscription_chap1['cons_email_id'].value_counts()
(chap1_rows_per_email > 1).sum()

0

##### Hypothesis Check

In the above cell we check whether any email id has more than one chapter 1 record, i.e. has subscribed or unsubscribed to chapter 1 multiple times.

There are no such emails


In [16]:
# Chapter 1 records where the email is still subscribed (isunsub == 0)
cons_subscription_chap1_sub = cons_subscription_subset.query('chapter_id == 1 and isunsub == 0').reset_index()

In [17]:
# Chapter 1 records where the email has unsubscribed (isunsub == 1)
cons_subscription_chap1_notsub = cons_subscription_subset.query('chapter_id == 1 and isunsub == 1').reset_index()

My assumption here is that the column isunsub is a boolean column: if it is 1, the email is unsubscribed from the chapter; if it is 0, the email is subscribed to the chapter.

In [18]:
# Subscription records for every chapter other than chapter 1
cons_subscription_notchap1 = cons_subscription_subset.query('chapter_id != 1').reset_index()

In [19]:
# Spot check: records for email id 448407 among the non-chapter-1 subscriptions
cons_subscription_notchap1[cons_subscription_notchap1['cons_email_id']==448407]

Unnamed: 0,index,cons_email_id,chapter_id,isunsub
2,1772,448407,2,1
1855,52732,448407,4,1
48349,277208,448407,2,1


In [20]:
# Spot check: records for the same email id 448407 among the chapter 1 subscriptions
cons_subscription_chap1[cons_subscription_chap1['cons_email_id']==448407]

Unnamed: 0,index,cons_email_id,chapter_id,isunsub
946,946,448407,1,0


In the above case we notice that the same email id is subscribed to chapter 1 as well as to other chapters. Our goal here is to treat email ids that only have records for chapters other than 1 as non-subscribers of chapter 1.

An email id can be a subscriber of chapters 2, 3 and 4, but in this problem statement we are only concerned with chapter 1 subscribers. So even if an email id is subscribed to chapters 2 and 3, as far as we are concerned that email id is a non-subscriber of chapter 1.

But if an email id is subscribed to chapters 1, 2 and 4, then we consider that email id a subscriber of chapter 1.

In [21]:
# Non-chapter-1 records whose email id is also subscribed to chapter 1.
# Only the key column of the chapter 1 subscribers is joined, so no duplicate columns are created.
chap1_subscriber_ids = cons_subscription_chap1_sub[['cons_email_id']]
cons_subscription_commonchap = cons_subscription_notchap1.merge(chap1_subscriber_ids, on='cons_email_id')

In [22]:
# Number of non-chapter-1 records belonging to email ids that are not chapter 1 subscribers
len(cons_subscription_notchap1) - len(cons_subscription_commonchap)

67115

In [23]:
# Non-chapter-1 records for email ids that are not chapter 1 subscribers, renumbered from 0
cons_subscription_notsubchap1 = subtract_df(
    cons_subscription_notchap1,
    cons_subscription_commonchap,
    on='cons_email_id',
).reset_index(drop=True)

From the above cells we see that there are 67115 subscription records (an email id can have more than one) belonging to email ids that have not subscribed to chapter 1 but do appear under other chapters.

So as far as we are concerned they are non-subscribers of chapter 1.

In [24]:
# Emails treated as NOT subscribed to chapter 1.
# Records that only exist for other chapters are flagged as unsubscribed from
# chapter 1 explicitly; otherwise they would carry the isunsub value of an
# unrelated chapter into the final output.
# NOTE(review): this follows the notebook's stated assumption that such emails
# are non-subscribers of chapter 1 - confirm against the assignment brief.
other_chapters_only = cons_subscription_notsubchap1.assign(chapter_id=1, isunsub=1)
# Explicit chapter 1 unsubscribes go first so that drop_duplicates (which keeps
# the first occurrence) retains the chapter 1 row for an email that also has
# records under other chapters.
emails_not_sub_chap1_final = pd.concat([cons_subscription_chap1_notsub, other_chapters_only], axis=0)
emails_not_sub_chap1_final = emails_not_sub_chap1_final.drop_duplicates(subset=['cons_email_id'])

The final dataframe (emails_not_sub_chap1_final) is a combination of:
- Email ids that have unsubscribed from chapter 1 (i.e. where isunsub=1 and chapter_id=1)
- Email ids that only have subscription records for chapters other than 1


In [25]:
# Primary emails with no record at all in the subscription data:
# they are treated as subscribed to chapter 1 by default.
email_in_primary_sub_chap1 = (
    subtract_df(cons_email_primary, cons_subscription_subset, on='cons_email_id')
    .reset_index()
    .drop(columns=['is_primary', 'email', 'cons_id'])
    .assign(chapter_id=1, isunsub=0)
)

- As per the given information, email ids that are not present in the constituent subscription status data are subscribed to chapter 1 by default

- So in the above cell we keep the email ids from the constituent email addresses that are not present in constituent_subscription_status

- Since we do not have information about these email ids in constituent_subscription_status, we consider them subscribed to chapter 1

In [26]:
# All emails considered subscribed to chapter 1: default subscribers plus explicit chapter 1 subscribers
emails_sub_chap1_final = pd.concat(
    [email_in_primary_sub_chap1, cons_subscription_chap1_sub],
    axis=0,
    ignore_index=True,
)

In the above cell we have combined the email ids that are subscribed to chapter 1 in constituent_subscription_status with the email ids that are not present in constituent_subscription_status but are present in constituent_email_addresses.

In [27]:
# Stack chapter 1 subscribers and non-subscribers into one table, renumbered from 0
peoples_final = pd.concat(
    [emails_sub_chap1_final, emails_not_sub_chap1_final],
    axis=0,
    ignore_index=True,
)

In the above cell we combined the email ids that are subscribed to chapter 1 with those that are not, resulting in the peoples_final dataframe.

In [28]:
# Inner join on cons_email_id: keeps only primary emails and attaches their isunsub flag
people_final_primary_email = (
    cons_email_primary
    .merge(peoples_final, how='inner', on='cons_email_id')
    .drop(columns=['index', 'is_primary', 'chapter_id'])
)

In the resulting dataframe we are only concerned with primary email ids. Hence we perform an inner join between cons_email_primary and peoples_final, which filters out the email ids that are not primary.

In [29]:
# Attach source and timestamps from the constituent information table, then drop the join keys
final_df = (
    people_final_primary_email
    .merge(cons_information_subset, how='inner', on='cons_id')
    .drop(columns=['cons_email_id', 'cons_id'])
)

In [30]:
# Rename to the output schema and fix the column order
output_names = {'source': 'code', 'isunsub': 'is_unsub', 'create_dt': 'created_dt', 'modified_dt': 'updated_dt'}
output_order = ['email', 'code', 'is_unsub', 'created_dt', 'updated_dt']
final_df = final_df.rename(columns=output_names)[output_order]

In [31]:
# Display the final people table (email, code, is_unsub, created_dt, updated_dt)
final_df

Unnamed: 0,email,code,is_unsub,created_dt,updated_dt
0,xmartinez@vincent.com,,0,"Tue, 1997-09-30 01:41:35","Thu, 1981-02-26 19:36:22"
1,hmiller@haynes.biz,google,0,"Thu, 2014-03-27 23:18:18","Mon, 2012-12-10 18:46:32"
2,aaron64@yahoo.com,,1,"Mon, 1992-06-01 06:07:45","Mon, 1986-07-28 03:41:12"
3,wyattvincent@hotmail.com,,1,"Sun, 1993-05-23 08:00:18","Sat, 1983-05-07 09:29:18"
4,tspencer@hotmail.com,twitter,1,"Fri, 1986-10-31 03:24:05","Sat, 1979-09-22 05:01:01"
...,...,...,...,...,...
605634,smallmelvin@mitchell.com,google,0,"Fri, 1993-01-01 11:27:41","Sat, 1978-11-25 23:47:50"
605635,gardnerchristian@hotmail.com,,0,"Wed, 1979-02-07 03:11:36","Thu, 1993-02-04 21:29:28"
605636,ginanguyen@munoz.com,google,0,"Wed, 2007-05-09 20:39:24","Mon, 1993-05-10 02:58:28"
605637,tatenicole@yahoo.com,organic,0,"Sun, 1985-10-13 06:19:47","Sun, 2019-02-17 20:19:53"


In [30]:
# Save the people table to the current working directory.
# The columns are listed explicitly so the export always has the same schema,
# even if this cell is re-run after later cells have added columns to final_df.
final_df[['email', 'code', 'is_unsub', 'created_dt', 'updated_dt']].to_csv('people.csv', header=True, index=False)

### Question 2

In [32]:
# Derive the acquisition date (calendar day) from the creation timestamp
created_timestamps = pd.to_datetime(final_df['created_dt'])
final_df['acquisition_date'] = created_timestamps.dt.date

In [33]:
# Number of constituents acquired on each calendar day
acquisition_facts = final_df.groupby('acquisition_date').size().reset_index(name='count')

In [34]:
# Display the per-day acquisition counts (acquisition_date, count)
acquisition_facts

Unnamed: 0,acquisition_date,count
0,1970-01-01,28
1,1970-01-02,34
2,1970-01-03,27
3,1970-01-04,36
4,1970-01-05,39
...,...,...
18440,2020-06-27,36
18441,2020-06-28,33
18442,2020-06-29,24
18443,2020-06-30,40


In [34]:
# Write the per-day acquisition counts to the current working directory (header row, no index column)
acquisition_facts.to_csv('acquisition_facts.csv', index=False)