# Relax Inc. Take Home Challenge

The goal is to look at take-home users and see who might be most likely to be a future adopted user.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
#Lets start with just loading the data and taking a look

user_data = pd.read_csv('./user_data/takehome_users.csv', encoding = 'latin-1')
user_engagement = pd.read_csv('./user_data/takehome_user_engagement.csv')

In [12]:
user_data.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,4/22/2014 3:53,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,11/15/2013 3:45,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,3/19/2013 23:14,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,5/21/2013 8:09,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,1/17/2013 10:14,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [13]:
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [18]:
# drop name and email because we don't need that information

user_data.drop(['name','email'], inplace = True, axis = 1)

KeyError: "['name' 'email'] not found in axis"

In [19]:
user_data

Unnamed: 0,object_id,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,4/22/2014 3:53,GUEST_INVITE,1.398139e+09,1,0,11,10803.0
1,2,11/15/2013 3:45,ORG_INVITE,1.396238e+09,0,0,1,316.0
2,3,3/19/2013 23:14,ORG_INVITE,1.363735e+09,0,0,94,1525.0
3,4,5/21/2013 8:09,GUEST_INVITE,1.369210e+09,0,0,1,5151.0
4,5,1/17/2013 10:14,GUEST_INVITE,1.358850e+09,0,0,193,5240.0
...,...,...,...,...,...,...,...,...
11995,11996,9/6/2013 6:14,ORG_INVITE,1.378448e+09,0,0,89,8263.0
11996,11997,1/10/2013 18:28,SIGNUP_GOOGLE_AUTH,1.358275e+09,0,0,200,
11997,11998,4/27/2014 12:45,GUEST_INVITE,1.398603e+09,1,1,83,8074.0
11998,11999,5/31/2012 11:55,PERSONAL_PROJECTS,1.338638e+09,0,0,6,


I think a good approach would be to seperate out the data into two groups. Ones that are our adopted user and one group that isn't. We'll use the "takehome_user_engagement" table to look at that.

In [21]:
# Group by the users to find out which have counts of greater than 3

user_visits = user_engagement.groupby(by = 'user_id').sum()
user_visits

Unnamed: 0_level_0,visited
user_id,Unnamed: 1_level_1
1,1
2,14
3,1
4,1
5,1
...,...
11996,1
11997,1
11998,1
11999,1


In [44]:
# break out into adopted
adopted = user_visits[user_visits['visited'] >= 3]
not_adopted = user_visits[user_visits['visited'] < 3]

In [47]:
not_adopted = not_adopted.reset_index()
not_adopted.head()

Unnamed: 0,user_id,visited
0,1,1
1,3,1
2,4,1
3,5,1
4,6,1


In [45]:
adopted = adopted.reset_index()
adopted.head()

Now that we have 2 groups we can filter our dataframe based on these 2 groups and assess their differences.

In [50]:
adopted_df = user_data[user_data['object_id'].isin(adopted['user_id'])]
adopted_df.head()

Unnamed: 0,object_id,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
1,2,11/15/2013 3:45,ORG_INVITE,1396238000.0,0,0,1,316.0
9,10,1/16/2013 22:08,ORG_INVITE,1401833000.0,1,1,318,4143.0
19,20,3/6/2014 11:46,SIGNUP,1401364000.0,0,0,58,
32,33,3/11/2014 6:29,GUEST_INVITE,1401518000.0,0,0,401,79.0
41,42,11/11/2012 19:05,SIGNUP,1401045000.0,1,0,235,


In [51]:
not_adopted_df = user_data[user_data['object_id'].isin(not_adopted['user_id'])]
not_adopted_df.head()

Unnamed: 0,object_id,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,4/22/2014 3:53,GUEST_INVITE,1398139000.0,1,0,11,10803.0
2,3,3/19/2013 23:14,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,5/21/2013 8:09,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,1/17/2013 10:14,GUEST_INVITE,1358850000.0,0,0,193,5240.0
5,6,12/17/2013 3:37,GUEST_INVITE,1387424000.0,0,0,197,11241.0
