#### The problem
- To identify factors that predict future user adoption based on the given datasets.
#### The approach
1. Data exploration: understand, analyze, and clean the datasets.
2. Feature engineering: extracting relevant features and creating derived features.
3. Defining the target variable
4. Model selection
5. Training and evaluating models
6. Feature importance
#### Findings
- Interpretation and insights

In [1]:
# importing necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the two source tables.
# df1: users table, read with latin1 encoding (used for data exploration).
df1 = pd.read_csv(
    'takehome_users.csv',
    encoding='latin1',
)
# df2: engagement table (used to extract the target variable).
df2 = pd.read_csv('takehome_user_engagement.csv')

In [3]:
# Preview the first rows of the users table.
# NOTE(review): the rendered output shows the name and email columns; consider
# clearing or redacting it before sharing the notebook.
df1.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
# Inspect column dtypes and non-null counts of the users table.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [5]:
# Convert both time columns of the users table to datetime64:
#   - creation_time is parsed from "YYYY-MM-DD HH:MM:SS" text,
#   - last_session_creation_time is interpreted as seconds since the Unix epoch.
df1 = df1.assign(
    creation_time=pd.to_datetime(df1["creation_time"], format="%Y-%m-%d %H:%M:%S"),
    last_session_creation_time=pd.to_datetime(df1["last_session_creation_time"], unit='s'),
)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   12000 non-null  int64         
 1   creation_time               12000 non-null  datetime64[ns]
 2   name                        12000 non-null  object        
 3   email                       12000 non-null  object        
 4   creation_source             12000 non-null  object        
 5   last_session_creation_time  8823 non-null   datetime64[ns]
 6   opted_in_to_mailing_list    12000 non-null  int64         
 7   enabled_for_marketing_drip  12000 non-null  int64         
 8   org_id                      12000 non-null  int64         
 9   invited_by_user_id          6417 non-null   float64       
dtypes: datetime64[ns](2), float64(1), int64(4), object(3)
memory usage: 937.6+ KB


In [6]:
# Inspect column dtypes and non-null counts of the engagement table.
# NOTE(review): time_stamp is reported as object dtype here; it needs to be
# datetime before any timestamp arithmetic is done on it.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [7]:
# Fill the two columns that contain nulls.
# Users with no recorded last session get their account creation time instead.
df1['last_session_creation_time'] = df1['last_session_creation_time'].fillna(df1['creation_time'])
# Missing inviter ids become 0 (presumably "not invited by anyone" — confirm that
# 0 is never a real user id). Assign the result back explicitly: calling
# fillna(..., inplace=True) on a column pulled out of the frame acts on an
# intermediate object and does not reliably update df1 (it warns on recent pandas
# and is a no-op under Copy-on-Write).
df1['invited_by_user_id'] = df1['invited_by_user_id'].fillna(0)

# Verify that no nulls remain.
df1.isna().sum()

object_id                     0
creation_time                 0
name                          0
email                         0
creation_source               0
last_session_creation_time    0
opted_in_to_mailing_list      0
enabled_for_marketing_drip    0
org_id                        0
invited_by_user_id            0
dtype: int64

Defining the target variable

In [8]:
# Sum the 'visited' flag per 'user_id' to get one total per user.
df2_grouped = df2.groupby('user_id', as_index=False)['visited'].sum()
df2_grouped.head()


Unnamed: 0,user_id,visited
0,1,1
1,2,14
2,3,1
3,4,1
4,5,1


In [9]:
# Keep only users whose summed 'visited' value is at least 3.
df2_grouped = df2_grouped.loc[df2_grouped['visited'].ge(3)]

In [10]:
# Collect the ids of the users that passed the visit-count filter.
active_users = df2_grouped['user_id'].tolist()

In [11]:
# Restrict the engagement rows to the users listed in active_users.
df2_cleaned = df2.loc[df2['user_id'].isin(active_users)]

In [12]:
# Preview the engagement rows kept after filtering to active_users.
df2_cleaned.head()

Unnamed: 0,time_stamp,user_id,visited
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1


In [13]:
# Order rows by user, then by time_stamp within each user, so that row offsets
# inside a user's group follow the time_stamp ordering.
df2_cleaned = df2_cleaned.sort_values(
    ['user_id', 'time_stamp'],
)

In [14]:
# The 'visited' column is not used by the steps below; remove it.
df2_cleaned = df2_cleaned.drop(columns=['visited'])

In [15]:
# For every row, fetch the same user's time_stamp from two rows earlier
# (empty for a user's first two rows).
df2_cleaned['shifted'] = (
    df2_cleaned
    .groupby('user_id')['time_stamp']
    .shift(periods=2)
)

In [19]:
# Check the new column: 'shifted' is empty for a user's first two rows and
# otherwise holds that user's time_stamp from two rows earlier.
df2_cleaned.head()

Unnamed: 0,time_stamp,user_id,shifted
1,2013-11-15 03:45:04,2,
2,2013-11-29 03:45:04,2,
3,2013-12-09 03:45:04,2,2013-11-15 03:45:04
4,2013-12-25 03:45:04,2,2013-11-29 03:45:04
5,2013-12-31 03:45:04,2,2013-12-09 03:45:04


In [None]:
# Whole days elapsed between each row's time_stamp and the same user's
# time_stamp two rows earlier.
# Both columns are parsed to datetime first: the engagement CSV is loaded with
# 'time_stamp' as plain strings, and subtracting two string columns raises a
# TypeError on a fresh kernel. pd.to_datetime leaves already-converted columns
# unchanged, so this cell is safe to re-run.
df2_cleaned['time_stamp'] = pd.to_datetime(df2_cleaned['time_stamp'])
df2_cleaned['shifted'] = pd.to_datetime(df2_cleaned['shifted'])
df2_cleaned['span'] = (df2_cleaned['time_stamp'] - df2_cleaned['shifted']).dt.days

In [None]:
# Discard rows that contain any missing value (each user's first two rows,
# which have no 'shifted' timestamp and therefore no 'span').
df2_cleaned = df2_cleaned.dropna()
df2_cleaned.head()

Unnamed: 0,time_stamp,user_id,shifted,span
3,2013-12-09 03:45:04,2,2013-11-15 03:45:04,24.0
4,2013-12-25 03:45:04,2,2013-11-29 03:45:04,26.0
5,2013-12-31 03:45:04,2,2013-12-09 03:45:04,22.0
6,2014-01-08 03:45:04,2,2013-12-25 03:45:04,14.0
7,2014-02-03 03:45:04,2,2013-12-31 03:45:04,34.0


In [None]:
# Keep rows whose span is at most 7 days, then retain only the first such row
# per user, giving one row for each user that qualifies.
# NOTE(review): span <= 7 admits visits a full 7 days apart (e.g. 01-30 and
# 02-06); confirm this matches the intended window definition.
df2_final = (
    df2_cleaned
    .loc[df2_cleaned['span'].le(7)]
    .drop_duplicates(subset='user_id', keep='first')
)

In [None]:
# Preview the qualifying rows (one per user).
df2_final.head()

Unnamed: 0,time_stamp,user_id,shifted,span
9,2014-02-09 03:45:04,2,2014-02-03 03:45:04,6.0
24,2013-02-06 22:08:03,10,2013-01-30 22:08:03,7.0
312,2014-03-13 11:46:38,20,2014-03-11 11:46:38,2.0
331,2014-03-23 06:29:09,33,2014-03-17 06:29:09,6.0
353,2012-12-25 19:05:07,42,2012-12-18 19:05:07,7.0
