In [1]:
# Import four CSV files containing user-history data for an application named "Yammer".
# First, we examine and clean the data.
# Then we recreate the user engagement analysis.
# Lastly, we conduct some additional analyses to narrow down the cause of the drop in engagement.

In [2]:
import numpy as np
import scipy.stats
import pandas as pd

In [3]:
import matplotlib
# pyplot is bound to `plt` rather than `pp`: the `pandas_profiling` import
# below also binds `pp`, which used to overwrite the pyplot alias and leave
# matplotlib.pyplot unreachable. `pp` still ends up referring to
# pandas_profiling, exactly as before.
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import plotly.express as px


import pandas_profiling as pp
from pandas_profiling import ProfileReport

from IPython import display
from ipywidgets import interact, widgets

In [4]:
#import the 4 files containing user and engagement data

In [5]:
# Read the four CSV extracts from the working directory into DataFrames.
# The tuple order on the left matches the file order on the right.
users, rollup, emails, events = (
    pd.read_csv(csv_name)
    for csv_name in ("users.csv", "rollup.csv", "emails.csv", "events.csv")
)

In [6]:
# take a quick look at the columns

In [7]:
# Plain-text preview of the first three rows of the users table.
print(users.iloc[:3])

   user_id           created_at  company_id language         activated_at  \
0      0.0  2013-01-01 20:59:39      5737.0  english  2013-01-01 21:01:07   
1      1.0  2013-01-01 13:07:46        28.0  english                  NaN   
2      2.0  2013-01-01 10:59:05        51.0  english                  NaN   

     state  
0   active  
1  pending  
2  pending  


In [8]:
# Plain-text preview of the first three rows of the rollup-period table.
print(rollup.iloc[:3])

   period_id              time_id            pst_start              pst_end  \
0        1.0  2013-01-01 00:00:00  2013-01-01 00:00:00  2013-01-02 00:00:00   
1        1.0  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-03 00:00:00   
2        1.0  2013-01-03 00:00:00  2013-01-03 00:00:00  2013-01-04 00:00:00   

             utc_start              utc_end  
0  2013-01-01 08:00:00  2013-01-02 08:00:00  
1  2013-01-02 08:00:00  2013-01-03 08:00:00  
2  2013-01-03 08:00:00  2013-01-04 08:00:00  


In [9]:
# Plain-text preview of the first three rows of the emails table.
print(emails.iloc[:3])

   user_id          occurred_at              action  user_type
0      0.0  2014-05-06 09:30:00  sent_weekly_digest        1.0
1      0.0  2014-05-13 09:30:00  sent_weekly_digest        1.0
2      0.0  2014-05-20 09:30:00  sent_weekly_digest        1.0


In [10]:
# Plain-text preview of the first three rows of the events table.
print(events.iloc[:3])

   user_id          occurred_at  event_type    event_name location  \
0  10522.0  2014-05-02 11:02:39  engagement         login    Japan   
1  10522.0  2014-05-02 11:02:53  engagement     home_page    Japan   
2  10522.0  2014-05-02 11:03:28  engagement  like_message    Japan   

                   device  user_type  
0  dell inspiron notebook        3.0  
1  dell inspiron notebook        3.0  
2  dell inspiron notebook        3.0  


In [11]:
# Double check the user_id is unique in users table

In [12]:
# Row count vs. number of distinct ids: the two numbers are equal when
# user_id is unique (and non-null) in the users table.
print(users['user_id'].size, users['user_id'].nunique())

19066 19066


In [13]:
# merge 3 of the files on the user_id

In [14]:
# Join every email row onto its user record. The default join is an inner
# join, so users without any email rows are not carried into info3.
info3 = users.merge(emails, on='user_id')

In [15]:
# Join the events table onto the users+emails frame (inner join on user_id).
# Columns present on both sides (occurred_at, user_type) receive the default
# `_x` (left: emails) / `_y` (right: events) suffixes.
# NOTE(review): user_id repeats on both sides, so each email row is paired
# with each event row of the same user; the result is much larger than either
# input. Only use it for distinct counts, never for row counts.
info3 = info3.merge(events, on='user_id')

In [16]:
# take a look at the combined table

In [17]:
# Plain-text preview of the first five rows of the merged frame.
print(info3.iloc[:5])

   user_id           created_at  company_id language         activated_at  \
0      4.0  2013-01-01 14:37:51      5110.0   indian  2013-01-01 14:39:05   
1      4.0  2013-01-01 14:37:51      5110.0   indian  2013-01-01 14:39:05   
2      4.0  2013-01-01 14:37:51      5110.0   indian  2013-01-01 14:39:05   
3      4.0  2013-01-01 14:37:51      5110.0   indian  2013-01-01 14:39:05   
4      4.0  2013-01-01 14:37:51      5110.0   indian  2013-01-01 14:39:05   

    state        occurred_at_x              action  user_type_x  \
0  active  2014-05-06 09:30:00  sent_weekly_digest          3.0   
1  active  2014-05-06 09:30:00  sent_weekly_digest          3.0   
2  active  2014-05-06 09:30:00  sent_weekly_digest          3.0   
3  active  2014-05-06 09:30:00  sent_weekly_digest          3.0   
4  active  2014-05-06 09:30:00  sent_weekly_digest          3.0   

         occurred_at_y  event_type           event_name location  \
0  2014-05-13 09:31:47  engagement                login    India  

In [18]:
# change the column names to keep track of which time is related to event or action

In [19]:
# Give the merged columns names that say which source they describe: the `_x`
# columns came from the emails table (actions), the `_y` columns from the
# events table. Entries that map a name to itself are no-ops, kept so the
# mapping lists every renamed-or-reviewed column in one place.
info3 = info3.rename(
    columns={
        "user_id": "user",
        "created_at": "userCreated",
        "language": "lang",
        "activated_at": "userActivation",
        "state": "state",
        "occurred_at_x": "actionTime",
        "action": "action",
        "user_type_x": "actionUserType",
        "occurred_at_y": "eventTime",
        "event_type": "eventType",
        "event_name": "eventName",
        "location": "loc",
        "device": "device",
        "user_type_y": "userTypeEvent",
    }
)



In [20]:
# look at the index & info for a clean merge and get an idea of the size

In [21]:
# Display the row index of the merged frame; its length is the number of rows
# produced by the two merges.
info3.index

Int64Index([      0,       1,       2,       3,       4,       5,       6,
                  7,       8,       9,
            ...
            6270657, 6270658, 6270659, 6270660, 6270661, 6270662, 6270663,
            6270664, 6270665, 6270666],
           dtype='int64', length=6270667)

In [22]:
# Print column names, dtypes and memory usage of the merged frame.
info3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6270667 entries, 0 to 6270666
Data columns (total 15 columns):
user              float64
userCreated       object
company_id        float64
lang              object
userActivation    object
state             object
actionTime        object
action            object
actionUserType    float64
eventTime         object
eventType         object
eventName         object
loc               object
device            object
userTypeEvent     float64
dtypes: float64(4), object(11)
memory usage: 765.5+ MB


In [23]:
# use the merged table to recreate the drop in user engagement
# this graph is the one that caught management's attention and is the subject of this investigation

In [25]:
# Weekly active users: number of distinct users with at least one engagement
# `login` event in each week.

# Exact comparison instead of `.str.match`, which is a regex *prefix* match and
# would also accept any event name that merely starts with "login".
is_login = (info3['eventType'] == 'engagement') & (info3['eventName'] == 'login')

# `.copy()` makes the filtered frame independent of the previous `info3`, so
# the column assignment below does not raise SettingWithCopyWarning.
info3 = info3.loc[is_login].copy()

# Replace the event timestamp with its week-of-year number, but only while the
# column still holds timestamps. If this cell (or another one) has already
# converted it, re-parsing the integers with pd.to_datetime would treat them as
# nanoseconds since 1970 and collapse every row into week 1.
if not pd.api.types.is_numeric_dtype(info3["eventTime"]):
    event_ts = pd.to_datetime(info3["eventTime"])
    if hasattr(event_ts.dt, "isocalendar"):
        # pandas >= 1.1: `.dt.week` is deprecated there and removed in 2.0.
        # NOTE(review): the int64 cast assumes no missing timestamps - confirm.
        info3["eventTime"] = event_ts.dt.isocalendar().week.astype("int64")
    else:
        # Older pandas without `.dt.isocalendar()`.
        info3["eventTime"] = event_ts.dt.week
# NOTE(review): a week number carries no year; this assumes all events fall
# within one calendar year - confirm against the data range.

#group by week and count unique id's, then flatten the result into columns
df2 = info3.groupby(['eventTime'])['user'].nunique().reset_index()

#sort by week
df3 = df2.sort_values(by=['eventTime'], ascending=False)


#display
# plotly express keys `labels` by column name, so the y-axis label has to be
# attached to 'user' (a 'y' key is silently ignored).
fig = px.line(df3, x='eventTime', y='user', labels={'user': '#users'})
fig.show()

AttributeError: 'Figure' object has no attribute 'close'

In [None]:
# Plain-text preview of the first row of the current info3 frame.
print(info3.iloc[:1])

   user          userCreated  company_id    lang       userActivation   state  \
0   4.0  2013-01-01 14:37:51      5110.0  indian  2013-01-01 14:39:05  active   

            actionTime              action  actionUserType  eventTime  \
0  2014-05-06 09:30:00  sent_weekly_digest             3.0         20   

    eventType eventName    loc           device  userTypeEvent  
0  engagement     login  India  lenovo thinkpad            3.0  


In [None]:
# Weekly active users per company: number of distinct users with at least one
# engagement `login` event, per week and company_id.
# (`px` is already imported in the imports cell at the top of the notebook.)

# Exact comparison instead of `.str.match`, which is a regex *prefix* match.
is_login = (info3['eventType'] == 'engagement') & (info3['eventName'] == 'login')

# `.copy()` keeps the assignment below from writing into a slice of the old
# frame (SettingWithCopyWarning).
info3 = info3.loc[is_login].copy()

# Only convert eventTime while it still holds timestamps. The weekly-engagement
# cell earlier in the notebook already replaces it with week numbers; feeding
# those integers to pd.to_datetime again would interpret them as nanoseconds
# since 1970 and put every row in week 1.
if not pd.api.types.is_numeric_dtype(info3["eventTime"]):
    event_ts = pd.to_datetime(info3["eventTime"])
    if hasattr(event_ts.dt, "isocalendar"):
        # pandas >= 1.1: `.dt.week` is deprecated there and removed in 2.0.
        # NOTE(review): the int64 cast assumes no missing timestamps - confirm.
        info3["eventTime"] = event_ts.dt.isocalendar().week.astype("int64")
    else:
        # Older pandas without `.dt.isocalendar()`.
        info3["eventTime"] = event_ts.dt.week


#count unique user id's for each company every week, flattened into columns
df2 = info3.groupby(['eventTime', 'company_id'])['user'].nunique().reset_index()

#sort by week
df = df2.sort_values(by=['eventTime'], ascending=False)

#display
# plotly express keys `labels` by column name, so the y-axis label has to be
# attached to 'user' (a 'y' key is silently ignored).
fig = px.line(df, x='eventTime', y='user', color = 'company_id',labels={'user': '#users'})
fig.show()