In [None]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import math

pd.set_option('max_columns', 50)
mpl.rcParams['lines.linewidth'] = 2

%matplotlib inline
filename='out.tsv'
df = pd.read_csv(filename,sep='\t')
print('The file '+filename +' has '+ str(len(df)) +' rows.')
df.head()

In [None]:
# Drop exact duplicate rows, then keep only the rows whose 'cnt' value can be
# parsed as a number (anything pd.to_numeric cannot parse becomes NaN and is
# filtered out).
print('Removing duplicates...')
df = df.drop_duplicates()
cnt_is_numeric = pd.to_numeric(df['cnt'], errors='coerce').notna()
df = df[cnt_is_numeric]
print('It has now ' + str(len(df)) + ' rows.')

In [None]:
print('Reformatting the dates as in YYYY-mm...')


def _as_year_month(raw):
    """Parse '%b-%y' strings (abbreviated month name, two-digit year) and
    return them re-formatted as 'YYYY-mm' strings.

    Values that do not match the input format are coerced to NaT by
    pd.to_datetime and therefore come out as missing values.
    """
    parsed = pd.to_datetime(raw.astype(str), errors='coerce', format='%b-%y')
    return parsed.dt.strftime('%Y-%m')


# Normalise the cohort (activation) month.
df['activated'] = _as_year_month(df['activated'])
# 'current_month' is renamed to 'month' before it is normalised the same way.
df.rename(columns={'current_month': 'month'}, inplace=True)
df['month'] = _as_year_month(df['month'])
df.head()

In [None]:
# Cohort size: number of distinct user_id values per activation month.
# The result is a Series named 'Size', indexed by 'activated'.
grouped01 = df.groupby(['activated'])
SizePerCohort = grouped01['user_id'].nunique().rename('Size')
SizePerCohort.head()

In [None]:
#df.dtypes

In [None]:
# Convert 'cnt' to integers. The values are parsed with pd.to_numeric first
# because the earlier filter keeps anything to_numeric accepts (for example
# the string '12.0'), while a direct astype(int) on such strings raises
# ValueError. Fractional values, if any, are truncated toward zero by the cast;
# values that are not numeric at all still raise.
df['cnt'] = pd.to_numeric(df['cnt']).astype(int)

In [None]:
# One row per (activation cohort, calendar month) pair.
grouped = df.groupby(['activated', 'month'])

# Per group: number of distinct users and the total of 'cnt'.
# String aliases are used instead of np.sum / pd.Series.nunique: passing
# np.sum to agg() is deprecated in recent pandas (FutureWarning), and the
# aliases select pandas' own groupby reductions.
cohorts = (
    grouped
    .agg({'user_id': 'nunique', 'cnt': 'sum'})
    .rename(columns={'user_id': 'TotalUsers'})
)

# Average activity per active user in the group, rounded to a whole number.
cohorts['Ave Act/User'] = (cohorts['cnt'] / cohorts['TotalUsers']).round()

# The (activated, month) MultiIndex is kept; later cells unstack it.
cohorts.head(12)

In [None]:
# Preview the active-user counts pivoted so that index level 0 (the activation
# cohort) becomes the columns and the calendar month stays on the rows.
cohorts.TotalUsers.unstack(level=0).head(n=12)

In [None]:
# Retention = active users in each month divided by the size of the cohort.
# The pivot puts cohorts on the columns, so the division is aligned on the
# column labels against the SizePerCohort index.
users_by_month = cohorts['TotalUsers'].unstack(level=0)
user_retention = users_by_month.div(SizePerCohort, axis='columns')
user_retention.head(17)

In [None]:
# Average the retention curves over consecutive groups of cohort columns.
# The first two cohort columns form the leading group (left unnamed, so it gets
# the column label 0); each following group Q1..Q8 spans four cohort columns.
cohort_group_means = [user_retention.iloc[:, range(0, 2)].mean(axis=1)]
cohort_group_means += [
    user_retention.iloc[:, range(4 * q - 2, 4 * q + 2)].mean(axis=1).rename('Q' + str(q))
    for q in range(1, 9)
]
retention_ave = pd.concat(cohort_group_means, axis=1)

# Plot only the first seven group columns.
ax = retention_ave.iloc[:, 0:7].plot(figsize=(10, 5))
ax.set_title('Cohorts: User Retention [averaged over trimesterly cohorts]')
ax.set_ylim(0, 1.1)
ax.set_ylabel('% of Cohort Purchasing')
print(retention_ave)

In [None]:
# Retention averaged across the first 24 cohort columns, one value per row
# (month) of user_retention.
first_24_cohorts = user_retention.iloc[:, range(24)]
retention_ave3 = first_24_cohorts.mean(axis='columns')

ax = retention_ave3.plot(figsize=(10, 5))
ax.set_title('Cohorts: Average User Retention')
ax.set_ylim(0, 1.1)
ax.set_ylabel('% of Cohort Purchasing')

In [None]:
# Average activity per user, pivoted like the retention table: index level 0
# (the activation cohort) moves to the columns, months stay on the rows.
activity_per_user = cohorts['Ave Act/User']
user_activity = activity_per_user.unstack(level=0)
user_activity.head()

In [None]:
# Activity averaged across the first 24 cohort columns, one value per row
# (month) of user_activity.
first_24_cohorts = user_activity.iloc[:, range(24)]
activity_ave3 = first_24_cohorts.mean(axis='columns')

ax = activity_ave3.plot(figsize=(10, 5))
ax.set_title('Cohorts: Average Activity')
ax.set_ylim(0, 1000)
ax.set_ylabel('Average activities')

In [None]:
# Average the activity curves over consecutive groups of cohort columns.
# The first two cohort columns form the leading group (left unnamed, so it gets
# the column label 0); each following group Q1..Q8 spans four cohort columns.
cohort_group_means = [user_activity.iloc[:, range(0, 2)].mean(axis=1)]
cohort_group_means += [
    user_activity.iloc[:, range(4 * q - 2, 4 * q + 2)].mean(axis=1).rename('Q' + str(q))
    for q in range(1, 9)
]
activity_ave = pd.concat(cohort_group_means, axis=1)

# Plot only the first seven group columns; the y-axis range is left automatic.
ax = activity_ave.iloc[:, 0:7].plot(figsize=(10, 5))
ax.set_title('Cohorts: Activity averaged over trimesterly cohorts')
ax.set_ylabel('Average activities')