Skip to content

Commit

Permalink
Merge pull request #2454 from mythmon/cohort-analysis-1131143
Browse files Browse the repository at this point in the history
[Bug 1131143] Add cohort analysis script.
  • Loading branch information
rlr committed Apr 7, 2015
2 parents 302ceeb + 6c19b90 commit ab0615b
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 2 deletions.
7 changes: 5 additions & 2 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ django-debug-toolbar==1.2.2
# sha256: Y6KG6ssDzV9kPX60WMbGK3DBEbRjT9jVnyfMrKS3yDo
django-sslserver==0.14

# sha256: x9txeBCraWX2bIzwOYqYydjfmC2jm0zX8WKRHriVlvo
docutils==0.12

# sha256: KF6L1zDAtv374jwy0pNr_7pAHyPKsTLocixovoDW8YI
flake8==2.2.5

Expand All @@ -33,5 +36,5 @@ q==2.4
# sha256: lJM7ZOL-CAfaBhLFdKAhwNrCjHvTxKI3I65aOeqPPQQ
Sphinx==1.2.3

# sha256: x9txeBCraWX2bIzwOYqYydjfmC2jm0zX8WKRHriVlvo
docutils==0.12
# sha256: kHGqy9l6mpFQlsGq8NxoSsJnKQTNh221kECF1trJgQ4
tabulate==0.7.5
84 changes: 84 additions & 0 deletions scripts/cohort_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Groups users into monthly cohorts and analyzes the drop-off rate for each group.
Run this script like `./manage.py runscript cohort_analysis`.
"""
from datetime import datetime, timedelta
from traceback import print_exc

from tabulate import tabulate

from kitsune.questions.models import Answer
from kitsune.wiki.models import Revision


def run():
    """Entry point for the script: delegate to ``run_``.

    Any exception raised by ``run_`` has its traceback printed and is
    then re-raised unchanged.
    """
    try:
        run_()
    except Exception:
        # Print the traceback explicitly before propagating the error
        # (presumably because the script runner does not show it --
        # TODO confirm).
        print_exc()
        raise


def run_():
    """Print a cohort table for each kind of contribution.

    Thirteen month boundaries are built, ending at the first day of the
    current month, which gives twelve consecutive ``(start, end)`` month
    ranges. For each report (non-en-US revisions, en-US revisions, and
    answers) the rows produced by ``report_for`` are printed with
    ``tabulate``, followed by a blank line.
    """
    now = datetime.now()
    # Walk backwards one month at a time: stepping one day before the first
    # of a month lands in the previous month, and ``replace(day=1)`` snaps
    # to that month's first day.
    boundaries = [datetime(now.year, now.month, 1)]
    for _ in range(12):
        first_day_of_previous_month = (boundaries[-1] - timedelta(days=1)).replace(day=1)
        boundaries.append(first_day_of_previous_month)
    boundaries.reverse()
    # Materialize the pairs: ``ranges`` is iterated once per report, read
    # again for the headers, and sliced inside ``report_for``. A bare
    # ``zip`` is a one-shot iterator on Python 3.
    ranges = list(zip(boundaries[:-1], boundaries[1:]))

    reports = [
        ('L10n', Revision.objects.exclude(document__locale='en-US')),
        ('KB', Revision.objects.filter(document__locale='en-US')),
        ('Questions', Answer.objects.all()),
    ]

    for title, queryset in reports:
        data = report_for(queryset, ranges)
        headers = [title] + [s.strftime('%b') for s, _ in ranges]
        print(tabulate(data, headers=headers))
        # ``print('')`` emits a blank line on both Python 2 and Python 3;
        # a bare ``print`` is a no-op expression on Python 3.
        print('')


def count_contributors_in_range(queryset, users, date_range):
    """Count how many members of ``users`` contributed during ``date_range``.

    ``date_range`` is a ``(start, end)`` pair. A contribution counts when
    its ``created`` value satisfies ``start <= created < end`` and its
    ``creator`` is in ``users``. Each creator is counted once, however
    many contributions they made.
    """
    range_start, range_end = date_range
    matching = queryset.filter(
        creator__in=users, created__gte=range_start, created__lt=range_end)

    active_creators = set()
    for contribution in matching:
        active_creators.add(contribution.creator)
    return len(active_creators)


def get_cohort(queryset, date_range):
    """Return the users whose first contribution falls inside ``date_range``.

    ``date_range`` is a ``(start, end)`` pair. Candidates are the creators
    of every contribution with ``start <= created < end``. A candidate is
    kept only when their earliest contribution in ``queryset`` (the one
    with the lowest ``id``) was also created inside that range.

    Returns a list, in no particular order, so that callers can take its
    length and iterate over it more than once.
    """
    start, end = date_range
    contributions_in_range = queryset.filter(created__gte=start, created__lt=end)
    potential_users = set(cont.creator for cont in contributions_in_range)

    def is_in_cohort(user):
        # Every candidate has at least one contribution (the one that made
        # them a candidate), so indexing ``[0]`` is safe here.
        first_contrib = queryset.filter(creator=user).order_by('id')[0]
        return start <= first_contrib.created < end

    # Build a real list rather than returning ``filter(...)``: on Python 3
    # that is a lazy one-shot iterator, which has no ``len()`` and is
    # exhausted after a single pass.
    return [user for user in potential_users if is_in_cohort(user)]


def report_for(queryset, ranges):
    """Yield one table row per cohort in ``ranges``.

    ``ranges`` is an iterable of ``(start, end)`` pairs. For the cohort at
    index ``i`` the yielded row is a list containing, in order:

    * the cohort's start formatted as ``'%b %Y'``,
    * ``i`` ``None`` placeholders for the ranges before the cohort began,
    * the size of the cohort (as returned by ``get_cohort``),
    * for each later range, how many cohort members contributed again
      (as returned by ``count_contributors_in_range``).
    """
    # ``ranges`` is both enumerated and sliced below, so it must be a real
    # sequence; a one-shot iterator (e.g. Python 3 ``zip``) would not work.
    ranges = list(ranges)
    for i, cohort_range in enumerate(ranges):
        # Materialize the cohort: its length is taken and it is reused for
        # every later range, so a lazy iterator would be exhausted early.
        cohort_users = list(get_cohort(queryset, cohort_range))
        start, _ = cohort_range

        row = [start.strftime('%b %Y')]
        # Fill months before the cohort started
        row.extend([None] * i)
        row.append(len(cohort_users))
        row.extend(
            count_contributors_in_range(queryset, cohort_users, return_range)
            for return_range in ranges[i + 1:])

        yield row

0 comments on commit ab0615b

Please sign in to comment.