Fetches `recommendation` events from Mixpanel, aggregates them per feed, and writes the resulting feed items to `../raw_data/mixpanel.feeds.jsonl`.

In [3]:
import getpass, requests

# Mixpanel project and service-account settings used by the export request.
project_id = "2705439"
account_name = "dec24.04fb65.mp-service-account"
# The secret is prompted for interactively so it is never stored in the notebook.
account_password = getpass.getpass()


In [4]:
# Get the data from the API
import datetime, json

# The export window ends today, formatted as YYYY-MM-DD.
to_date = datetime.datetime.now().strftime('%Y-%m-%d')
# Stays None unless the export request succeeds.
json_lines = None

response = requests.get(
    'https://data.mixpanel.com/api/2.0/export/',
    auth=(account_name, account_password),
    params={
        'project_id': project_id,
        'from_date': '2020-01-01',
        'to_date': to_date,
        # The event filter is sent as a JSON-encoded array of event names.
        'event': '["recommendation"]',
        'format': 'json'
    }
)
# Always show the status code; on failure also show the response body.
print(response.status_code)
if response.status_code == 200:
    json_lines = response.text.split('\n')
    # Keep a raw copy of the export on disk. The encoding is explicit because
    # event properties contain non-ASCII text (e.g. city names), which the
    # platform default encoding may not be able to represent.
    # NOTE(review): this path is relative to './raw_data' while the feeds file
    # is later written under '../raw_data' - confirm which directory is intended.
    with open('./raw_data/mixpanel_raw_events.jsonl', 'w', encoding='utf-8') as f:
        f.write(response.text)
else:
    print(response.text)




200


In [5]:
# Decode the export body: one JSON document per non-empty line.
events = []
for raw_line in response.text.split('\n'):
    if raw_line:
        events.append(json.loads(raw_line))
print('Fetched', len(events), 'events from mixpanel')

Fetched 23760 events from mixpanel


In [6]:
from feed_types import Feed, assign_proper_id
from typing import TypedDict, Optional, List, Set
# Print the first fetched event as a reference for the property names used below.
# Raises IndexError if no events were fetched.
# NOTE(review): the printed event includes device identifiers and location
# properties; consider clearing this output before sharing the notebook.
print(events[0])

# Event properties retained by filter_event; every other property is dropped.
fields_we_care_about = {
    'countInBatch',
    'feedURL',
    'source',
    'subreddit',
    'title',
    'youtubeId',
    '$city',
    '$country',
    '$region',
}

# Sample:
# {'event': 'recommendation', 'properties': {'time': 1711927710, 'distinct_id': 'C1AE1F64-77F2-4394-8323-E2207C0563AD', '$app_build_number': '21', '$app_version_string': '5', '$carrier': '--', '$city': 'Brasília', '$device_id': 'C1AE1F64-77F2-4394-8323-E2207C0563AD', '$had_persisted_distinct_id': True, '$insert_id': '002e372cf9d0981e', '$lib_version': '3.2.3', '$manufacturer': 'Apple', '$model': 'iPhone14,3', '$mp_api_endpoint': 'api.mixpanel.com', '$mp_api_timestamp_ms': 1711942149621, '$os': 'iOS', '$os_version': '17.4.1', '$radio': 'LTE', '$region': 'Federal District', '$screen_height': 926, '$screen_width': 428, '$wifi': True, 'countInBatch': 4, 'curRoute': 'shareFavorites', 'environment': 'production', 'feedURL': 'https://9to5mac.com/feed/', 'fromSurvey': True, 'mp_country_code': 'BR', 'mp_lib': 'swift', 'mp_processing_time_ms': 1711942149660, 'prevRoute': 'homeFeed', 'source': 'feed', 'subreddit': '', 'title': '9to5Mac', 'youtubeId': ''}}
def filter_event(event):
    """Return the entries of ``event['properties']`` whose keys are in ``fields_we_care_about``.

    The original property order is preserved. Raises KeyError when the event
    has no 'properties' key.
    """
    kept = {}
    for name, value in event['properties'].items():
        if name in fields_we_care_about:
            kept[name] = value
    return kept

filtered_events = list(map(filter_event, events))

# then, combine multiple cells with the same values for everything except score
from collections import defaultdict

# The properties that together identify one feed; records agreeing on all of
# them are merged into a single entry.
FEED_KEY_FIELDS = ('feedURL', 'source', 'subreddit', 'title', 'youtubeId')

# Each event contributes 1 / countInBatch to the feed it recommends; the
# contributions are collected per feed identity.
aggregated_dict = defaultdict(list)
for props in filtered_events:
    share = 1.0 / props['countInBatch']
    identity = tuple(props[field] for field in FEED_KEY_FIELDS)
    aggregated_dict[identity].append(share)

# then, sum the scores: one record per distinct feed identity
aggregated = [
    {'score': sum(scores), **dict(zip(FEED_KEY_FIELDS, identity))}
    for identity, scores in aggregated_dict.items()
]

def dict_to_feed(d) -> "Optional[Feed]":
    """Convert one aggregated record into a feed entry.

    Args:
        d: Mapping with 'title' and 'score' keys plus the identifying keys
            'feedURL', 'subreddit', 'youtubeId' and 'source'. An empty string
            is treated the same as a missing key. The mapping is not modified.

    Returns:
        A dict with 'title', 'popularity_score', 'sources' and 'kind', plus
        exactly one of 'feed_url', 'subreddit' or 'channel_id', after it has
        been passed to ``assign_proper_id``. Returns None (after printing a
        message) when none of the three identifiers is set.

    Raises:
        KeyError: if 'title' or 'score' is missing from ``d``.
    """
    feed = {
        'title': d['title'],
        'popularity_score': d['score'],
        'sources': ['crowdsourced_popular'],
    }
    # Work on a filtered copy instead of deleting keys from the caller's dict.
    present = {key: value for key, value in d.items() if value != ''}
    # Precedence when several identifiers are set: feed URL, subreddit, YouTube.
    if 'feedURL' in present:
        feed['feed_url'] = present['feedURL']
        feed['kind'] = 'feed'
    elif 'subreddit' in present:
        feed['subreddit'] = present['subreddit']
        feed['kind'] = 'reddit'
    elif 'youtubeId' in present:
        feed['channel_id'] = present['youtubeId']
        feed['kind'] = 'youtube'
    else:
        # 'source' may itself be empty or absent, so read it with a default
        # rather than indexing.
        print('unknown feed type {}'.format(d.get('source', '')))
        return None
    # Its return value is ignored, so presumably this sets the entry's id in
    # place - confirm in feed_types.
    assign_proper_id(feed)
    return feed
    

# Convert every aggregated record, dropping the ones dict_to_feed rejected.
aggregated = [feed for feed in map(dict_to_feed, aggregated) if feed is not None]
# Order by popularity, highest first.
aggregated = sorted(aggregated, key=lambda feed: feed['popularity_score'], reverse=True)

# keep all with score >= 2
high_scored = [feed for feed in aggregated if feed['popularity_score'] >= 2.0]
print("Of", len(aggregated), 'feeds,', len(high_scored), 'had score >= 2.0 and will be kept')

# Write the kept feeds as JSON Lines: one JSON object per line.
path = '../raw_data/mixpanel.feeds.jsonl'
with open(path, 'w') as f:
    f.writelines(json.dumps(feed) + '\n' for feed in high_scored)
print('Wrote', len(high_scored), 'feeds to', path)


{'event': 'recommendation', 'properties': {'time': 1739885086, 'distinct_id': 'AB9F9B04-E991-4B26-81D5-48599D454751', '$app_build_number': '23', '$app_version_string': '16', '$carrier': '--', '$city': 'Denver', '$device_id': 'AB9F9B04-E991-4B26-81D5-48599D454751', '$had_persisted_distinct_id': True, '$insert_id': '063bb64e334199b2', '$lib_version': '3.2.3', '$manufacturer': 'Apple', '$model': 'iPhone16,1', '$mp_api_endpoint': 'api.mixpanel.com', '$mp_api_timestamp_ms': 1739903145958, '$os': 'iOS', '$os_version': '18.3.1', '$radio': 'NRNSA', '$region': 'Colorado', '$screen_height': 852, '$screen_width': 393, '$wifi': True, 'countInBatch': 2, 'curRoute': 'shareFavorites', 'environment': 'production', 'feedURL': 'http://feeds.feedburner.com/tommacwright/odWX', 'fromSurvey': True, 'mp_country_code': 'US', 'mp_lib': 'swift', 'mp_processing_time_ms': 1739903146014, 'prevRoute': 'homeFeed', 'source': 'feed', 'subreddit': '', 'title': 'Tom MacWright', 'youtubeId': ''}}
unknown feed type weathe