In [48]:
from math import floor

import wmfdata as wmf
from wmfdata.utils import pd_display_all

# We assess activity by looking at the number of Hive partitions from the most recent month.
# This month number is substituted into the `month` partition filter of the
# SHOW PARTITIONS statement; it is hard-coded, so update it before re-running.
# NOTE(review): only a month is filtered on, never a year — confirm that partitions
# from the same month of earlier years are not being counted as well.
MOST_RECENT_MO = 5

In [37]:
# Every table registered in the `event` database, before any filtering
all_event_tables = wmf.spark.run("SHOW TABLES IN event")

# Pandas query expression selecting the tables we want to assess
legacy_tables_filter = "".join([
  # The edit table is broken somehow, and inactive anyway
  "tableName != 'edit' &",
  # The remaining exclusions are tables that come from the Modern Event Platform
  "~tableName.str.contains('mediawiki_') &",
  "~tableName.str.contains('eventgate_') &",
  # NOTE(review): ('resource_change') is a plain string, not a one-element
  # tuple — confirm the membership test behaves as intended
  "tableName not in ('resource_change')",
])

eventlogging_tables = (
  all_event_tables
  .query(legacy_tables_filter)
  .reset_index(drop=True)
)

In [41]:
def get_partition_count(table_name):
  """Return the number of partitions of `event.<table_name>` for month MOST_RECENT_MO.

  Runs a SHOW PARTITIONS statement through `wmf.spark.run` and returns the
  length of whatever that call gives back.

  NOTE(review): the partition spec names only `month`; confirm whether a year
  should be specified as well.
  """
  statement = f"SHOW PARTITIONS event.{table_name} PARTITION(month='{MOST_RECENT_MO}')"
  return len(wmf.spark.run(statement))

# Count the partitions of every table (get_partition_count is called once per
# table name) and store the counts next to the table names.
partition_counts = eventlogging_tables["tableName"].apply(get_partition_count)
eventlogging_tables["last_mo_partitions"] = partition_counts

In [49]:
# A table is treated as active when it has at least half (rounded down) as
# many partitions as the table with the most partitions.
last_mo_counts = eventlogging_tables["last_mo_partitions"]
max_partitions = last_mo_counts.max()
active_threshold = floor(max_partitions / 2)

active_tables = eventlogging_tables[last_mo_counts >= active_threshold]

These should all be included in the list of active schemas.

In [50]:
# Show the name of every active table
pd_display_all(active_tables["tableName"])

0      advancedsearchrequest               
2      centralauth                         
3      centralnoticebannerhistory          
4      centralnoticeimpression             
5      centralnoticetiming                 
7      changeslistfiltergrouping           
9      changeslisthighlights               
13     contenttranslation                  
14     contenttranslationabusefilter       
15     contenttranslationcta               
16     contenttranslationerror             
17     contenttranslationsuggestion        
18     cpubenchmark                        
19     echointeraction                     
20     echomail                            
21     editattemptstep                     
22     editconflict                        
23     editoractivation                    
24     editorjourney                       
28     eventerror                          
30     externalguidance                    
33     firstinputtiming                    
34     flowreplies              

Now, manually assess any schemas that are listed as active but do not appear in the list above. If they're truly inactive, they should be removed from the list of active schemas, but keep in mind that they might instead represent rare events or have been implemented only recently.

The key question is whether the schema is used by the latest version of whatever codebase it applies to.

One good way to check this is to search for the schema name with the [MediaWiki code search tool](https://codesearch.wmflabs.org). However, note the search doesn't cover the mobile apps and some other codebases that aren't part of MediaWiki.

In [52]:
# Schemas listed as active that are not in the list of active tables shown above
extra_active_tables = [
  "guidedtourinternallinkactivation",
  "mobilewikiapplangselect",
  "mobilewikiappofflinelibrary",
  "mobilewikiapponboarding",
  "mobilewikiappwidgets",
  "translationrecommendationuirequests",
  "translationrecommendationuseraction",
  "wikimediablogvisit",
]

# Pull up their partition counts to support the manual assessment
is_extra_active = eventlogging_tables["tableName"].isin(extra_active_tables)
eventlogging_tables[is_extra_active]

Unnamed: 0,database,tableName,isTemporary,last_mo_partitions
42,event,guidedtourinternallinkactivation,False,78
86,event,mobilewikiapplangselect,False,66
95,event,mobilewikiappofflinelibrary,False,3
96,event,mobilewikiapponboarding,False,45
110,event,mobilewikiappwidgets,False,51
144,event,translationrecommendationuirequests,False,72
145,event,translationrecommendationuseraction,False,20
163,event,wikimediablogvisit,False,14
