Criteria for inclusion:
1. Must be from a music company.
2. Must have released at least 1 original song.
3. Cannot be a sub-unit, project group, or one-off group created for promotional or special stage purposes (e.g. no year-end collab stages or CF song collabs).
4. Must have officially debuted. Pre-debut promotions are not considered an official debut.
5. Must be a currently active group. Groups that are technically not disbanded, but have not had any official full-group activities or social media postings within the past 2 years, are not considered active. One-off reunion comebacks are not included.
    - If the activity status of the group is unknown or ambiguous, then they are still included in the data.
6. Spotify data is available for this artist.

Example groups that are included:
- XG 
- MAVE: 

Example groups that are _not_ included:
- LOONA (disbanded)
- WJMK (one-off promotional group)
- CLC (currently disbanded/inactive as a group)
- Girls' Generation (currently disbanded/inactive as a group, reunion comeback not included)
- WJSN CHOCOME (sub-unit)
- AZER (one-off group -- created for a university)


In [2]:
import re

In [19]:
import pickle

In [3]:
# Read the raw kprofiles girl-group listing: one entry per line of the file.
path = 'kprofiles_girl_groups_raw.txt'
all_girl_groups = []
# Open the file in a context manager so the handle is closed as soon as
# reading finishes (or fails) instead of staying open for the whole session.
# NOTE(review): entries contain non-ASCII characters; the file is decoded
# with the platform default encoding -- confirm whether an explicit
# encoding should be passed to open().
with open(path, "r") as f:
    for x in f:
        x = x.replace('\n', '')
        # Skip blank lines and lines holding a single character.
        if len(x) > 1:
            all_girl_groups.append(x)

In [4]:
# Sanity check: how many entries were read from the raw file?
# (316 at the time this notebook was run)
len(all_girl_groups)

316

In [5]:
# Drop every entry whose (lower-cased) text mentions one of the non-K-pop or
# specialty group descriptors listed in the pattern below.
remove_regexp = re.compile('(kids? group)|rock band|dance (group|crew)|trot (girl )?group|live idol|collab group|project (girl )?group')
idol_girl_groups = [
    entry
    for entry in all_girl_groups
    if remove_regexp.search(entry.lower()) is None
]


In [6]:
# Collect every raw entry whose (lower-cased) text contains 'pre-debut'.
predebut_regexp = re.compile('pre-debut')
predebut_girl_groups = [
    entry
    for entry in all_girl_groups
    if predebut_regexp.search(entry.lower()) is not None
]

In [7]:
# The pre-debut markers in the source list are outdated, so these entries are
# hand-picked to be kept even though they are still tagged as pre-debut.
# The strings must match the raw entries exactly, including spacing.
groups_to_keep = [
    'ADYA (formerly known as Starting Girls) *pre-debut',
    'BBGIRLS  * pre-debut',
    'Bunny.T * pre-debut',
    'E:LFIN  * pre-debut',
    'KISS OF LIFE *pre-debut',
]

In [8]:
# Remove pre-debut entries, except the ones explicitly whitelisted in
# groups_to_keep.
idol_girl_groups = [
    entry
    for entry in idol_girl_groups
    if not (entry in predebut_girl_groups and entry not in groups_to_keep)
]

In [9]:
# Split the remaining entries into sub-units (text contains 'sub-unit',
# case-insensitively) and main groups (everything else).
subunit_regexp = re.compile('sub-unit')
subunit_girl_groups = [
    entry
    for entry in idol_girl_groups
    if subunit_regexp.search(entry.lower()) is not None
]
main_girl_groups = [
    entry
    for entry in idol_girl_groups
    if subunit_regexp.search(entry.lower()) is None
]

In [10]:
# Trim leading/trailing whitespace from every main-group entry.
idol_girl_groups = list(map(str.strip, main_girl_groups))

In [11]:
# Hand-curated exclusion list: every entry was reviewed manually and found not
# to meet the inclusion criteria. The trailing comment gives the reason.
# Strings must match the (whitespace-stripped) raw entries exactly.
groups_to_remove = [
    "4CARAT",  # kids group
    "4X",  # disbanded/inactive
    "9MUSES",  # disbanded/inactive
    "ACID EYES (tripleS)",  # doesn't have individual spotify page
    "Angel Rus",  # disbanded/inactive
    "AOA",  # disbanded/inactive
    "AR3NA (Thai-Korean group)",  # not from a K-pop company, do not promote in Korea
    "A-plus (2023 Group)",  # university group
    "Asia GirlS",  # university group
    "AZER (group formed by Howon University)",  # university group
    "B-OURS",  # pre-debut
    "BB Ahn",  # disbanded/inactive
    "Berry Chu",  # disbanded/inactive
    "BERRYZ",  # pre-debut
    "Brown Eyed Girls",  # disbanded/inactive
    "Celeb Five",  # disbanded/inactive
    "Choco2",  # pre-debut
    "CLC *indefinite hiatus (?)",  # indefinite hiatus per the source entry
    "Crescendo",  # disbanded/inactive
    "CSVC",  # project group
    "Dal★Shabet *currently on hiatus",  # disbanded/inactive
    "DESTINY",  # disbanded/inactive
    "DIA",  # disbanded/inactive
    "Eternity (A.I. girl group)",  # no promotions, no official training
    "Fanatics",  # disbanded/inactive
    "FERRY BLUE",  # university group
    "fishingirls",  # rock band
    "Fly With Me",  # dance group
    "Gavy NJ",  # ballad group
    "Girl’s Day",  # disbanded/inactive
    "Girls’ Generation",  # disbanded/inactive
    "Girls Next Door",  # project group
    "Hey Girls",  # disbanded/inactive
    "HeyMiss",  # disbanded/inactive
    "HONEY POPCORN",  # disbanded/inactive
    "ikling",  # no promotions, not from a K-pop company
    "ISEGYE IDOL",  # virtual group
    "Laboum",  # disbanded/inactive
    "Ladies’ Code",  # disbanded/inactive
    "LA FLOR",  # university group
    "Like Me",  # disbanded/inactive
    "Live High",  # disbanded/inactive
    "LOONA",  # disbanded/inactive
    "LOVElution",  # doesn't have individual spotify page
    "Lusty",  # disbanded/inactive
    "MAJORS",  # disbanded/inactive
    "Maywish",  # disbanded/inactive
    "M.Diva (Opera vocal group)",  # non-idol group
    "MuuTive (formerly known as PBT or GKM NEO Trainees)",  # disbanded/inactive
    "My Darling",  # disbanded/inactive
    "NEKIRU",  # disbanded/inactive
    "NextU (Global K-pop girl group)",  # pre-debut
    "Oh!GG",  # disbanded/inactive
    "ONEST",  # university group
    "ON TOP",  # disbanded/inactive
    "Pastel Girls",  # non-idol group
    "Pocket Girls",  # non-kpop company, no promotions
    "PRECIOUS",  # disbanded/inactive
    "Pride Band",  # non-idol group
    "PRISMA",  # disbanded/inactive
    "Pritz",  # disbanded/inactive
    "PSEUDO SIGMA",  # have not released any music
    "Pungdeng-E",  # disbanded/inactive
    "Purple Beck",  # disbanded/inactive
    "PURPLE RING",  # disbanded
    "Q.O.S",  # disbanded/inactive
    "RAMISU",  # disbanded/inactive
    "Refiners",  # disbanded/inactive
    "Rose Finger",  # disbanded/inactive
    "RuViChe",  # do not have original music
    "S.I.S",  # disbanded/inactive
    "SHADOW",  # project group
    "Silhouette",  # project group
    "SOYOU",  # non-idol group
    "SRZN",  # not from a k-pop company
    "Sunny Hill",  # disbanded/inactive
    "Ssambak Girls (Formed from the Season 2 Project from Mr. Yoon)",  # project group
    "T-Ara",  # disbanded/inactive
    "The Grace",  # disbanded/inactive
    "Tier 4",  # not from a k-pop company
    "TRACER",  # pre-debut
    "UiU",  # university group
    "UPTOYOU Final Lineup",  # disbanded/inactive
    "Vanilla",  # project group
    "VENUS",  # trot group
    "Violet",  # disbanded/inactive
    "Walking After U",  # non-idol group
    "XoX",  # not from a k-pop company, no k-pop training
    "WM Entertainment New Girl Group",  # pre-debut
    "Rose Queen",  # does not have original music
    "BLE",  # kids group
    "Blossom",  # doesn't have an individual spotify page
    "Bright Day",  # disbanded/inactive
    "Cherish",  # project group
    "EXID",  # inactive
    "Girls’ Alert",  # disbanded/inactive
    "Half Moon",  # doesn't have an individual spotify page
    "ICE CREAM",  # project group
    "K/DA",  # disbanded/inactive
    "MZ SooChaeHak",  # does not promote, does not have K-pop training
    "Bluewish",  # no promotions, no k-pop training
]

In [12]:
# Keep only the groups that are not on the hand-curated exclusion list.
eligible_girl_groups = list(
    filter(lambda entry: entry not in groups_to_remove, idol_girl_groups)
)

In [13]:
# Sanity check: how many groups remain after all filters?
# (91 at the time this notebook was run)
len(eligible_girl_groups)

91

In [14]:
# Clean the final list of group names.

## Remove extra text: keep only what precedes the first ' (' so that
## parenthetical descriptions such as "(formerly known as ...)" are dropped.
eligible_girl_groups = [group_name.split(' (', 1)[0] for group_name in eligible_girl_groups]

## Remove pre-debut text: strip a trailing "<spaces>*[ ]pre-debut" marker.
# The pattern is a raw string because '\*' in a normal string literal is an
# invalid escape sequence, which newer Python versions warn about. The
# compiled pattern is unchanged.
predebut_regexp = re.compile(r'(.*?)(( )+\*( )?pre-debut)$')


def _strip_predebut_suffix(group_name):
    """Return group_name without its trailing pre-debut marker, if it has one."""
    # Search once and reuse the match, rather than searching twice per name.
    match = predebut_regexp.search(group_name)
    return match.group(1) if match else group_name


eligible_girl_groups = [_strip_predebut_suffix(group_name) for group_name in eligible_girl_groups]

In [17]:
# Swap the 'Cosmic Girls' entry for 'WJSN': the old entry is deleted
# (ValueError if it is missing) and the new name is added at the end.
del eligible_girl_groups[eligible_girl_groups.index('Cosmic Girls')]
eligible_girl_groups += ['WJSN']

In [18]:
# Display the final cleaned list of eligible group names.
eligible_girl_groups

['3YE',
 'ADYA',
 'aespa',
 'ALICE',
 'Apink',
 'ARTBEAT',
 'ARTMS',
 'Asome.D',
 'BADKIZ',
 'BBGIRLS',
 'Beauty Box',
 'Billlie',
 'BLACKPINK',
 'Black Swan',
 'Bunny.T',
 'Busters',
 'Cherry Bullet',
 'Chic & Idle',
 'Chic Angel',
 'CIGNATURE',
 'CLASS:y',
 'CRAXY',
 'CSR',
 'DIAWINGS',
 'Dreamcatcher',
 'Dreamnote',
 'E:LFIN',
 'Episode',
 'EVERGLOW',
 'Favorite',
 'FIFTY FIFTY',
 'FLORIA',
 'fromis_9',
 '(G)I-DLE',
 'GIRL CRUSH',
 'Girls’ World',
 'Girls On Top',
 'H1-KEY',
 'HANA',
 'Hashtag',
 'Hi-L',
 'ICHILLIN’',
 'ICU',
 'ILY:1',
 'IRRIS',
 'ITZY',
 'IVE',
 'Kep1er',
 'KISS OF LIFE',
 'LAPILLUS',
 'LAYSHA',
 'LE SSERAFIM',
 'LIGHTSUM',
 'LIMELIGHT',
 'Maka Maka',
 'MAMAMOO',
 'MAVE:',
 'mimiirose',
 'Nature',
 'NEWJEANS',
 'NMIXX',
 'OH MY GIRL',
 'PEACH A!',
 'Pink Fantasy',
 'PIXY',
 'Posh Girls',
 'PRIMROSE',
 'PRITTI-G',
 'Purple Kiss',
 'Q6IX',
 'Queenz Eye',
 'Red Mint',
 'Red Velvet',
 'Rocket Punch',
 'Rocking Doll',
 'Rumble.G',
 'Saturday',
 'Secret Number',
 'SKYLE'

In [22]:
# Persist the final list to disk as a pickle file named 'eligible_girl_groups'
# (binary mode, no file extension).
with open('eligible_girl_groups', 'wb') as pickle_file:
    pickle.dump(eligible_girl_groups, pickle_file)