In [1]:
import pandas as pd

In [2]:
# Load the title-by-studio table from the project's CSV data directory.
df = pd.read_csv(filepath_or_buffer="../data/csv/title_studios.csv")

In [3]:
# Preview the first five rows to check the layout of the loaded table.
df.head()

Unnamed: 0,title,wit studio,madhouse,bones,ufotable,a-1 pictures,studio pierrot,mappa,cloverworks,comix wave,...,ekakiya,earth design works,enjin productions,sek studios,super brain,studio kingyoiro,three-d,boyan pictures,composition inc.,triopen studio
0,attack on titan,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,death note,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,my hero academia,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,demon slayer kimetsu no yaiba,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,hunter x hunter 2011,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Size of the loaded table as (rows, columns).
df.shape

(30586, 1386)

# Drop records that don't have any studio

In [5]:
# Per-row number of flagged studios. `numeric_only=True` keeps the text 'title'
# column out of the sum explicitly: older pandas skipped it silently, but
# pandas >= 2.0 raises a TypeError when a row mixes strings and integers.
df['studio_count'] = df.sum(axis=1, numeric_only=True)

In [6]:
# Number of rows whose studio_count is greater than one.
multi_studio_mask = df['studio_count'] > 1
len(df.loc[multi_studio_mask])

1010

In [7]:
# Number of rows that have no studio flagged at all.
no_studio_mask = df['studio_count'] == 0
len(df.loc[no_studio_mask])

14516

In [8]:
# Remove, in place, every row whose studio_count is zero.
rows_without_studio = df.index[df['studio_count'] == 0]
df.drop(index=rows_without_studio, inplace=True)

In [9]:
# Rows remaining after the studio-less records were dropped.
len(df)

16070

In [10]:
# The helper column has served its purpose; remove it from the frame in place.
del df['studio_count']

# Group studios with a small production count (at most 2 titles) into one category: "few_production_studio"

In [11]:
# Column count before the rarely-used studios are collapsed.
len(df.columns)

1386

In [12]:
# All column names except 'title' are treated as studio columns from here on.
studios = df.columns.tolist()
studios.remove('title')

In [13]:
# Studios flagged on at most two rows are considered "few production" studios.
# The flags are counted for all studio columns in one vectorised pass instead
# of building a filtered copy of the whole frame once per studio; the resulting
# list keeps the same members in the same (column) order.
production_counts = df[studios].eq(1).sum()
few_production_studios = production_counts[production_counts <= 2].index.tolist()

In [14]:
# How many studio columns fell under the "few production" threshold.
len(few_production_studios)

656

In [15]:
# Per-row total of the flags across all rarely-used studio columns.
df['few_production_studio'] = df.loc[:, few_production_studios].sum(axis='columns')

In [16]:
# Turn the per-row total back into a 0/1 indicator by capping it at 1.
# `.loc` with a boolean mask is the supported way to assign to many rows;
# `.at` is a single-cell accessor and is not meant to receive a whole Index.
df.loc[df['few_production_studio'] > 1, 'few_production_studio'] = 1

In [17]:
# Column count now that the combined 'few_production_studio' column exists.
len(df.columns)

1387

In [18]:
# The individual rarely-used studio columns are now represented by the
# combined indicator, so remove them from the frame in place.
df.drop(columns=few_production_studios, inplace=True)

In [19]:
# Column count after the rarely-used studio columns were removed.
len(df.columns)

731

# Different names for the same studio

## Some examples:

* "production i.g., inc." vs "production i.g."
* "toei animation co., ltd." vs "toei animation"
* "animate" vs "animate film"
* "j.c. staff" vs "j.c.staff"

In [25]:
# Tokens ignored when comparing studio names: company suffixes (with and
# without the trailing dot) and generic words.
common_factors = [
    "co.",
    "inc.", "inc",
    "ltd.", "ltd",
    "corp.", "corp",
    "lab.", "lab",
    "film",
    "studio",
    "production",
    "animation",
]

In [26]:
def preprocess_studio_names(df, common_factors):
    """Find studio columns whose names differ only by ignorable tokens.

    Every column name except 'title' is normalised by splitting it on commas
    and spaces, discarding each token found in ``common_factors`` and gluing
    the remaining tokens together without separators (so "j.c. staff" and
    "j.c.staff" normalise to the same key). Columns that share a normalised
    key form one group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns, apart from an optional 'title' column, are
        studio names (strings).
    common_factors : container of str
        Tokens to ignore when comparing names, e.g. "inc.", "co.", "ltd.".

    Returns
    -------
    list of list of str
        One list per normalised key shared by at least two columns, holding
        the original column names in column order.

    Side effects
    ------------
    Prints the total number of grouped columns followed by the number of
    groups.
    """
    variants_by_key = {}
    for studio in df.columns:
        if studio == 'title':
            continue
        tokens = studio.replace(',', ' ').split(' ')
        key = ''.join(tok for tok in tokens if tok not in common_factors).strip()
        if not key:
            # The name consists solely of ignored tokens (e.g. "studio").
            # Nothing is left to compare, so it must not be lumped together
            # with other such names under the shared empty key.
            continue
        variants_by_key.setdefault(key, []).append(studio)

    # Only keys reached by two or more columns describe a naming conflict.
    grouped = [names for names in variants_by_key.values() if len(names) > 1]
    count = sum(len(names) for names in grouped)
    print(count, len(grouped))

    return grouped

In [27]:
# Collect the sets of column names that appear to denote the same studio.
groups = preprocess_studio_names(df=df, common_factors=common_factors)

239 118


In [28]:
# Show the candidate groups so they can be checked by eye before merging.
groups

[['wit studio', 'wit studio, inc.'],
 ['madhouse', 'madhouse inc.'],
 ['bones', 'bones inc.'],
 ['ufotable', 'ufotable, inc.'],
 ['a-1 pictures', 'a-1 pictures inc.'],
 ['studio pierrot', 'pierrot co., ltd.'],
 ['mappa', 'mappa co., ltd.'],
 ['kyoto animation', 'kyoto animation co., ltd.'],
 ['j.c. staff', 'j.c.staff'],
 ['toei animation', 'toei animation co., ltd.'],
 ['tms entertainment', 'tms entertainment co., ltd.'],
 ['studio deen', 'studio deen co., ltd.'],
 ['trigger', 'trigger inc.'],
 ['p.a. works', 'p.a. works co., ltd.'],
 ['sunrise', 'sunrise inc.'],
 ['david production', 'david production inc.'],
 ['gainax', 'gainax co., ltd.'],
 ['tatsunoko production', 'tatsunoko production co., ltd.'],
 ['kinema citrus', 'kinema citrus co., ltd.'],
 ['satelight', 'satelight inc.'],
 ['shaft', 'shaft inc.'],
 ['lidenfilms', 'liden films inc.'],
 ['telecom animation film', 'telecom animation film co., ltd.'],
 ['studio voln', 'studio voln co., ltd.'],
 ['bandai namco pictures', 'bandai n

In [30]:
# For each group of look-alike names, ask which spelling to keep (by its
# position in the prompt), fold the other columns into it and drop them.
for group in groups:
    ask_msg = ' OR '.join(group) +" ?"
    print(ask_msg)

    # Keep asking until the answer is a usable position in this group, so a
    # typo cannot abort the loop and leave `df` half-merged. Working on a copy
    # keeps `groups` itself unchanged.
    while True:
        duplicates = list(group)
        try:
            ref = duplicates.pop(int(input()))
            break
        except (ValueError, IndexError):
            print("Please enter an index between 0 and " + str(len(group) - 1))

    for studio in duplicates:
        # Boolean-mask `.loc` assignment: rows flagged for the duplicate
        # spelling get flagged for the kept spelling. (`.at` is a single-cell
        # accessor and is not meant to receive a whole Index.)
        df.loc[df[studio] == 1, ref] = 1
        df.drop(columns = studio, inplace = True)

wit studio OR wit studio, inc. ?
0
madhouse OR madhouse inc. ?
0
bones OR bones inc. ?
0
ufotable OR ufotable, inc. ?
0
a-1 pictures OR a-1 pictures inc. ?
0
studio pierrot OR pierrot co., ltd. ?
0
mappa OR mappa co., ltd. ?
0
kyoto animation OR kyoto animation co., ltd. ?
0
j.c. staff OR j.c.staff ?
1
toei animation OR toei animation co., ltd. ?
0
tms entertainment OR tms entertainment co., ltd. ?
0
studio deen OR studio deen co., ltd. ?
0
trigger OR trigger inc. ?
0
p.a. works OR p.a. works co., ltd. ?
0
sunrise OR sunrise inc. ?
0
david production OR david production inc. ?
0
gainax OR gainax co., ltd. ?
0
tatsunoko production OR tatsunoko production co., ltd. ?
0
kinema citrus OR kinema citrus co., ltd. ?
0
satelight OR satelight inc. ?
0
shaft OR shaft inc. ?
0
lidenfilms OR liden films inc. ?
0
telecom animation film OR telecom animation film co., ltd. ?
0
studio voln OR studio voln co., ltd. ?
0
bandai namco pictures OR bandai namco pictures inc. ?
0
tezuka productions OR tezuka

# Save preprocessed data

In [31]:
# Final size of the preprocessed table as (rows, columns).
df.shape

(16070, 610)

In [32]:
# Write the preprocessed table next to the input file, without the row index.
df.to_csv(path_or_buf="../data/csv/title_studios_preprocessed.csv", index=False)