# Character data visualisation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import matplotlib.ticker as ticker

In [None]:
# strptime format used to parse the strings in the "release" column
date_format = "%Y-%m-%d"

# Character dataset shared by every cell below; the path is relative to the
# working directory the notebook is run from
df = pd.read_csv("./datasets/characters.csv")

In [None]:
# Age group assigned to each known value of the "body" column
_AGE_BY_BODY = {
    "loli": "child",
    "boy": "teenage",
    "girl": "teenage",
    "boy/girl": "teenage",
    "male": "adult",
    "lady": "adult",
}


def label_age(row):
    """Return the age group ("child", "teenage" or "adult") for a character row.

    The group is looked up from the row's "body" value. None is returned
    when the body type has no entry in the lookup table.
    """
    return _AGE_BY_BODY.get(row["body"])

# Derive the "age" column by running label_age over every row
df["age"] = df.apply(label_age, axis=1)

# Latest release date in the dataset; the historical graphs extend every
# category's line up to this date so that all lines end at the same x position
max_date = dt.datetime.strptime(
    df["release"].max(),
    date_format,
)

## Simple visualisation

Visualising simple distributions here. Pie charts on the left show how characters are currently distributed within different categories (or rather, how they were distributed the last time the dataset was updated). Graphs on the right show how the number of characters in each category changed over time.

In [None]:
def simple_visualisation(column):
    """Plot how playable characters are distributed over the values of `column`.

    Draws one figure with two panels: a pie chart with the share of each
    category on the left, and a step graph with the cumulative number of
    characters per category over the release dates on the right. After the
    figure is shown, the size of every category is printed.

    Reads the module-level `df`, `date_format` and `max_date`.

    Args:
        column: Name of the `df` column to categorise the characters by.
    """
    fig, (pie_ax, history_ax) = plt.subplots(
        1, 2, figsize=(12, 5), gridspec_kw={"width_ratios": [3, 5]}
    )
    fig.suptitle(f"Playable character distribution by {column}", fontsize=16)
    pie_ax.set_title("Current", fontweight="bold", pad=20)
    history_ax.set_title("Historical", fontweight="bold", pad=20)

    # Only display whole values on the step graph y axis
    history_ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))

    group_sizes = df.groupby(column, as_index=False).size()

    pie_ax.pie(group_sizes["size"], labels=group_sizes[column], autopct="%1.1f%%")

    for column_val in group_sizes[column]:
        # Select with df[column] rather than attribute access, which breaks
        # for column names that clash with DataFrame attributes.
        # groupby sorts the release dates, so the running total is the number
        # of characters of this category released up to and including each date.
        released_so_far = (
            df[df[column] == column_val].groupby("release").size().cumsum()
        )

        x_axis = [
            dt.datetime.strptime(release, date_format)
            for release in released_so_far.index
        ]
        y_axis = released_so_far.tolist()

        # Stretch every category to the right edge so that all lines end on
        # the latest release date in the dataset
        if x_axis[-1] < max_date:
            x_axis.append(max_date)
            y_axis.append(y_axis[-1])

        history_ax.step(x_axis, y_axis, label=column_val, marker="o", markersize=4, where="post")

    fig.tight_layout()
    plt.show()

    print(group_sizes)

### Categorised by sex
Since the traveller's sex is selected by the player, he or she is listed in a separate category. Unsurprisingly, there are more female characters than male characters.

In [None]:
simple_visualisation("sex")

### Categorised by age

It seems like the percentage of adult characters has risen over time, while the percentage of children has gone down.

In [None]:
simple_visualisation("age")

### Categorised by body type
All playable characters in Genshin Impact use one of the five body templates with custom clothes and hair added on top. Those body templates are (as named in game files):

|Body type|Description    |Example characters     |
|---------|---------------|-----------------------|
|Boy      |Teenage male   |Albedo, Gorou, Kazuha  |
|Male     |Young male     |Zhongli, Itto, Diluc   |
|Loli     |Child female   |Klee, Sayu, Diona      |
|Girl     |Teenage female |Sucrose, Ayaka, Kokomi |
|Lady     |Young female   |Jean, Shogun, Sara     |

Since traveller's sex is chosen by the player, he or she is listed in a separate category.
Girl body type is the most popular here.

In [None]:
simple_visualisation("body")

### Categorised by wielded weapon

Polearms were the least popular weapon initially, but they are catching up now. Bows, on the other hand, jumped from the second least popular weapon to the position of the most popular weapon. Why is everyone calling it "Polearm Impact" when it's in fact "Bow Impact"?

In [None]:
simple_visualisation("weapon")

### Categorised by home region
Traveller and Aloy do not have a home region within Teyvat, so they are listed in a separate category. Inazuma rises at a really fast pace.

In [None]:
simple_visualisation("region")

### Categorised by vision element
Traveller technically doesn't have a vision, and his or her element can be switched, so there's a separate category for him or her.

In [None]:
simple_visualisation("vision")

### Categorised by quality
The number of ☆☆☆☆☆ characters vs the number of ☆☆☆☆ characters. There was a good balance at the start, but they mihoyo'd it up. 

In [None]:
simple_visualisation("quality")

## Divided visualisation

Comparing category distribution between different categories. It's hard to explain in words; luckily, I don't have to.

In [None]:
def divided_visualisation(column, divided_by):
    """Draw one pie chart of `column` for every distinct value of `divided_by`.

    The pies are laid out on a grid with at most three charts per row. Each
    pie is titled with its `divided_by` value and shows how the matching rows
    of the module-level `df` are distributed over the values of `column`.
    Grid cells left over in the last row are hidden.

    Args:
        column: Name of the `df` column whose distribution each pie shows.
        divided_by: Name of the `df` column that decides which pie a row
            belongs to. Rows with a missing value in this column are skipped.

    Raises:
        ValueError: If `divided_by` contains no non-missing values.
    """
    # A missing value never equals itself, so it could only produce an empty pie
    divisions = df[divided_by].dropna().unique()
    if len(divisions) == 0:
        raise ValueError(f"Column {divided_by!r} has no values to divide by")

    # At most three pies per row. Ceiling division gives exactly enough rows,
    # so a count that is a multiple of three gets no extra empty row.
    ncols = min(len(divisions), 3)
    nrows = (len(divisions) + ncols - 1) // ncols

    # squeeze=False always yields a 2-D array of axes, so a single pie or a
    # single row needs no special case; ravel() walks the grid row by row
    fig, ax = plt.subplots(
        nrows, ncols, figsize=(6 * ncols, 5 * nrows), squeeze=False
    )
    axes = ax.ravel()

    for pie_ax, division in zip(axes, divisions):
        pie_ax.set_title(division, fontweight="bold", pad=20)
        group_sizes = df[df[divided_by] == division].groupby(column, as_index=False).size()
        pie_ax.pie(group_sizes["size"], labels=group_sizes[column], autopct="%1.1f%%")

    # Hide the grid cells that did not receive a pie
    for unused_ax in axes[len(divisions):]:
        unused_ax.set_visible(False)

    fig.suptitle(f"Playable character distribution by {column} divided by {divided_by}", fontsize=16)
    fig.tight_layout()
    plt.show()

### Categorised by wielded weapon, compared between sexes
Interesting how there are no male catalyst users at all, while it's the most popular weapon among female characters. At the same time, swords and claymores are much more popular among male characters.

In [None]:
divided_visualisation("weapon", "sex")

### Categorised by wielded weapon, compared between ages
Looks like bows and swords are more popular among teenage characters, but claymores and polearms are more popular among adult characters.

In [None]:
divided_visualisation("weapon", "age")

### Categorised by age, compared between sexes
The percentage of teenage male and teenage female characters is almost the same. And also, there are no male children.

In [None]:
divided_visualisation("age", "sex")

### Categorised by vision, compared between sexes
Geo and anemo are much more popular among male characters, while cryo and electro are much more popular among female characters.

In [None]:
divided_visualisation("vision", "sex")

### Categorised by age, compared between vision elements
Looks like pyro and anemo are the least "adult" elements while geo and electro are the most "adult" elements.

In [None]:
divided_visualisation("age", "vision")

### Categorised by vision, compared between wielded weapons
Looks like the most iconic duos are: hydro with a catalyst and pyro with a polearm.

In [None]:
divided_visualisation("vision", "weapon")

### Categorised by wielded weapon, compared between regions
Looks like the following regions have signature weapons: catalyst in Mondstadt, bow in Inazuma and polearm in Liyue.

In [None]:
divided_visualisation("weapon", "region")

### Categorised by home region, compared between quality
Half of all four-star characters are from Mondstadt, and five-star characters are equally distributed between released regions.

In [None]:
divided_visualisation("region", "quality")

### Categorised by quality, compared between vision elements
Electro visions have the largest percentage of four-star characters. Shame on you, Baal.
At the same time, anemo characters are mostly five-star.

In [None]:
divided_visualisation("quality", "vision")

## Misc

Just having a ball here. Here's a graph showing how the average name length of playable characters changed over time. Since traveller's name is chosen by the player, he or she isn't taken into account.

Typical character name after Snezhnaya comes out: "Petrov Rostislav Yakovlevich".

In [None]:
# The traveller's name is chosen by the player, so it is left out of the average
named_characters = df[df["name"] != "traveller"]

# (release date, average name length) pairs in chronological order. Each
# average covers every named character released up to and including that date.
# Both conditions are applied to the same frame so the boolean mask always
# lines up with the rows it filters.
avg_name_length_per_date = [
    (
        date,
        named_characters.loc[named_characters["release"] <= date, "name"].str.len().mean(),
    )
    for date in sorted(df["release"].dropna().unique())
]

x_axis = [dt.datetime.strptime(date, date_format) for date, _ in avg_name_length_per_date]
y_axis = [float(average) for _, average in avg_name_length_per_date]

f = plt.figure(figsize=(8, 4))

plt.step(x_axis, y_axis, marker="o", markersize=4, where="post")
plt.title("Average playable character name length")
plt.show()