# Gender voice recognition - eksploracja danych
Michał Piasecki, Bartosz Siński 

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the two CSV inputs (paths are relative to the working directory).
# df_voice: the voice dataset analysed throughout the notebook.
# df_attr: presumably a table describing the dataset's attributes — TODO confirm.
df_voice = pd.read_csv(filepath_or_buffer="./src/gender_voice_dataset.csv")
df_attr = pd.read_csv(filepath_or_buffer="./src/attributes_gender_voice.csv")

# Podstawowe informacje 

In [None]:
# Allow up to 200 characters per cell so long text values are not truncated,
# then display the attribute table.
pd.set_option("display.max_colwidth", 200)
df_attr

In [None]:
# Print column dtypes and non-null counts for the voice dataset.
df_voice.info()

Jak widzimy, nasz zbiór nie zawiera brakujących danych.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_voice.describe()

In [None]:
# One histogram per remaining column after removing the "label" column.
df_voice.drop(columns=["label"]).hist(figsize=(18, 12), bins=40)
plt.show()

In [None]:
# Number of rows per label, counted through the non-null "meanfreq" values.
voice_grouped = df_voice.groupby("label")
voice_grouped.meanfreq.count()

In [None]:
# One-hot encode the non-numeric columns. Later cells rely on the resulting
# "label_female" / "label_male" indicator columns.
# NOTE(review): this rebinds df_voice, so the original "label" column is gone
# afterwards — presumably why a later cell re-reads the CSV; confirm.
df_voice=pd.get_dummies(df_voice)

# Korelacje i zależności zmiennych 

In [None]:
# Heatmap of the pairwise correlations between all columns of df_voice.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(df_voice.corr(), ax=ax)
plt.show()

Przyjrzyjmy się bliżej zmiennym *meanfreq, median, Q25, Q75, mode, centroid*, które wydają się być ze sobą najlepiej skorelowane.

In [None]:
# Annotated correlation heatmap restricted to a subset of six features;
# each cell shows the coefficient rounded to two decimals.
selected = ['meanfreq', 'median', 'Q25', 'Q75', 'mode', 'centroid']
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    df_voice[selected].corr(),
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    ax=ax,
)
plt.show()

In [None]:
# Apply seaborn's default theme (this affects every plot drawn from here on),
# then draw pairwise scatter plots / distributions for the six selected features.
sns.set()
pair_columns = ['meanfreq', 'median', 'Q25', 'Q75', 'mode', 'centroid']
sns.pairplot(data=df_voice[pair_columns], height=5)
plt.show()

Już teraz widzimy, że niektórych zmiennych będziemy mogli nie uwzględniać przy budowie naszego modelu. 

# Zmienne najlepiej skorelowane z targetem

In [None]:
# Correlation of every column with the two one-hot label indicators,
# shown in order of decreasing absolute correlation with "label_male".
voice_corr = df_voice.corr().loc[:, ['label_male', 'label_female']]
negated_strength = -voice_corr['label_male'].abs()
strength_order = negated_strength.argsort()
voice_corr.iloc[strength_order]

In [None]:
# Box plots of five features, split by the "label_female" indicator,
# stacked vertically with one feature per row.
box_features = ["meanfun", "IQR", "Q25", "sp.ent", "sd"]
fig, axs = plt.subplots(nrows=len(box_features), figsize=(5, 20))
for ax, feature in zip(axs, box_features):
    sns.boxplot(
        data=df_voice,
        x="label_female",
        y=feature,
        ax=ax,
        palette=["deepskyblue", "pink"],
    )
# Compute the layout after drawing so the padding accounts for the axis
# labels and tick labels that the plots add.
fig.tight_layout(pad=3.0)
plt.show()

# Różnice pomiędzy kobietami i mężczyznami 

In [None]:
# Re-read the raw CSV: these plots need the original string "label" column.
df_voice1 = pd.read_csv("./src/gender_voice_dataset.csv")
females = df_voice1[df_voice1.label == "female"]
males = df_voice1[df_voice1.label == "male"]
columns = ['meanfun', 'IQR', 'sp.ent', 'Q25', 'sd']
# Map colours by label value rather than by position: a list palette is assigned
# in order of appearance in the data, so the colours would swap if the CSV rows
# were ordered differently.
# NOTE(review): assumes "male" and "female" are the only label values — confirm.
label_palette = {"male": "deepskyblue", "female": "pink"}
for column in columns:
    # One figure per feature, with side-by-side ("dodge") bars for the two labels.
    sns.histplot(
        data=df_voice1,
        x=column,
        hue="label",
        bins=20,
        multiple="dodge",
        palette=label_palette,
    )
    plt.show()

In [None]:
# Scatter plot of "meanfun" against "IQR", coloured by the "label_female" indicator.
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(
    data=df_voice,
    x="IQR",
    y="meanfun",
    hue="label_female",
    palette=["deepskyblue", "pink"],
    ax=ax,
)
plt.show()

In [None]:
# Scatter plot of "Q25" against "meanfun", coloured by the "label_female" indicator.
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(
    data=df_voice,
    x="meanfun",
    y="Q25",
    hue="label_female",
    palette=["deepskyblue", "pink"],
    ax=ax,
)
plt.show()