## Gender Recognition by Voice

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
import scipy.stats as stats
import copy
import warnings
warnings.filterwarnings('ignore')

In [None]:
import operator
from collections import defaultdict

In [None]:
# Read the voice dataset from the input directory (path is relative to the
# notebook's working directory) and preview the first rows.
csv_path = '../input/voicegender/voice.csv'
voice = pd.read_csv(csv_path)
voice.head()

In [None]:
# Column names, dtypes and non-null counts.
voice.info()

In [None]:
# Summary statistics of the numeric columns, transposed so that each
# feature is a row.
voice.describe().transpose()

In [None]:
# Summary of the object-typed columns (the label is still a string here).
voice.describe(include='object')

In [None]:
# Number of missing values per column.
voice.isnull().sum()

#### We will predict gender from voice, so we can already convert the dependent variable into the numeric format we need.

In [None]:
# Encode the target numerically: male -> 1, female -> 0.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell after the column has already been encoded keeps the
# 0/1 codes instead of turning every label into NaN (which a plain
# `.map(dict)` does on the second run).
label_codes = {'male': 1, 'female': 0}
voice['label'] = voice['label'].map(lambda value: label_codes.get(value, value))
voice.head()

In [None]:
# Class balance of the target. The column is passed by keyword: seaborn's
# categorical plots no longer accept the data vector as a bare positional
# argument (the first positional parameter is `data`).
sns.countplot(x='label', data=voice)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
# (the encoded label included).
_, ax = plt.subplots(figsize=(20, 20))
correlations = voice.corr()
sns.heatmap(correlations, annot=True, linewidths=.5, fmt='.1f', ax=ax)

# Title and axis labels are set directly on the heatmap's Axes.
ax.set_title('Correlation', fontsize=30)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Features', fontsize=15)

plt.show()

Most correlated with gender (label):
 - meanfun: average of fundamental frequency measured across acoustic signal (-0.8);
 - IQR: interquartile range (0.6);
 - sd: standard deviation of frequency (0.5);
 - Q25: first quartile (-0.5);
 - sp.ent: spectral entropy (0.5).

#### Consider if there is any relationship between them.

In [None]:
# Pairwise scatter plots of the five features most correlated with the label,
# coloured by gender. `height` is the current name of the per-facet size
# parameter; the old `size` keyword was renamed in seaborn 0.9 and is not
# accepted by newer releases.
top_features = ['meanfun', 'IQR', 'sd', 'Q25', 'sp.ent', 'label']
sns.pairplot(voice[top_features], hue='label', height=3)
plt.show()

In [None]:
# Per-feature distributions, split by gender (0 = female, 1 = male).
# The target column itself is skipped: plotting `label` split by `label`
# gives two constant series, for which a density estimate is degenerate.
feature_columns = [col for col in voice.columns if col != 'label']
n_cols = 5
n_rows = -(-len(feature_columns) // n_cols)  # ceiling division

# Boolean row masks, computed once instead of on every iteration.
is_female = voice['label'] == 0
is_male = voice['label'] == 1

fig = plt.figure(figsize=(20, 15))
for position, col in enumerate(feature_columns, start=1):
    plt.subplot(n_rows, n_cols, position)
    # NOTE(review): `distplot` is deprecated in seaborn >= 0.11; kept so the
    # notebook does not require a newer seaborn than the rest of the code.
    sns.distplot(voice.loc[is_female, col], label='Female')
    sns.distplot(voice.loc[is_male, col], label='Male')
    plt.legend(loc='best')
fig.suptitle('Voice Data Analysis')
fig.tight_layout()
# Leave room at the top so the suptitle does not overlap the first row.
fig.subplots_adjust(top=0.90)
plt.show()

For some of the features, the male and female distributions are similar:
 - Q75: third quartile (in kHz);
 - minfun: minimum fundamental frequency measured across acoustic signal;
 - maxfun: maximum fundamental frequency measured across acoustic signal;
 - modindx: modulation index, calculated as the accumulated absolute difference between adjacent measurements of fundamental frequencies divided by the frequency range.


#### Therefore, we will delete them.

In [None]:
# Remove Q75, minfun, maxfun and modindx from the frame in place, then list
# the columns that remain.
similar_columns = ['Q75', 'minfun', 'maxfun', 'modindx']
voice.drop(columns=similar_columns, inplace=True)
voice.info()

In [None]:
# Number of distinct values in each remaining column.
voice.nunique()

There are 3168 rows in the data.
If a column has as many as 1366 unique values, it is better to delete it,
because it will not help in prediction.

In [None]:
# Remove the listed columns from the frame in place, then list what is left.
# NOTE(review): `meanfun` is dropped here although the correlation analysis
# earlier in the notebook lists it as the feature most correlated with the
# label — confirm this is intended.
high_cardinality_columns = [
    'meanfreq', 'sd', 'skew', 'kurt', 'sp.ent',
    'sfm', 'centroid', 'meanfun', 'median',
]
voice.drop(columns=high_cardinality_columns, inplace=True)
voice.info()

#### We have explored and prepared the data and can now proceed to the prediction.
Let's build several models and compare them to see which one is the best.

In [None]:
# Separate the feature matrix from the target vector.
# The target is selected by name rather than by position, so the split does
# not silently depend on `label` being the last column of the frame.
x = voice.drop(columns='label').values
y = voice['label'].values

In [None]:
# Hold out 25% of the rows for testing; the seed is fixed so the split is
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Summary statistics of the remaining columns, to compare their value ranges.
voice.describe()

We see that the features are on different scales, so it is better to standardize the independent variables in order to build a correct model.

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(x_train)

x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

### Logistic Regression model

In [None]:
# Logistic regression classifier with a fixed random_state.
model_logr = LogisticRegression(random_state=0)

In [None]:
# Fit on the scaled training split.
model_logr.fit(x_train, y_train)

In [None]:
# Predicted labels for the test split.
yhat_logr = model_logr.predict(x_test)

In [None]:
# Fraction of test samples classified correctly.
ac_logr = accuracy_score(y_test, yhat_logr)
ac_logr

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm_logr = confusion_matrix(y_test, yhat_logr)
cm_logr

In [None]:
# Per-class precision, recall and F1-score on the test split.
cr_logr = classification_report(y_test, yhat_logr)
print(cr_logr)

### Support Vector Machine model

In [None]:
# Support vector classifier with a linear kernel and a fixed random_state.
model_svm = SVC(kernel='linear', random_state=0)

In [None]:
# Fit on the scaled training split.
model_svm.fit(x_train, y_train)

In [None]:
# Predicted labels for the test split.
yhat_svm = model_svm.predict(x_test)

In [None]:
# Fraction of test samples classified correctly.
ac_svm = accuracy_score(y_test, yhat_svm)
ac_svm

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm_svm = confusion_matrix(y_test, yhat_svm)
cm_svm

In [None]:
# Per-class precision, recall and F1-score on the test split.
cr_svm = classification_report(y_test, yhat_svm)
print(cr_svm)

### K-Nearest Neighbors model 

First, we need to choose the k at which the model gives the best result. How do we evaluate the result? By accuracy.
In the previous models we calculated everything manually; now it will be more convenient to create a function.

In [None]:
def calculate_accuracy(model, x_train, y_train, x_test, y_test):
    """Fit `model` on the training data and return its accuracy on the test data.

    The model is fitted in place via `model.fit(x_train, y_train)`; its
    predictions for `x_test` are then scored against `y_test` with
    `accuracy_score`.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return accuracy_score(y_test, predictions)

In [None]:
def choose_k(k, x_train, y_train, x_test, y_test):
    """Plot k-NN accuracy for n_neighbors = 1 .. k-1 and return the best value.

    Parameters
    ----------
    k : int
        Exclusive upper bound of the neighbour counts to try; must be >= 2.
    x_train, y_train
        Data each candidate classifier is fitted on.
    x_test, y_test
        Data each candidate classifier is scored on.

    Returns
    -------
    int
        The smallest n_neighbors that reaches the highest accuracy. The value
        is also printed, as before.

    Raises
    ------
    ValueError
        If `k` is smaller than 2, i.e. there is no candidate to evaluate.

    NOTE(review): the candidates are ranked on the data passed as
    x_test / y_test. If that is the final hold-out set, the accuracy later
    reported for the chosen k on the same set is optimistic.
    """
    if k < 2:
        # Fail before fitting or plotting anything; the original only failed
        # later, when max() was called on an empty list.
        raise ValueError(f"k must be at least 2 to evaluate any candidate, got {k}")

    candidates = range(1, k)
    accuracies = [
        calculate_accuracy(
            KNeighborsClassifier(n_neighbors=n_neighbors, metric='minkowski', p=2),
            x_train, y_train, x_test, y_test,
        )
        for n_neighbors in candidates
    ]

    plt.plot(candidates, accuracies, linestyle='dashed', marker='o', markerfacecolor='red')
    plt.xlabel('Number of K')
    plt.ylabel('Accuracy')
    plt.show()

    # list.index returns the first occurrence, so ties go to the smallest k.
    best_k = candidates[accuracies.index(max(accuracies))]
    print(f"Best K = {best_k}")
    return best_k

In [None]:
# Evaluate n_neighbors = 1 .. 24 (the first argument is an exclusive bound).
choose_k(25, x_train, y_train, x_test, y_test)

In [None]:
# k-NN classifier using the Minkowski metric with p=2.
# NOTE(review): n_neighbors=11 is hard-coded — presumably read off the
# choose_k output above; confirm it still matches after a re-run.
model_knn = KNeighborsClassifier(n_neighbors=11, metric='minkowski', p=2)

In [None]:
# Fit on the scaled training split.
model_knn.fit(x_train, y_train)

In [None]:
# Predicted labels for the test split.
yhat_knn = model_knn.predict(x_test)

In [None]:
# Fraction of test samples classified correctly.
ac_knn = accuracy_score(y_test, yhat_knn)
ac_knn

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm_knn = confusion_matrix(y_test, yhat_knn)
cm_knn

In [None]:
# Per-class precision, recall and F1-score on the test split.
cr_knn = classification_report(y_test, yhat_knn)
print(cr_knn)

### Naive Bayes model

In [None]:
# Gaussian naive Bayes classifier with default settings.
model_nb = GaussianNB()

In [None]:
# Fit on the scaled training split.
model_nb.fit(x_train, y_train)

In [None]:
# Predicted labels for the test split.
yhat_nb = model_nb.predict(x_test)

In [None]:
# Fraction of test samples classified correctly.
ac_nb = accuracy_score(y_test, yhat_nb)
ac_nb

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm_nb = confusion_matrix(y_test, yhat_nb)
cm_nb

In [None]:
# Per-class precision, recall and F1-score on the test split.
cr_nb = classification_report(y_test, yhat_nb)
print(cr_nb)

### Decision Tree model

In [None]:
# Decision tree classifier using the Gini criterion and a fixed random_state.
model_dt = DecisionTreeClassifier(criterion = 'gini', random_state = 0)

In [None]:
# Fit on the scaled training split.
model_dt.fit(x_train, y_train)

In [None]:
# Predicted labels for the test split.
yhat_dt = model_dt.predict(x_test)

In [None]:
# Fraction of test samples classified correctly.
ac_dt = accuracy_score(y_test, yhat_dt)
ac_dt

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm_dt = confusion_matrix(y_test, yhat_dt)
cm_dt

In [None]:
# Per-class precision, recall and F1-score on the test split.
cr_dt = classification_report(y_test, yhat_dt)
print(cr_dt)