In [None]:
# essentials
import pandas as pd
import numpy as np

# models and training

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.cluster import AgglomerativeClustering

# visualization
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# Seed NumPy's global random generator so the notebook is consistent across runs.
np.random.seed(42)

reading in the features:

In [None]:
# Read the label table and the four feature tables, in this order.
labels, melMFCC, cwt, spectrum, zcrossRMS = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/dataset10/labels.csv",
        "../input/dataset10/melMFCC (3).csv",
        "../input/dataset10/cwt (1).csv",
        "../input/dataset10/spectrum (3).csv",
        "../input/dataset10/zcrossRMS.csv",
    )
)

# Place the four feature tables side by side (column-wise) in one frame.
data = pd.concat([melMFCC, cwt, spectrum, zcrossRMS], axis="columns")

filling in the missing data:

In [None]:
from sklearn.impute import SimpleImputer

# Replace each missing entry with the mean of its column. Assigning through
# `data[:]` writes the imputed values back into the existing frame.
imputer = SimpleImputer(strategy="mean")
data[:] = imputer.fit_transform(data)

standardizing the data:

In [None]:
from sklearn import preprocessing

# Fit the standardizer on the imputed features, then keep the scaled result
# in `dataset`; `data` itself is not modified by this cell.
scaler = preprocessing.StandardScaler()
scaler.fit(data)
dataset = scaler.transform(data)

pca for visualization:

In [None]:
from sklearn.decomposition import PCA
# Two-component PCA object for the 2-D visualization; it is only created here, not fitted.
pca_2d = PCA(n_components=2)

Hierarchical clustering:

In [None]:
# Ward-linkage agglomerative clustering into 5 clusters.
# NOTE(review): this clusters `data` (the imputed features), not the
# standardized `dataset` that the KMeans cell uses -- confirm this is intended.
model = AgglomerativeClustering(n_clusters=5, linkage="ward")
# Exclude the bookkeeping columns that this notebook adds to the frame
# ("Cluster" below, "dummy" in the plotting cells) so that re-running this cell
# never clusters on its own previous output. On a fresh run neither column
# exists yet and the input is the full feature frame.
y_pred = model.fit_predict(data.drop(columns=["Cluster", "dummy"], errors="ignore"))
# Store the cluster id of every row next to its features.
data["Cluster"] = y_pred

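Finding the most frequent cluster in each block of rows (the blocks are assumed to correspond to the five classes, in order):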

In [None]:
# Cluster-label counts for rows 0-269, then the same counts (largest first)
# as fractions of the slice.
unique1 = pd.DataFrame(y_pred).iloc[:270].value_counts()
print(unique1)
print(np.sort(unique1)[::-1] / np.sum(unique1))
# 0 = 1  (author's note; presumably "cluster 0 <-> class 1" -- TODO confirm)

In [None]:
# Cluster-label counts for rows 270-549, then the same counts (largest first)
# as fractions of the slice.
unique2 = pd.DataFrame(y_pred).iloc[270:550].value_counts()
print(unique2)
print(np.sort(unique2)[::-1] / np.sum(unique2))
# 3 = 2  (author's note; presumably "cluster 3 <-> class 2" -- TODO confirm)

In [None]:
# Cluster-label counts for rows 550-837, then the same counts (largest first)
# as fractions of the slice.
unique3 = pd.DataFrame(y_pred).iloc[550:838].value_counts()
print(unique3)
print(np.sort(unique3)[::-1] / np.sum(unique3))
# 1 = 3  (author's note; presumably "cluster 1 <-> class 3" -- TODO confirm)

In [None]:
# Cluster-label counts for rows 838-1112, then the same counts (largest first)
# as fractions of the slice.
unique4 = pd.DataFrame(y_pred).iloc[838:1113].value_counts()
print(unique4)
print(np.sort(unique4)[::-1] / np.sum(unique4))
# 2 = 4  (author's note; presumably "cluster 2 <-> class 4" -- TODO confirm)

In [None]:
# Cluster-label counts for rows 1113 onward, then the same counts (largest
# first) as fractions of the slice.
unique5 = pd.DataFrame(y_pred).iloc[1113:].value_counts()
print(unique5)
print(np.sort(unique5)[::-1] / np.sum(unique5))
# 4 = 5  (author's note; presumably "cluster 4 <-> class 5" -- TODO confirm)

In [None]:
# Hand-written translation from cluster id to class label.
# NOTE(review): the notes in the preceding cells read "3 = 2", "2 = 4" and
# "4 = 5", which does not match the 3 -> 4, 2 -> 5 and 4 -> 2 entries below --
# confirm which of the two is current.
cluster_to_class = {0: 1, 3: 4, 1: 3, 2: 5, 4: 2}

# Every mask is taken from the untouched `y_pred`, so an entry that has already
# been rewritten in `y_new` is never matched a second time.
y_new = np.copy(y_pred)
for cluster_id, class_id in cluster_to_class.items():
    y_new[y_pred == cluster_id] = class_id

It looks like the Neyman–Pearson criterion: almost all of the data has been grouped into 3 clusters, and the clustering algorithm appears to be minimizing one specific type of error. It seems that Lori and Kordi are very similar to the other classes in the data, so the clustering algorithm does not spot a difference.

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Confusion matrix of `labels` against the remapped cluster assignments.
df_cm = confusion_matrix(labels, y_new)

fig, ax = plt.subplots(figsize=(10, 8))
# Draw on the axes created above; they are also the current axes, which is
# where the heatmap would go if no `ax` were given.
sns.heatmap(df_cm, annot=True, annot_kws={"size": 14}, cmap="YlGnBu", fmt='g', ax=ax);
ax.set_title("Confusion Matrix")
# Same five tick labels on both axes.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['Lori', 'Kordi', 'Torki', 'Bandari', 'Gilaki'])

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Overall accuracy, then the per-class report, for the remapped labels.
print("Accuracy of Agglomarative")
print("Accuracy: ", accuracy_score(labels, y_new))
print(
    classification_report(
        labels,
        y_new,
        target_names=['Lori', 'Kordi', 'Torki', 'Bandari', 'Gilaki'],
    )
)

In [None]:
# Work on an independent copy for plotting. A plain `plotX = data` would only
# create a second name for the same frame, so the helper columns added to
# `plotX` in the following cells would silently appear in `data` as well.
plotX = data.copy()

In [None]:
from sklearn.decomposition import PCA

# Project everything except the cluster label onto two principal components.
# The cells that follow repeat each of these steps one at a time.
pca_2d = PCA(n_components=2)
PCs_2d = pd.DataFrame(pca_2d.fit_transform(plotX.drop(columns=["Cluster"])))
PCs_2d.columns = ["PC1_2d", "PC2_2d"]

# Constant helper column.
plotX["dummy"] = 0

# One sub-frame per cluster id, 0 through 4.
cluster0, cluster1, cluster2, cluster3, cluster4 = (
    plotX[plotX["Cluster"] == cluster_id] for cluster_id in range(5)
)

In [None]:
# Fresh, unfitted two-component PCA object; replaces the one bound to `pca_2d` earlier.
pca_2d = PCA(n_components=2)

In [None]:
# Fit the 2-D PCA on every column of plotX except the cluster label and wrap
# the projected coordinates in a DataFrame.
pca_input_2d = plotX.drop(columns=["Cluster"])
PCs_2d = pd.DataFrame(pca_2d.fit_transform(pca_input_2d))

In [None]:
# Give the two projected coordinates the names that the 2-D plotting cell looks up.
PCs_2d.columns = ["PC1_2d", "PC2_2d"]

In [None]:
# Append the two PC columns to plotX; join='inner' keeps only the index labels
# present in both frames. This rebinds `plotX` to a new frame.
plotX = pd.concat([plotX,PCs_2d], axis=1, join='inner')

In [None]:
# Add (or reset) a constant column called "dummy"; no visible cell refers to it by name.
plotX["dummy"] = 0

In [None]:
# Split plotX (now carrying the 2-D PC columns) into one sub-frame per cluster id.
cluster0, cluster1, cluster2, cluster3, cluster4 = (
    plotX[plotX["Cluster"] == cluster_id] for cluster_id in range(5)
)

In [None]:
# Set up Plotly's offline notebook mode before the first iplot() call.
init_notebook_mode(connected=True)

In [None]:
# 2-D scatter of the first two principal components, one trace per cluster.
# Each entry: (cluster sub-frame, legend name, marker colour).
trace_specs = [
    (cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)'),
    (cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
    (cluster2, "Cluster 2", 'rgba(128, 128, 128, 0.8)'),
    (cluster3, "Cluster 3", 'rgba(0, 255, 255, 0.8)'),
    (cluster4, "Cluster 4", 'rgba(145, 30, 180, 0.4)'),
]

# Note: from here on the name `data` refers to this list of traces, no longer
# to the feature frame.
data = [
    go.Scatter(
        x=frame["PC1_2d"],
        y=frame["PC2_2d"],
        mode="markers",
        name=legend_name,
        marker=dict(color=rgba),
        text=None,
    )
    for frame, legend_name, rgba in trace_specs
]
trace1, trace2, trace3, trace4, trace5 = data

title = "Visualizing Clusters in Two Dimensions Using PCA"

layout = dict(
    title=title,
    xaxis=dict(title='PC1', ticklen=5, zeroline=False),
    yaxis=dict(title='PC2', ticklen=5, zeroline=False),
)

fig = dict(data=data, layout=layout)

iplot(fig)

In [None]:
# Unfitted three-component PCA object for the 3-D visualization.
pca_3d = PCA(n_components=3)

In [None]:
# Fit the 3-D PCA on every column of plotX except the cluster label.
# NOTE(review): by this point plotX also holds "dummy", "PC1_2d" and "PC2_2d",
# so those columns are part of the PCA input alongside the features.
# Presumably only the original features were meant -- confirm, and re-check
# the hard-coded cluster mappings further down if this input is changed.
pca_input_3d = plotX.drop(columns=["Cluster"])
PCs_3d = pd.DataFrame(pca_3d.fit_transform(pca_input_3d))

In [None]:
# Give the three projected coordinates the names that the 3-D plotting cells look up.
PCs_3d.columns = ["PC1_3d", "PC2_3d", "PC3_3d"]

In [None]:
# Append the three PC columns to plotX; join='inner' keeps only the index
# labels present in both frames.
plotX = pd.concat([plotX, PCs_3d], axis=1, join='inner')

In [None]:
# Reset the constant "dummy" column; no visible cell refers to it by name.
plotX["dummy"] = 0

In [None]:
# Split plotX (now carrying the 3-D PC columns) into one sub-frame per cluster id.
cluster0, cluster1, cluster2, cluster3, cluster4 = (
    plotX[plotX["Cluster"] == cluster_id] for cluster_id in range(5)
)

In [None]:
#Instructions for building the 3-D plot

# One Scatter3d trace per cluster sub-frame.
# Each entry: (cluster sub-frame, legend name, marker colour).
trace_specs = [
    (cluster0, "Cluster 0", 'rgba(128, 128, 0, 0.8)'),
    (cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
    (cluster2, "Cluster 2", 'rgba(128, 128, 128, 0.8)'),
    (cluster3, "Cluster 3", 'rgba(0, 255, 255, 0.8)'),
    (cluster4, "Cluster 4", 'rgba(145, 30, 180, 0.4)'),
]

data = [
    go.Scatter3d(
        x=frame["PC1_3d"],
        y=frame["PC2_3d"],
        z=frame["PC3_3d"],
        mode="markers",
        name=legend_name,
        marker=dict(color=rgba),
        text=None,
    )
    for frame, legend_name, rgba in trace_specs
]
# Keep the individual trace names bound, as before.
trace1, trace2, trace3, trace4, trace5 = data

title = "Visualizing Clusters in Three Dimensions Using PCA"

# The axes of a 3-D plot are configured under `layout.scene`; top-level
# `xaxis`/`yaxis` entries only apply to 2-D cartesian plots, so the axis
# titles placed there were never shown on this figure.
layout = dict(
    title=title,
    scene=dict(
        xaxis=dict(title='PC1', ticklen=5, zeroline=False),
        yaxis=dict(title='PC2', ticklen=5, zeroline=False),
        zaxis=dict(title='PC3', ticklen=5, zeroline=False),
    ),
    width=1400,
    height=1000,
)

fig = dict(data=data, layout=layout)

iplot(fig)

In [None]:
from sklearn.cluster import KMeans

# K-Means with 5 clusters on the standardized features. `y_pred` is rebound
# here and no longer holds the agglomerative labels.
# NOTE(review): no random_state is passed, so the result presumably depends on
# the state of NumPy's global generator when this cell runs -- confirm that the
# hard-coded mapping further down still matches after any upstream change.
kmeans = KMeans(n_clusters=5)
kmeans.fit(dataset)
y_pred = kmeans.predict(dataset)

In [None]:
# K-Means label counts for rows 0-269, then the same counts (largest first)
# as fractions of the slice.
unique1 = pd.DataFrame(y_pred).iloc[:270].value_counts()
print(unique1)
print(np.sort(unique1)[::-1] / np.sum(unique1))
# 0 = 1  (author's note; presumably "cluster 0 <-> class 1" -- TODO confirm)

In [None]:
# K-Means label counts for rows 270-549, then the same counts (largest first)
# as fractions of the slice.
unique2 = pd.DataFrame(y_pred).iloc[270:550].value_counts()
print(unique2)
print(np.sort(unique2)[::-1] / np.sum(unique2))
# 4 = 2  (author's note; presumably "cluster 4 <-> class 2" -- TODO confirm)


In [None]:
# K-Means label counts for rows 550-837, then the same counts (largest first)
# as fractions of the slice.
unique3 = pd.DataFrame(y_pred).iloc[550:838].value_counts()
print(unique3)
print(np.sort(unique3)[::-1] / np.sum(unique3))
# 1 = 3  (author's note; presumably "cluster 1 <-> class 3" -- TODO confirm)


In [None]:
# K-Means label counts for rows 838-1112, then the same counts (largest first)
# as fractions of the slice.
unique4 = pd.DataFrame(y_pred).iloc[838:1113].value_counts()
print(unique4)
print(np.sort(unique4)[::-1] / np.sum(unique4))
# 3 = 4  (author's note; presumably "cluster 3 <-> class 4" -- TODO confirm)


In [None]:
# K-Means label counts for rows 1113 onward, then the same counts (largest
# first) as fractions of the slice.
unique5 = pd.DataFrame(y_pred).iloc[1113:].value_counts()
print(unique5)
print(np.sort(unique5)[::-1] / np.sum(unique5))
# 2 = 5  (author's note; presumably "cluster 2 <-> class 5" -- TODO confirm)


In [None]:
# Hand-written translation from K-Means cluster id to class label.
# NOTE(review): the notes in the preceding cells read "0 = 1", "4 = 2",
# "1 = 3", "3 = 4" and "2 = 5", none of which matches the entries below --
# confirm which of the two is current.
cluster_to_class = {0: 5, 4: 3, 1: 4, 3: 2, 2: 1}

# Every mask is taken from the untouched `y_pred`, so an entry that has already
# been rewritten in `y_new` is never matched a second time.
y_new = np.copy(y_pred)
for cluster_id, class_id in cluster_to_class.items():
    y_new[y_pred == cluster_id] = class_id

In [None]:
# Turn y_new into a single-column 2-D array so it compares element-wise with
# `labels`, then display the number of matches divided by the number of rows.
# assumes `labels` has exactly one column -- TODO confirm
y_new = y_new.reshape(-1, 1)
np.sum(y_new == labels) / len(labels)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Confusion matrix of `labels` against the remapped K-Means assignments.
df_cm = confusion_matrix(labels, y_new)

fig, ax = plt.subplots(figsize=(10, 8))
# Draw on the axes created above; they are also the current axes, which is
# where the heatmap would go if no `ax` were given.
sns.heatmap(df_cm, annot=True, annot_kws={"size": 14}, cmap="YlGnBu", fmt='g', ax=ax);
ax.set_title("Confusion Matrix")
# Same five tick labels on both axes.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['Lori', 'Kordi', 'Torki', 'Bandari', 'Gilaki'])

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Overall accuracy, then the per-class report, for the remapped K-Means labels.
print("Accuracy of K-Means")
print("Accuracy: ", accuracy_score(labels, y_new))
print(
    classification_report(
        labels,
        y_new,
        target_names=['Lori', 'Kordi', 'Torki', 'Bandari', 'Gilaki'],
    )
)

In [None]:
#Instructions for building the 3-D plot

# At this point `y_pred` holds the K-Means labels, whereas cluster0..cluster4
# were split on the "Cluster" column that stores the agglomerative labels.
# Rows are therefore selected by K-Means label here, so that this figure shows
# the K-Means result instead of repeating the agglomerative plot.
kmeans_labels = np.asarray(y_pred).ravel()
# The boolean masks below are positional, so labels and rows must line up.
assert len(kmeans_labels) == len(plotX), "K-Means labels and plotX rows differ in length"

# Each entry: (K-Means cluster id, legend name, marker colour).
trace_specs = [
    (0, "Cluster 0", 'rgba(128, 128, 0, 0.8)'),
    (1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
    (2, "Cluster 2", 'rgba(128, 128, 128, 0.8)'),
    (3, "Cluster 3", 'rgba(0, 255, 255, 0.8)'),
    (4, "Cluster 4", 'rgba(145, 30, 180, 0.4)'),
]

data = [
    go.Scatter3d(
        x=plotX.loc[kmeans_labels == cluster_id, "PC1_3d"],
        y=plotX.loc[kmeans_labels == cluster_id, "PC2_3d"],
        z=plotX.loc[kmeans_labels == cluster_id, "PC3_3d"],
        mode="markers",
        name=legend_name,
        marker=dict(color=rgba),
        text=None,
    )
    for cluster_id, legend_name, rgba in trace_specs
]
# Keep the individual trace names bound, as before.
trace1, trace2, trace3, trace4, trace5 = data

title = "Visualizing Clusters in Three Dimensions Using PCA"

# The axes of a 3-D plot are configured under `layout.scene`; top-level
# `xaxis`/`yaxis` entries only apply to 2-D cartesian plots, so the axis
# titles placed there were never shown on this figure.
layout = dict(
    title=title,
    scene=dict(
        xaxis=dict(title='PC1', ticklen=5, zeroline=False),
        yaxis=dict(title='PC2', ticklen=5, zeroline=False),
        zaxis=dict(title='PC3', ticklen=5, zeroline=False),
    ),
    width=1400,
    height=1000,
)

fig = dict(data=data, layout=layout)

iplot(fig)