In [1]:
# Importing the required libraries for visualization 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os 
os.chdir('data')

# Visualization Prefrences.
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

In [2]:
%pip install xlrd==1.2.0

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## 01- Data Description Section

In [6]:
import os
cwd = os.getcwd()   #get the current working directory
cwd

'/Users/shinichisato/github/Dry-Bean-Dataset/Data'

In [3]:
# Data Retrieving
excel_file_path = "Data/Dry_Bean_Dataset.xlsx"

df = pd.read_excel(excel_file_path,  sheet_name="Dry_Beans_Dataset")
df.head(8).T

FileNotFoundError: [Errno 2] No such file or directory: 'Data/Dry_Bean_Dataset.xlsx'

In [None]:
df.shape

In [None]:
# Extract Descriptive Data.
pd.set_option("display.float", "{:.2f}".format)
df.describe().T

## 02- Exploratory Data Analysis (EDA) & Feature Engineering Section 

In [None]:
# Check for Null Values
df.isna().sum()

In [None]:
df.dtypes

In [None]:
df_class_counts = df.Class.value_counts().to_frame()
print(df_class_counts)
df.Class.value_counts().plot(kind="bar", color='#006C31', figsize=(10,5), title = 'Bean Classes Counts')

In [None]:
# Check the percentage of each class 
df_class_perc = df.Class.value_counts(normalize=True).to_frame()
df_class_perc["Class"] = df_class_perc["Class"] * 100 
df_class_perc.rename(columns = {"Class": "Class%"}, inplace=True)
df_class_perc

In [None]:
# Studying the correlations between features using Heat Map!
corr_matrix = df.corr()
for x in range(corr_matrix.shape[0]):
    corr_matrix.iloc[x,x] = 0.0

fig, ax = plt.subplots(figsize=(16, 10))
ax = sns.heatmap(corr_matrix,
                 annot=True,
                 linewidths=0.5,
                 fmt=".2f",
                 cmap="YlGnBu");
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

In [None]:
# The correlation matrix
corr_mat = df.corr()

# Strip out the diagonal values for the next step
for x in range(corr_mat.shape[0]):
    corr_mat.iloc[x,x] = 0.0
    
corr_mat

In [None]:
# Pairwise maximal correlations 
corr_max  = corr_mat.abs().max().to_frame()
corr_id_max = corr_mat.abs().idxmax().to_frame()

# dataframe aggrigation and processing
pair_features_corr = pd.merge(corr_id_max, corr_max, on = corr_max.index)
pair_features_corr = pair_features_corr.rename(columns = {'key_0':'Feature_one', '0_x':'Feature_two', '0_y':'correlation'})\
                                                .sort_values('correlation', ascending=False)\
                                                .reset_index().drop('index', axis=1)
pair_features_corr


In [None]:
float_columns = [col for col in df.columns if col != 'Class']

sns.set_context('notebook')
sns.pairplot(df[float_columns + ['Class']], 
             hue='Class'
             );
### END SOLUTION

And an examination of the skew values in anticipation of transformations.
- 0 : no skew
- pos : right skew
- neg : left skew

In [None]:
skew_columns = (df
                .skew()
                .sort_values(ascending=False)).to_frame("skewness_value")
skew_columns

In [None]:
skew_columns = (df
                .skew()
                .sort_values(ascending=False)).to_frame("skewness_value")
skew_columns = skew_columns.query('skewness_value > 0.75')
skew_columns

In [None]:
# Perform log transform on skewed columns
for col in skew_columns['skewness_value'].index.tolist():
    df[col] = np.log1p(df[col]) 

In [None]:
skew_trans_columns = (df
                .skew()
                .sort_values(ascending=False)).to_frame("skewness_value")
skew_trans_columns

In [None]:
df.dtypes

In [None]:
df.head(7)

In [None]:
from sklearn.preprocessing import StandardScaler
float_columns = [col for col in df.columns if col != 'Class']
sc = StandardScaler()
df[float_columns] = sc.fit_transform(df[float_columns])
df.head(7)

## 03- Machine Learning Section : clusterning methods 


### 1- K-means Algorithm

In [None]:
### BEGIN SOLUTION
from sklearn.cluster import KMeans

# Create and fit a range of models
km_list = list()

for clust in range(1,21):
    km = KMeans(n_clusters=clust, random_state=42)
    km = km.fit(df[float_columns])
    
    km_list.append(pd.Series({'clusters': clust, 
                              'inertia': km.inertia_,
                              'model': km}))




In [None]:
plot_data = (pd.concat(km_list, axis=1)
             .T
             [['clusters','inertia']]
             .set_index('clusters'))

ax = plot_data.plot(marker='o',ls='-')
ax.set_xticks(range(0,21,2))
ax.set_xlim(0,21)
ax.set(xlabel='Cluster', ylabel='Inertia');
### END SOLUTION

In [None]:
Bean_classes = pd.merge(df_class_counts, df_class_perc, on = df_class_counts.index)
Bean_classes.rename(columns={'key_0':'class_name'}, inplace=True)
Bean_classes

In [None]:
### BEGIN SOLUTION
km = KMeans(n_clusters=8, random_state=42)
km = km.fit(df[float_columns])
df['k-means'] = km.predict(df[float_columns])
df.sample(7)

In [None]:
# Group by Class and K-means for comparsion between clustered classes and actual classes 
(df[['Class','k-means']]
 .groupby(['Class','k-means'])
 .size()
 .to_frame()
 .rename(columns={0:'number'}))
### END SOLUTION


### 2- Agglomerative Algorithm 

In [None]:
from sklearn.cluster import AgglomerativeClustering
### BEGIN SOLUTION
ag = AgglomerativeClustering(n_clusters=7, linkage='ward', compute_full_tree=True)
ag = ag.fit(df[float_columns])
df['agglom'] = ag.fit_predict(df[float_columns])

In [None]:
# First, we import the cluster hierarchy module from SciPy (described above) to obtain the linkage and dendrogram functions.
from scipy.cluster import hierarchy

Z = hierarchy.linkage(ag.children_, method='ward')

fig, ax = plt.subplots(figsize=(15,5))

den = hierarchy.dendrogram(Z, orientation='top', 
                           p=30, truncate_mode='lastp',
                           show_leaf_counts=True, ax=ax,
                           above_threshold_color='blue')
### END SOLUTION

In [None]:
# First, for Agglomerative Clustering:
(df[['Class','agglom']]
 .groupby(['Class','agglom'])
 .size()
 .to_frame()
 .rename(columns={0:'number'}))

### 3- MeanShift Algorithm 


In [None]:
from sklearn.cluster import MeanShift
ms  = MeanShift(bandwidth=2.8, n_jobs=-1) 
ms = ms.fit(df[float_columns])

In [None]:
np.unique(ms.labels_)

In [None]:
df['MeanShift'] = ms.fit_predict(df[float_columns])

In [None]:
(df[['Class','MeanShift']]
 .groupby(['Class','MeanShift'])
 .size()
 .to_frame()
 .rename(columns={0:'number'}))

### 4- DBSCAN Algorithm 


In [None]:
from sklearn.cluster import DBSCAN
dbs = DBSCAN(eps=0.5, min_samples=11, metric='euclidean')
dbs = dbs.fit(df[float_columns])

In [None]:
np.unique(dbs.labels_)

In [None]:
df['dbscan'] = dbs.fit_predict(df[float_columns])

In [None]:
# First, for Agglomerative Clustering:
(df[['Class','dbscan']]
 .groupby(['Class','dbscan'])
 .size()
 .to_frame()
 .rename(columns={0:'number'}))