# Unsupervised Model for Plant Allocation

In [18]:
# Initial imports
import pandas as pd
import hvplot.pandas
import matplotlib.pyplot as plt
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html


### Deliverable 1: Preprocessing the Data for PCA

In [19]:
# Load the Plant_DBcopy.csv dataset.
# The file is decoded as Latin-1 (column names such as "División" carry accents).
# Malformed rows are skipped with a warning instead of aborting the load.
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
# Alternative input kept for reference: "Resources/Base_limpia_definitiva.csv"
file_path = "Resources/Plant_DBcopy.csv"

plants_df = pd.read_csv(
    file_path,
    encoding="latin-1",
    on_bad_lines="warn",
    engine="python",
)
plants_df



  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,División,clase,orden,familia,Hierba,Color,longitud,latitud,tipovegetacionmapa,altitudmapa
0,Magnoliophyta,Liliopsida,Alismatales,Alismataceae,Hierba,Blanca,-110.433889,27.783333,Vegetacion Halofila,9
1,Magnoliophyta,Liliopsida,Alismatales,Alismataceae,Hierba,Blanca,-110.996667,28.111667,Matorral Sarcocaule,79
2,Magnoliophyta,Liliopsida,Alismatales,Alismataceae,Hierba,Blanca,-99.325000,24.441667,Matorral Submontano,271
3,Magnoliophyta,Liliopsida,Alismatales,Alismataceae,Hierba,Blanca,-99.325000,24.441667,Matorral Submontano,271
4,Magnoliophyta,Liliopsida,Alismatales,Alismataceae,Hierba,Blanca,-106.000000,28.500000,Pastizal Natural,1572
...,...,...,...,...,...,...,...,...,...,...
1186,Magnoliophyta,Magnoliopsida,Sapindales,Simaroubaceae,Hierba,Naranja,-100.850000,23.950000,Vegetacion Halofila,1741
1187,Magnoliophyta,Magnoliopsida,Sapindales,Simaroubaceae,Hierba,Naranja,-100.983333,23.466667,Matorral Desertico Microfilo,1913
1188,Magnoliophyta,Magnoliopsida,Violales,Bixaceae,Hierba,Naranja,-110.808610,30.470280,Pastizal Natural,1049
1189,Magnoliophyta,Magnoliopsida,Violales,Bixaceae,Hierba,Naranja,-105.830000,28.580000,Pastizal Natural,1649


In [None]:
# Set the "clase" column aside in its own single-column DataFrame.
plants_df_name = plants_df.filter(items=["clase"], axis="columns")
plants_df_name.head()

In [None]:
# Display the DataFrame as loaded, before any cleaning step is applied.
plants_df

In [None]:
# Discard every row that contains one or more missing values
# (dropna() defaults: axis=0, how="any").
plants_df = plants_df.dropna()
plants_df

In [None]:
# Drop the rows whose "altitudmapa" value is the literal string 'False'.
# (No column is removed here; "altitudmapa" itself is kept as a feature.)
# NOTE(review): the data preview shows numeric altitudes in "altitudmapa", so
# this filter presumably matches nothing — TODO confirm whether it is still needed.
# The frame is rebound instead of mutated in place so the cell is safe to re-run.
false_rows = plants_df.index[plants_df["altitudmapa"] == 'False']
plants_df = plants_df.drop(index=false_rows)
plants_df.head()

In [None]:
# Remove the "familia" column so it is not carried into the feature matrix.
# Rebinding with errors="ignore" (instead of inplace=True) keeps the cell
# re-runnable: a second run no longer raises KeyError for the missing column.
plants_df = plants_df.drop(columns=["familia"], errors="ignore")
plants_df.head()

In [None]:
# Remove the "División" column so it is not carried into the feature matrix.
# Rebinding with errors="ignore" (instead of inplace=True) keeps the cell
# re-runnable: a second run no longer raises KeyError for the missing column.
plants_df = plants_df.drop(columns=["División"], errors="ignore")
plants_df.head()

In [None]:
# One-hot encode the text features with get_dummies(); the remaining columns
# pass through unchanged.
categorical_columns = ["clase", "orden", "Hierba", "Color", "tipovegetacionmapa"]
X = pd.get_dummies(plants_df, columns=categorical_columns)
X

In [None]:
# Standardize every feature column of X with StandardScaler().
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

In [None]:
#pd.plotting.scatter_matrix(X, alpha=0.2, figsize=(30,10))

In [None]:
# Plot the pairwise correlation matrix of the encoded features as a heat map,
# labelling both axes with the feature names.
corr = X.corr()
labels = corr.columns
tick_positions = range(len(labels))

plt.figure(figsize=(30, 10), dpi=80, facecolor='w', edgecolor='k')
corrMat = plt.matshow(corr, fignum=1)
plt.xticks(tick_positions, labels, rotation=90)
plt.yticks(tick_positions, labels)
plt.gca().xaxis.tick_bottom()
plt.colorbar(corrMat)
plt.show()

### Deliverable 2: Reducing Data Dimensions Using PCA

In [None]:
# Use PCA to reduce the feature matrix to four principal components.
pca = PCA(n_components=4)
# NOTE(review): PCA is fitted on the unscaled matrix X; the standardized
# variant (X_scaled) is left disabled on the next line. Large-valued numeric
# columns such as "altitudmapa" will presumably dominate the components —
# TODO confirm that skipping the scaling is intentional.
#X_pca = pca.fit_transform(X_scaled)
X_pca = pca.fit_transform(X)
X_pca

### TODO: Justify the number of principal components — the code keeps four (`n_components=4`), not three ###

In [None]:
# Create a DataFrame with the principal components, indexed like plants_df.
# The "PC n" labels are derived from the width of X_pca so they always match
# the number of components the PCA step produced.
pc_columns = [f"PC {i}" for i in range(1, X_pca.shape[1] + 1)]
pcs_df = pd.DataFrame(data=X_pca, columns=pc_columns, index=plants_df.index)
pcs_df.head(10)

In [None]:
# Show the explained-variance ratio of each fitted principal component.
# (The disabled line below would show the running cumulative sum instead.)
#pca.explained_variance_ratio_.cumsum()
pca.explained_variance_ratio_

In [None]:
# Scatter plot of the first two principal components.
pcs_df.plot.scatter(x="PC 1", y="PC 2")

### Deliverable 3: Clustering Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [None]:
# Collect the K-Means inertia for each candidate number of clusters (1 through 10);
# these values feed the elbow curve used to pick the best K.
k = list(range(1, 11))
inertia = []
for n_clusters in k:
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df)
    inertia.append(km.inertia_)

### TODO: Check why k ranges from 1 to 10 (`range(1, 11)` excludes the upper bound) ###

In [None]:
# Create a DataFrame of (k, inertia) pairs and plot the elbow curve.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

Running K-Means with `k=4`

In [None]:
# Build the K-Means model with four clusters and fit it on the principal
# components (fit() returns the fitted estimator itself).
model = KMeans(n_clusters=4, random_state=0).fit(pcs_df)

# Predict the cluster of every row.
predictions = model.predict(pcs_df)
predictions

In [None]:
# Create a new DataFrame combining the plant features with their principal
# components: plants_df and pcs_df are concatenated column-wise on the shared index.
clustered_df = pd.concat([plants_df, pcs_df], axis=1)

# Add a "PlantsName" column holding the "clase" names saved before preprocessing.
# The column is selected explicitly so that a Series (not a one-column
# DataFrame) is assigned; assignment aligns on the row index.
# The name is not important for clustering, which is why it is added only here.
clustered_df["PlantsName"] = plants_df_name["clase"]

# Add a "Class" column holding the K-Means cluster predictions.
clustered_df["Class"] = predictions

# Print the shape of clustered_df and preview its first rows.
print(clustered_df.shape)
clustered_df.head(10)

### Deliverable 4: Visualizing Plant Names Results

#### 3D-Scatter with Clusters

In [None]:
# Create a 3D scatter of the first three principal components, with colour and
# marker symbol both taken from the predicted cluster ("Class").
# hover_name uses "PlantsName": the previous value, "familia", is dropped from
# plants_df during preprocessing, so it is not a column of clustered_df and
# Plotly Express would reject it.
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    width=800,
    hover_name="PlantsName",
    hover_data=["Hierba"],
)
# Anchor the legend at the top-left corner of the figure.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
# Create a table with the Plant Allocation: render clustered_df as an hvPlot
# table with the sortable and selectable options enabled.
clustered_df.hvplot.table(sortable=True, selectable=True)

In [None]:
# Print the total number of rows in clustered_df.
print(len(clustered_df.index))