In [25]:
#Basic imports
import numpy as np
import pandas as pd

#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [26]:
df = pd.read_csv("covtype.csv")
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [27]:
X = df.copy()

In [28]:
X["Distance_To_Hydrology"] = ( (X["Horizontal_Distance_To_Hydrology"] ** 2) + (X["Vertical_Distance_To_Hydrology"] ** 2) ) ** (0.5)

In [29]:
X.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Distance_To_Hydrology
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,5,258.0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,5,212.084889
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,2,275.769832
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,2,269.235956
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,5,153.003268


In [30]:
X.drop(["Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology"], axis=1, inplace=True)

In [31]:
X.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,...,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Distance_To_Hydrology
0,2596,51,3,510,221,232,148,6279,1,0,...,0,0,0,0,0,0,0,0,5,258.0
1,2590,56,2,390,220,235,151,6225,1,0,...,0,0,0,0,0,0,0,0,5,212.084889
2,2804,139,9,3180,234,238,135,6121,1,0,...,0,0,0,0,0,0,0,0,2,275.769832
3,2785,155,18,3090,238,238,122,6211,1,0,...,0,0,0,0,0,0,0,0,2,269.235956
4,2595,45,2,391,220,234,150,6172,1,0,...,0,0,0,0,0,0,0,0,5,153.003268


In [32]:
X['Cover_Type'].replace({1:'Spruce/Fir', 2:'Lodgepole Pine', 3:'Ponderosa Pine', 4:'Cottonwood/Willow', 5:'Aspen', 6:'Douglas-fir', 7:'Krummholz'}, inplace=True)

In [33]:
X.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,...,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Distance_To_Hydrology
0,2596,51,3,510,221,232,148,6279,1,0,...,0,0,0,0,0,0,0,0,Aspen,258.0
1,2590,56,2,390,220,235,151,6225,1,0,...,0,0,0,0,0,0,0,0,Aspen,212.084889
2,2804,139,9,3180,234,238,135,6121,1,0,...,0,0,0,0,0,0,0,0,Lodgepole Pine,275.769832
3,2785,155,18,3090,238,238,122,6211,1,0,...,0,0,0,0,0,0,0,0,Lodgepole Pine,269.235956
4,2595,45,2,391,220,234,150,6172,1,0,...,0,0,0,0,0,0,0,0,Aspen,153.003268


In [34]:
X = pd.get_dummies(X)

In [35]:
X.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,...,Soil_Type39,Soil_Type40,Distance_To_Hydrology,Cover_Type_Aspen,Cover_Type_Cottonwood/Willow,Cover_Type_Douglas-fir,Cover_Type_Krummholz,Cover_Type_Lodgepole Pine,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir
0,2596,51,3,510,221,232,148,6279,1,0,...,0,0,258.0,1,0,0,0,0,0,0
1,2590,56,2,390,220,235,151,6225,1,0,...,0,0,212.084889,1,0,0,0,0,0,0
2,2804,139,9,3180,234,238,135,6121,1,0,...,0,0,275.769832,0,0,0,0,1,0,0
3,2785,155,18,3090,238,238,122,6211,1,0,...,0,0,269.235956,0,0,0,0,1,0,0
4,2595,45,2,391,220,234,150,6172,1,0,...,0,0,153.003268,1,0,0,0,0,0,0


In [36]:
#numer is the DataFrame that holds all of X's numerical variables
numer = X[["Elevation","Aspect","Slope","Horizontal_Distance_To_Roadways","Hillshade_9am","Hillshade_Noon","Hillshade_3pm","Horizontal_Distance_To_Fire_Points","Distance_To_Hydrology"]]

In [37]:
cater = X[["Wilderness_Area1","Wilderness_Area2","Wilderness_Area3","Wilderness_Area4","Soil_Type1","Soil_Type2","Soil_Type3","Soil_Type4","Soil_Type5","Soil_Type6","Soil_Type7","Soil_Type8","Soil_Type9","Soil_Type10","Soil_Type11","Soil_Type12","Soil_Type13","Soil_Type14","Soil_Type15","Soil_Type16","Soil_Type17","Soil_Type18","Soil_Type19","Soil_Type20","Soil_Type21","Soil_Type22","Soil_Type23","Soil_Type24","Soil_Type25","Soil_Type26","Soil_Type27","Soil_Type28","Soil_Type29","Soil_Type30","Soil_Type31","Soil_Type32","Soil_Type33","Soil_Type34","Soil_Type35","Soil_Type36","Soil_Type37","Soil_Type38","Soil_Type39","Soil_Type40","Cover_Type_Aspen","Cover_Type_Cottonwood/Willow","Cover_Type_Douglas-fir","Cover_Type_Krummholz","Cover_Type_Lodgepole Pine","Cover_Type_Ponderosa Pine","Cover_Type_Spruce/Fir"]]

In [38]:
numer.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Distance_To_Hydrology
0,2596,51,3,510,221,232,148,6279,258.0
1,2590,56,2,390,220,235,151,6225,212.084889
2,2804,139,9,3180,234,238,135,6121,275.769832
3,2785,155,18,3090,238,238,122,6211,269.235956
4,2595,45,2,391,220,234,150,6172,153.003268


In [39]:
cater.head()

Unnamed: 0,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,...,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type_Aspen,Cover_Type_Cottonwood/Willow,Cover_Type_Douglas-fir,Cover_Type_Krummholz,Cover_Type_Lodgepole Pine,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [40]:
scaler = StandardScaler()

In [41]:
numer = pd.DataFrame(scaler.fit_transform(numer))

In [42]:
numer.columns = ["Elevation_Scaled","Aspect_Scaled","Slope_Scaled","Horizontal_Distance_To_Roadways_Scaled","Hillshade_9am_Scaled","Hillshade_Noon_Scaled","Hillshade_3pm_Scaled","Horizontal_Distance_To_Fire_Points_Scaled","Distance_To_Hydrology_Scaled"]

In [43]:
X = pd.concat([numer, cater], axis=1, join='inner')

In [44]:
X.head()

Unnamed: 0,Elevation_Scaled,Aspect_Scaled,Slope_Scaled,Horizontal_Distance_To_Roadways_Scaled,Hillshade_9am_Scaled,Hillshade_Noon_Scaled,Hillshade_3pm_Scaled,Horizontal_Distance_To_Fire_Points_Scaled,Distance_To_Hydrology_Scaled,Wilderness_Area1,...,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type_Aspen,Cover_Type_Cottonwood/Willow,Cover_Type_Douglas-fir,Cover_Type_Krummholz,Cover_Type_Lodgepole Pine,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir
0,-1.297805,-0.935157,-1.48282,-1.180146,0.330743,0.439143,0.14296,3.246283,-0.083233,1,...,0,0,0,1,0,0,0,0,0,0
1,-1.319235,-0.89048,-1.616363,-1.257106,0.293388,0.590899,0.221342,3.205504,-0.294777,1,...,0,0,0,1,0,0,0,0,0,0
2,-0.554907,-0.148836,-0.681563,0.532212,0.816364,0.742654,-0.196691,3.126965,-0.001362,1,...,0,0,0,0,0,0,0,1,0,0
3,-0.622768,-0.005869,0.520322,0.474492,0.965786,0.742654,-0.536343,3.194931,-0.031466,1,...,0,0,0,0,0,0,0,1,0,0
4,-1.301377,-0.98877,-1.616363,-1.256464,0.293388,0.540313,0.195215,3.165479,-0.566983,1,...,0,0,0,1,0,0,0,0,0,0


In [45]:
kmeans = KMeans(n_clusters=3)

In [46]:
kmeans.fit(X)

KMeans(n_clusters=3)

In [47]:
clusters = kmeans.predict(X)

AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Load the CSV data into a DataFrame
data = pd.read_csv('covtype.csv')

# Select the features to use for clustering (assuming all columns except the target variable)
features = data.drop('Cover_Type', axis=1)

# Perform t-SNE dimensionality reduction
tsne = TSNE(n_components=2, random_state=42)
reduced_data = tsne.fit_transform(features)

# Get the cluster labels assigned by the clustering algorithm
cluster_labels = data['cluster']

# Create a scatter plot of the reduced data with different colors for each cluster
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=cluster_labels, cmap='viridis')
plt.title('t-SNE Visualization of High-Dimensional Clusters')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.colorbar()
plt.show()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

