# Data exploration

Explore the housing data that was passed by the previous step.

For illustrative purposes we use Vaex (https://github.com/vaexio/vaex) instead of the familiar `pandas` to represent our dataframes. The packages `orchest`, `matplotlib` and `sklearn` are included by default, i.e. you don't have to install them yourself (using `pip`, for example). Vaex is not one of them: if you explore the *Environments* in the left pane menu, you will see that we installed Vaex in the setup script.

In [None]:
from matplotlib import pyplot as plt
import orchest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import vaex as vx

In [None]:
# Fetch what the previous pipeline step sent us. The inputs are looked up by
# name; the "data" entry holds a (features, target) pair of pandas dataframes.
inputs = orchest.get_inputs()
df_data, df_target = inputs["data"]

# Wrap each pandas dataframe in its vaex equivalent for the rest of the
# notebook.
data = vx.from_pandas(df_data)
target = vx.from_pandas(df_target)

In [None]:
# Preview the first rows of the feature dataframe.
data.head()

In [None]:
# Display vaex's overview of the feature dataframe and its columns.
data.info()

In [None]:
# Show summary statistics for every feature column.
data.describe()

In [None]:
# Plot the counts for the different features to see how they are
# distributed, one histogram per feature.
plt.style.use('ggplot')

# Lay the histograms out three per row. Keep at least three rows (the
# familiar 3x3 layout) and add rows when there are more than nine
# features, so that indexing into the axes grid can never go out of range.
columns = data.column_names
n_cols = 3
n_rows = max(3, (len(columns) + n_cols - 1) // n_cols)

# Scale the figure height with the number of rows so that extra rows do
# not squash the individual plots (3 rows -> the original 15x10 figure).
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 10 * n_rows / 3))

for i, col in enumerate(columns):
    # Make the i-th cell of the grid the current axes; plot1d draws on it.
    plt.sca(axs[i // n_cols][i % n_cols])
    data.plot1d(col, shape=64)

plt.show()

### PCA
Let's use PCA to reduce the number of features to two, then plot the two resulting components against the target feature.


In [None]:
# PCA needs the features on a comparable scale, so standardize them first.
scaler = StandardScaler()
X = scaler.fit_transform(data.values)

# The target values are left untouched.
y = target.values

In [None]:
# Fit PCA on the scaled features and project them onto the leading
# principal components.
n_components = 2
pca = PCA(n_components=n_components)
components = pca.fit_transform(X)

In [None]:
# 3D scatter plot: the two principal components span the horizontal plane
# and the target feature is drawn on the vertical axis.
plt.style.use('default')
fig = plt.figure(figsize=(20, 15))
ax = fig.add_subplot(1, 1, 1, projection='3d')

first_component = components[:, 0]
second_component = components[:, 1]
ax.scatter(first_component, second_component, y)

# Restrict the visible range of the first component.
ax.set_xlim([-10, 10])

# Label the axes.
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_zlabel('Target')

plt.show()