# From raw data to data understanding

In [None]:
# imports
import numpy as np
import pandas as pd
import pathlib

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE

## Load data

In [None]:
# Read the Titanic passenger table from disk; the PassengerId column is used
# as the row index instead of a generated 0..n-1 range.
data_titanic = pd.read_csv(
    filepath_or_buffer="titanic.csv",
    index_col="PassengerId",
)

In [None]:
# Preview the first five rows to get a feel for the columns and their values.
data_titanic.head(n=5)

## Data description and exploration

### Amount of data

In [None]:
# Report the table dimensions: rows are samples, columns are features.
n_samples = data_titanic.shape[0]
n_features = data_titanic.shape[1]
print(
    f"Number of samples:  {n_samples}",
    f"Number of features: {n_features}",
    sep="\n",
)

### Individual features

1) For data description, look for:
* Value types
* Coding schemes
* (Missing values)

2) For data exploration, visualize box plots / histograms.

#### Sex

In [None]:
# Tabulate how often each distinct "Sex" value occurs (reveals the coding
# scheme). dropna=False keeps missing entries (NaN), if any, as their own row
# instead of silently discarding them, so missing values show up in this check.
data_titanic["Sex"].value_counts(dropna=False)

In [None]:
# Histogram of the "Sex" column; the column is cast to a categorical dtype
# before it is handed to seaborn.
sex_as_category = data_titanic["Sex"].astype("category")
sns.histplot(sex_as_category)

#### Age

In [None]:
# Tabulate how often each distinct "Age" value occurs. dropna=False keeps
# missing entries (NaN), if any, as their own row instead of silently
# discarding them, so the amount of missing data is visible in this check.
data_titanic["Age"].value_counts(dropna=False)

In [None]:
# Histogram of the "Age" column to show how the values are distributed.
age_values = data_titanic["Age"]
sns.histplot(age_values)

## Structure between samples

### Create sample data

In [None]:
# Build a toy 2-D data set: four Gaussian clusters with identity covariance,
# 200 points each, centred on the points listed in `cluster_centers`.
# A local, seeded generator makes the data (and the figures derived from it)
# identical on every run without touching NumPy's global random state.
rng = np.random.default_rng(seed=0)
n_per_cluster = 200
cluster_centers = [[0, 0], [0, 10], [5, 5], [12, 5]]
unit_cov = np.eye(2)

# Stand-alone cluster at the origin. It is not referenced by the cells visible
# here; it is kept so that any other cell relying on `x` keeps working.
x = rng.multivariate_normal([0, 0], unit_cov, size=n_per_cluster)

# Sample each cluster directly around its centre and stack them row-wise,
# giving an array of shape (4 * n_per_cluster, 2).
X = np.concatenate([
    rng.multivariate_normal(center, unit_cov, size=n_per_cluster)
    for center in cluster_centers
])

# Integer cluster label (1..4) per row of X, in the same order as the clusters
# were stacked; used to colour the scatter plots.
colors = np.repeat(np.arange(1, len(cluster_centers) + 1), n_per_cluster)


### Visualize individual axes

In [None]:
# Look at the first coordinate only: it goes on the horizontal axis, while the
# vertical position is uniform random jitter that just spreads the points out.
# Points are coloured by `colors`.
first_axis = X[:, 0]
jitter = np.random.random(len(X))
plt.scatter(first_axis, jitter, c=colors)

### Dimensionality reduction

In [None]:
# Reduce the 2-D samples to a single t-SNE component, then plot that component
# on the horizontal axis against uniform random vertical jitter, coloured by
# `colors`.
tsne = TSNE(n_components=1)
emb = tsne.fit_transform(X)
jitter = np.random.random(len(X))
plt.scatter(emb, jitter, c=colors)