# Data Slicing

Dataset used: [Iris](https://archive.ics.uci.edu/ml/datasets/Iris).

### 1. Clean Up

In [1]:
import numpy as np
import pandas as pd

In [3]:
#%config Completer.use_jedi = False

In [5]:
#!pip install --upgrade jedi==0.17.2

In [32]:
# header=None: the first line of the file is treated as data, not as column names.
df = pd.read_csv("iris/iris.data", header=None)

In [34]:
# Replace the positional column labels with descriptive names.
df.columns = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "class",
]

In [35]:
# List the distinct labels present in the class column.
df["class"].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [36]:
# Map each raw label to its short form; values not listed in the mapping are left as-is.
df["class"] = df["class"].replace(
    to_replace={
        "Iris-setosa": "setosa",
        "Iris-versicolor": "versicolor",
        "Iris-virginica": "virginica",
    }
)

In [37]:
# Preview the first five rows after renaming columns and labels.
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [38]:
# Save the cleaned frame as comma-separated text with a header row (both are the
# to_csv defaults) and without writing the row index.
df.to_csv("iris/iris.csv", index=False)

### 2. Exercise

> Load the data using Pandas and then write a function that outputs the descriptive stats for each numeric feature while the categorical variable is held fixed. Run this function for each of the four numeric variables in the Iris data set.

In [39]:
# Reload the cleaned file; its first line supplies the column names.
df = pd.read_csv("iris/iris.csv")

In [40]:
# Preview the first five rows of the reloaded data.
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [41]:
def slice_iris(df, feature, class_col="class"):
    """Print the mean and standard deviation of one feature for each class.

    The frame is sliced once per distinct value of ``class_col`` (in order of
    first appearance), and the statistics of ``feature`` within that slice are
    printed. A single blank line is printed after the last class.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both the ``feature`` and ``class_col`` columns.
    feature : str
        Name of the numeric column to summarise.
    class_col : str, default "class"
        Name of the categorical column that is held fixed for each slice.

    Returns
    -------
    None
        The report is written to stdout, formatted to four decimal places.

    Raises
    ------
    KeyError
        If ``feature`` or ``class_col`` is not a column of ``df``.

    Notes
    -----
    For a tabular view of the same statistics, see
    ``df.groupby(class_col)[feature].mean()`` and
    ``df.groupby(class_col)[feature].std()``.
    """
    labels = df[class_col]
    for cls in labels.unique():
        # Select the feature values belonging to the current class only.
        values = df.loc[labels == cls, feature]
        print(f"Class: {cls}")
        print(f"{feature} mean: {values.mean():.4f}")
        print(f"{feature} stddev: {values.std():.4f}")
    # One blank line after the whole report separates consecutive calls.
    print()

In [42]:
# Report the per-class statistics for each of the four numeric features in turn.
for feature in ("sepal_length", "sepal_width", "petal_length", "petal_width"):
    slice_iris(df, feature)

Class: setosa
sepal_length mean: 5.0060
sepal_length stddev: 0.3525
Class: versicolor
sepal_length mean: 5.9360
sepal_length stddev: 0.5162
Class: virginica
sepal_length mean: 6.5880
sepal_length stddev: 0.6359

Class: setosa
sepal_width mean: 3.4180
sepal_width stddev: 0.3810
Class: versicolor
sepal_width mean: 2.7700
sepal_width stddev: 0.3138
Class: virginica
sepal_width mean: 2.9740
sepal_width stddev: 0.3225

Class: setosa
petal_length mean: 1.4640
petal_length stddev: 0.1735
Class: versicolor
petal_length mean: 4.2600
petal_length stddev: 0.4699
Class: virginica
petal_length mean: 5.5520
petal_length stddev: 0.5519

Class: setosa
petal_width mean: 0.2440
petal_width stddev: 0.1072
Class: versicolor
petal_width mean: 1.3260
petal_width stddev: 0.1978
Class: virginica
petal_width mean: 2.0260
petal_width stddev: 0.2747



In [46]:
# Per-class mean of every remaining column, as a single table.
df.groupby(by="class").mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.418,1.464,0.244
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [47]:
# Per-class standard deviation of every remaining column, as a single table.
df.groupby(by="class").std()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,0.35249,0.381024,0.173511,0.10721
versicolor,0.516171,0.313798,0.469911,0.197753
virginica,0.63588,0.322497,0.551895,0.27465
