# Analysing the Iris Data Set

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# there is a famous data set here
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

In [42]:
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float') # in a np.array all data members must be the same type
iris_2d # we have a 2-dimensional array of floats, plus some labels
df_iris = pd.read_csv(url) # this assumes there will be a header-row. There is none in this data set
df_iris
# irises have 'sepals', 'petals' and 'stamen' as measured in this data set
iris_2d # the strings have become nan

array([[5.1, 3.5, 1.4, 0.2, nan],
       [4.9, 3. , 1.4, 0.2, nan],
       [4.7, 3.2, 1.3, 0.2, nan],
       [4.6, 3.1, 1.5, 0.2, nan],
       [5. , 3.6, 1.4, 0.2, nan],
       [5.4, 3.9, 1.7, 0.4, nan],
       [4.6, 3.4, 1.4, 0.3, nan],
       [5. , 3.4, 1.5, 0.2, nan],
       [4.4, 2.9, 1.4, 0.2, nan],
       [4.9, 3.1, 1.5, 0.1, nan],
       [5.4, 3.7, 1.5, 0.2, nan],
       [4.8, 3.4, 1.6, 0.2, nan],
       [4.8, 3. , 1.4, 0.1, nan],
       [4.3, 3. , 1.1, 0.1, nan],
       [5.8, 4. , 1.2, 0.2, nan],
       [5.7, 4.4, 1.5, 0.4, nan],
       [5.4, 3.9, 1.3, 0.4, nan],
       [5.1, 3.5, 1.4, 0.3, nan],
       [5.7, 3.8, 1.7, 0.3, nan],
       [5.1, 3.8, 1.5, 0.3, nan],
       [5.4, 3.4, 1.7, 0.2, nan],
       [5.1, 3.7, 1.5, 0.4, nan],
       [4.6, 3.6, 1. , 0.2, nan],
       [5.1, 3.3, 1.7, 0.5, nan],
       [4.8, 3.4, 1.9, 0.2, nan],
       [5. , 3. , 1.6, 0.2, nan],
       [5. , 3.4, 1.6, 0.4, nan],
       [5.2, 3.5, 1.5, 0.2, nan],
       [5.2, 3.4, 1.4, 0.2, nan],
       [4.7, 3

In [18]:
# here we inject some problematic data members
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan # persis some missing values
iris_2d

array([[5.1, 3.5, 1.4, 0.2, nan],
       [4.9, 3. , 1.4, 0.2, nan],
       [4.7, 3.2, 1.3, 0.2, nan],
       [4.6, 3.1, 1.5, nan, nan],
       [5. , 3.6, 1.4, 0.2, nan],
       [5.4, 3.9, 1.7, 0.4, nan],
       [4.6, 3.4, 1.4, 0.3, nan],
       [5. , 3.4, 1.5, 0.2, nan],
       [4.4, 2.9, 1.4, 0.2, nan],
       [4.9, 3.1, 1.5, 0.1, nan],
       [5.4, 3.7, 1.5, 0.2, nan],
       [4.8, 3.4, 1.6, 0.2, nan],
       [4.8, 3. , 1.4, 0.1, nan],
       [4.3, 3. , 1.1, 0.1, nan],
       [5.8, 4. , 1.2, 0.2, nan],
       [5.7, 4.4, 1.5, 0.4, nan],
       [5.4, 3.9, 1.3, 0.4, nan],
       [5.1, 3.5, 1.4, 0.3, nan],
       [5.7, 3.8, 1.7, 0.3, nan],
       [5.1, 3.8, 1.5, 0.3, nan],
       [5.4, 3.4, 1.7, 0.2, nan],
       [5.1, 3.7, 1.5, 0.4, nan],
       [4.6, 3.6, 1. , 0.2, nan],
       [5.1, 3.3, 1.7, 0.5, nan],
       [4.8, 3.4, 1.9, nan, nan],
       [5. , 3. , 1.6, 0.2, nan],
       [5. , 3.4, 1.6, 0.4, nan],
       [5.2, 3.5, 1.5, 0.2, nan],
       [5.2, 3.4, nan, 0.2, nan],
       [nan, 3

In [22]:
# how many nan members exist
np.isnan(iris_2d[:, 0]).sum() # we could derive a strategy to interpolate values for these missing - or just drop them

4

In [35]:
# in this data, column 1 represents stamen, column 2 is sepal and column 3 is petal
# see if there is any correlation betwen data points for sepal and petal
condition = (iris_2d[:,2]>1.5) & (iris_2d[:, 3]<5.0) # these are known delimiters for iris species
iris_2d[condition]
# df_iris[condition] # we miss the top row here...

array([[5.4, 3.9, 1.7, 0.4, nan],
       [4.8, 3.4, 1.6, 0.2, nan],
       [5.7, 3.8, 1.7, 0.3, nan],
       [5.4, 3.4, 1.7, 0.2, nan],
       [5.1, 3.3, 1.7, 0.5, nan],
       [4.8, 3.4, 1.9, 0.2, nan],
       [5. , 3. , 1.6, 0.2, nan],
       [5. , 3.4, 1.6, 0.4, nan],
       [4.7, 3.2, 1.6, 0.2, nan],
       [4.8, 3.1, 1.6, 0.2, nan],
       [5. , 3.5, 1.6, 0.6, nan],
       [5.1, 3.8, 1.9, 0.4, nan],
       [5.1, 3.8, 1.6, 0.2, nan],
       [7. , 3.2, 4.7, 1.4, nan],
       [6.4, 3.2, 4.5, 1.5, nan],
       [6.9, 3.1, 4.9, 1.5, nan],
       [5.5, 2.3, 4. , 1.3, nan],
       [6.5, 2.8, 4.6, 1.5, nan],
       [5.7, 2.8, 4.5, 1.3, nan],
       [6.3, 3.3, 4.7, 1.6, nan],
       [4.9, 2.4, 3.3, 1. , nan],
       [6.6, 2.9, 4.6, 1.3, nan],
       [5.2, 2.7, 3.9, 1.4, nan],
       [5. , 2. , 3.5, 1. , nan],
       [5.9, 3. , 4.2, 1.5, nan],
       [6. , 2.2, 4. , 1. , nan],
       [6.1, 2.9, 4.7, 1.4, nan],
       [5.6, 2.9, 3.6, 1.3, nan],
       [6.7, 3.1, 4.4, 1.4, nan],
       [5.6, 3

In [36]:
# drop all the NaN values (data cleaning)

In [45]:
all_nan = np.array([~np.any(np.isnan(row)) for row in iris_2d])
iris_2d[all_nan]
# iris_2d[np.sum(np.isnan(iris_2d), axis=1)==0][:5]

array([], shape=(0, 5), dtype=float64)

In [50]:
# find any correlation between sepals and petals
np.corrcoef(iris_2d[:, 0], iris_2d[:, 2])[0,1] # yes they are statistically significant

0.8717541573048718