In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r'iris.data', header = None)
df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Step 4. Create columns for the dataset

In [3]:
# 1. sepal_length (in cm)
# 2. sepal_width (in cm)
# 3. petal_length (in cm)
# 4. petal_width (in cm)
# 5. class
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Step 5. Is there any missing value in the dataframe?

In [4]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class           0
dtype: int64

# Step 6. Lets set the values of the rows 10 to 29 of the column 'petal_length' to NaN

In [5]:
df.iloc[10:29, 3:4] = np.nan
df.iloc[10:29, 3:4]

Unnamed: 0,petal_width
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,


In [6]:
df.isnull().sum()

sepal_length     0
sepal_width      0
petal_length     0
petal_width     19
class            0
dtype: int64

# Step 7. Good, now lets substitute the NaN values to 1.0

In [7]:
df.fillna(1.0, inplace = True)
df.iloc[10:29]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
10,5.4,3.7,1.5,1.0,Iris-setosa
11,4.8,3.4,1.6,1.0,Iris-setosa
12,4.8,3.0,1.4,1.0,Iris-setosa
13,4.3,3.0,1.1,1.0,Iris-setosa
14,5.8,4.0,1.2,1.0,Iris-setosa
15,5.7,4.4,1.5,1.0,Iris-setosa
16,5.4,3.9,1.3,1.0,Iris-setosa
17,5.1,3.5,1.4,1.0,Iris-setosa
18,5.7,3.8,1.7,1.0,Iris-setosa
19,5.1,3.8,1.5,1.0,Iris-setosa


# Step 8. Now let's delete the column class

In [8]:
df.drop('class', axis = 1, inplace = True)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


# Step 9. Set the first 3 rows as NaN

In [9]:
df[0:3] = np.nan
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,,,,
1,,,,
2,,,,
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


# Step 10. Delete the rows that have NaN

In [10]:
df.dropna(inplace = True)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2


# Step 11. Reset the index so it begins with 0 again

In [11]:
df.reset_index(inplace = True)
df.head()

Unnamed: 0,index,sepal_length,sepal_width,petal_length,petal_width
0,3,4.6,3.1,1.5,0.2
1,4,5.0,3.6,1.4,0.2
2,5,5.4,3.9,1.7,0.4
3,6,4.6,3.4,1.4,0.3
4,7,5.0,3.4,1.5,0.2
