In [109]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import datasets
import math

# Setup/get data


Get sklearn's built-in iris data

In [92]:
# Load the bundled iris dataset. The loader returns a Bunch (dict-like) that
# holds the feature matrix in `.data`, the class labels in `.target` and the
# column names in `.feature_names`.
iris_bunch = sklearn.datasets.load_iris()

# Convert to a pandas DataFrame: the four feature columns followed by the
# label as a fifth column named 'target'. Stacking the integer labels next to
# the float features yields an all-float frame.
iris = pd.DataFrame(
    np.column_stack((iris_bunch.data, iris_bunch.target)),
    columns=iris_bunch.feature_names + ['target'],
)

In [93]:
# Count the missing values in each column (summing the boolean NA flags down the rows).
iris.isna().sum(axis=0)

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

Make some values missing

In [94]:
# https://stackoverflow.com/questions/42091018/randomly-insert-nas-values-in-a-pandas-dataframe-with-no-rows-completely-miss
np.random.seed(100)  # fixed seed so the same cells are blanked on every run
# One random True/False per cell of `iris`; True marks a value to replace with NaN.
# (The shape must come from `iris` itself; no other frame exists at this point.)
mask = np.random.choice([True, False], size=iris.shape)
# Where an entire row was selected, un-select its last column so that no row
# ends up completely missing.
mask[mask.all(1), -1] = False
iris = iris.mask(mask)
print(iris[0:3])

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                NaN               NaN                1.4               0.2   
1                4.9               NaN                NaN               NaN   
2                NaN               3.2                NaN               NaN   

   target  
0     0.0  
1     NaN  
2     NaN  


Clean column names

In [112]:
# Normalise the column names in one non-mutating step: spaces become
# underscores and the '_(cm)' unit suffix is dropped, e.g.
# 'sepal length (cm)' -> 'sepal_length'. Names with neither a space nor the
# suffix (such as 'target') pass through unchanged, so re-running this cell
# leaves already-cleaned names alone.
iris = iris.rename(columns=lambda name: name.replace(' ', '_').replace('_(cm)', ''))

## Find missing

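`isna()` and `isnull()` are aliases in pandas and do exactly the same thing. Both exist because pandas DataFrames are modelled on R data frames, where NA and NULL are different concepts. 
<br>However, pandas is also built on numpy, which has neither NA nor NULL; it represents a missing float as NaN.
<br>https://stackoverflow.com/questions/944700/how-can-i-check-for-nan-values
- In pandas, use isna/isnull/notna/notnull.
- In numpy, use `np.isnan`. numpy has no `notnan` function, so negate the result instead: `~np.isnan(...)`.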

In [96]:
target_column = iris['target']

# pandas spelling of the check: pd.isna is the same function as pd.isnull
print(pd.isna(target_column)[0:3])
print("")
# pandas spelling of the negated check: pd.notna is the same function as pd.notnull
print(pd.notna(target_column)[0:3])
print("")
# numpy spelling: only NaN exists there, so np.isnan is the check.
# Left as the last expression so it is shown as the cell output.
np.isnan(target_column)[0:3]

0    False
1     True
2     True
Name: target, dtype: bool

0     True
1    False
2    False
Name: target, dtype: bool



0    False
1     True
2     True
Name: target, dtype: bool

## Make a new column based on missing: make an index column and append

This method is not very good.

Rows are "qualifying" if length differs from width, and length is not missing.

In [97]:
# Work on an independent copy: plain assignment would only create a second
# name for the same DataFrame, so the column added to `iris3` below would
# also appear in `iris`.
iris3 = iris.copy()

In [104]:
# A row qualifies when sepal_length is present and differs from sepal_width.
length_present = pd.notna(iris3['sepal_length'])
length_differs = iris3['sepal_length'] != iris3['sepal_width']

# The boolean Series is deliberately wrapped in a one-element list here; that
# wrapping is what makes it awkward to append as a column afterwards.
qualifying_index = [length_present & length_differs]

This is tricky to do. Because `qualifying_index` is a one-element list that wraps the boolean Series, it cannot be appended to the dataset directly; it first needs to be converted to a one-column Series:

In [105]:
# Build the Series-of-a-list once and reuse it for both displays below.
wrapped_index = pd.Series(qualifying_index)

print("original index:")
print(qualifying_index[0:3])
print("")
print("")
# NOTE: `wrapped_index` is a length-1 object Series whose single element is
# the whole boolean Series; its printed form merely looks like two columns.
print("as a Series: note this has TWO columns!")
print(wrapped_index.loc[0:3])
print("")
print("")
# .loc[0] pulls the inner boolean Series back out; [0:3] shows its first rows.
print("as a Series with only the first column: what we want to append:")
print(wrapped_index.loc[0][0:3])

original index:
[0      False
1       True
2      False
3      False
4      False
       ...  
145     True
146     True
147    False
148    False
149     True
Length: 150, dtype: bool]


as a Series: note this has TWO columns!
0    0      False
1       True
2      False
3      ...
dtype: object


as a Series with only the first column: what we want to append:
0    False
1     True
2    False
dtype: bool


In [108]:
pd.set_option('display.max_rows', 50)

# Unwrap the boolean Series stored at position 0 of the wrapped list and
# attach it as a new column.
# NOTE(review): the column name 'qulaifying' is misspelled; it is kept as-is
# because it is a runtime key that other cells may rely on.
unwrapped_flags = pd.Series(qualifying_index).loc[0]
iris3['qulaifying'] = unwrapped_flags
iris3[0:3]

Unnamed: 0,sepal_length,sepal_width,petal_length_(cm),petal_width,target,qulaifying
0,,,1.4,0.2,0.0,False
1,4.9,,,,,True
2,,3.2,,,,False


## Better way to make a new column based on missing: function method
Use lambda and apply.

Rows are "qualifying" if length differs from width, and length is not missing.

In [78]:
# Work on an independent copy: plain assignment would only create a second
# name for the same DataFrame, so the 'qualifying' column added below would
# also appear in `iris`.
iris2 = iris.copy()
def qualifying(sepal_length,sepal_width):
    """Flag a single row as qualifying (1) or not (0).

    A row qualifies when ``sepal_length`` is not missing and is different
    from ``sepal_width``. The explicit missing check is needed because NaN
    compares as "not equal" to everything, including another NaN.

    Parameters
    ----------
    sepal_length, sepal_width : scalar
        The two values of one row; either may be NaN.

    Returns
    -------
    int
        1 if the row qualifies, otherwise 0.
    """
    # Guard first: a missing length can never qualify.
    if pd.isna(sepal_length):
        return 0
    return 1 if sepal_length != sepal_width else 0
    
# Evaluate `qualifying` once per row (axis=1 passes each row to the lambda)
# and store the resulting 0/1 flags in a new column.
iris2['qualifying'] = iris2.apply(
    lambda row: qualifying(row['sepal_length'], row['sepal_width']),
    axis=1,
)
iris2[0:3]

Unnamed: 0,sepal_length,sepal_width,petal_length_(cm),petal_width,target,qualifying
0,,,1.4,0.2,0.0,0
1,4.9,,,,,1
2,,3.2,,,,0
