In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Create a DataFrame that has one missing value in every column.
# The string columns use None and the numeric column uses np.nan as the marker.
dict1 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, np.nan, 32],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)

df1 = pd.DataFrame(data=dict1)
df1

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [4]:
# Show the dtype pandas inferred for each column of df1.
df1.dtypes

Names     object
Age      float64
City      object
dtype: object

In [5]:
# Persist the frame to CSV; index=False leaves the row index out of the file.
df1.to_csv('missing_value_data1.csv',index = False)

In [6]:
# Use None (instead of np.nan) as the missing marker in the numeric column.
# pandas stores it as NaN too, so df2 ends up looking the same as df1.
# (The plain Python expression `np.nan == None` is False; they are only
# treated alike by pandas when it builds a numeric column.)
dict2 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, None, 32],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)

df2 = pd.DataFrame(data=dict2)
df2

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [7]:
# Age is still float64 (see output): the None in a numeric column is stored as NaN.
df2.dtypes

Names     object
Age      float64
City      object
dtype: object

In [21]:
# Age now mixes ints, a None and the string '32', so pandas cannot pick a
# numeric dtype and keeps the column as generic Python objects.
dict3 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, None, '32'],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)

df3 = pd.DataFrame(data=dict3)
df3

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [10]:
# Age is object here (see output): the column mixes numbers, None and the string '32'.
df3.dtypes

Names    object
Age      object
City     object
dtype: object

In [22]:
# Demonstration: casting an object column that contains None to int fails.
# The error is caught and displayed so the notebook still runs top to bottom;
# df3 is left unchanged because the assignment never completes.
try:
    df3['Age'] = df3['Age'].astype('int')
except (TypeError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [15]:
# Age mixes ints and numeric strings but has no missing value.
dict4 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, '31', '32'],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)

df4 = pd.DataFrame(data=dict4)
df4

Unnamed: 0,Names,Age,City
0,Tarun,30,
1,Ashish,31,Hyd
2,Suri,31,Pune
3,,32,Blr


In [16]:
# Age is object here as well (see output): the column mixes ints and numeric strings.
df4.dtypes

Names    object
Age      object
City     object
dtype: object

In [17]:
# No missing values in this column, so every element (ints and numeric strings) casts to int.
df4['Age'] = df4['Age'].astype('int')

In [18]:
# Confirm that Age now has an integer dtype.
df4.dtypes

Names    object
Age       int32
City     object
dtype: object

- If you want to change the datatype of a column, use `astype`

- But if the column has missing values (`None` or `np.nan`), `astype('int')` will fail

In [23]:
# Same data as df3, but the missing age is np.nan instead of None.
dict5 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, np.nan, '32'],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)

df5 = pd.DataFrame(data=dict5)
df5

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [25]:
# Demonstration: casting a column that contains np.nan to int fails.
# The error is caught and displayed so the notebook still runs top to bottom;
# df5 is left unchanged because the assignment never completes.
try:
    df5['Age'] = df5['Age'].astype('int')
except (TypeError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")

ValueError: cannot convert float NaN to integer

In [27]:
# df5 : ValueError: cannot convert float NaN to integer   
# df3: TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

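# np.nan is a float, so int() sees a float it cannot represent -> ValueError
# None is not a string, bytes or number, so int() rejects its type -> TypeError
# NoneType is the type of the None object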

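- `np.nan` is a special float value ("not a number"); it is the usual missing-value marker in numeric columns

- `None` is Python's generic "no value" object (its type is `NoneType`); it is not a str, bytes, float or int, which is exactly what the `TypeError` above reports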

**Method-1**

In [48]:
# Rebuild df1 for the fill examples: one missing value per column, and an
# Age column that mixes numbers, np.nan and the string '32'.
dict1 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, np.nan, '32'],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)

df1 = pd.DataFrame(data=dict1)
df1

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [4]:
# Boolean mask with True wherever a value is missing (None or NaN).
df1.isnull()

Unnamed: 0,Names,Age,City
0,False,False,True
1,False,False,False
2,False,True,False
3,True,False,False


In [5]:
# isna() produces the same mask as isnull() (compare the two outputs).
df1.isna()

Unnamed: 0,Names,Age,City
0,False,False,True
1,False,False,False
2,False,True,False
3,True,False,False


In [49]:
df1.isnull().sum()
# Column-wise count of missing values (each True in the mask counts as 1).

Names    1
Age      1
City     1
dtype: int64

In [50]:
# Percentage of missing values per column.
# Example: the first column has 4 values and 1 of them is missing,
# so the percentage is 1 / 4 = 25 %.
df1.isnull().sum().mul(100).div(len(df1))


Names    25.0
Age      25.0
City     25.0
dtype: float64

In [11]:
# Replace every missing value in the frame with the same constant, whatever the column.
# fillna returns a new frame; df1 itself is not modified.
df1.fillna(40)

Unnamed: 0,Names,Age,City
0,Tarun,30,40
1,Ashish,31,Hyd
2,Suri,40,Pune
3,40,32,Blr


In [16]:
# Fill only the 'Names' column; the result is a new Series and df1 is left unchanged.
df1['Names'].fillna('Aymen')

# A dict passed as `value` gives each listed column its own replacement,
# e.g. {'Age': 20}; that form is shown in a later cell.

0     Tarun
1    Ashish
2      Suri
3     Aymen
Name: Names, dtype: object

In [46]:
# Per-column fill values: only 'Age' is listed, so only missing ages are replaced.
values = {'Age':20}
df1.fillna(value = values)

Unnamed: 0,Names,Age,City
0,Tarun,30,
1,Ashish,20,Hyd
2,Suri,20,Pune
3,,32,Blr


In [19]:
df1['City'].fillna('Chennai')

# fillna returns a new object; df1 is unchanged until the result is assigned back,
# e.g. df1['City'] = df1['City'].fillna('Chennai')

0    Chennai
1        Hyd
2       Pune
3        Blr
Name: City, dtype: object

**Method-2**

- We have some methods

    - bfill
 
    - ffill
 
    - backfill
 
    - pad

In [23]:
# Backward fill: each missing value takes the next valid value below it in the same column.
# DataFrame.bfill() replaces fillna(method='bfill'); the `method` argument of
# fillna is deprecated in recent pandas releases and triggers a FutureWarning.
df1.bfill()

  df1.fillna(method='bfill')


Unnamed: 0,Names,Age,City
0,Tarun,30,Hyd
1,Ashish,31,Hyd
2,Suri,32,Pune
3,,32,Blr


- bfill means backward fill

- It will fill a missing value with the next valid value (the one below it when axis = 0)

In [24]:
import warnings
warnings.filterwarnings('ignore')

In [25]:
# Backward fill again, using DataFrame.bfill() instead of the deprecated
# fillna(method='bfill') spelling.
df1.bfill()

Unnamed: 0,Names,Age,City
0,Tarun,30,Hyd
1,Ashish,31,Hyd
2,Suri,32,Pune
3,,32,Blr


In [26]:
# Forward fill: each missing value takes the last valid value above it in the same column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df1.ffill()

Unnamed: 0,Names,Age,City
0,Tarun,30,
1,Ashish,31,Hyd
2,Suri,31,Pune
3,Suri,32,Blr


In [27]:
# 'backfill' is the legacy alias of 'bfill' and gives the same result;
# DataFrame.bfill() is the supported spelling for both.
df1.bfill()

Unnamed: 0,Names,Age,City
0,Tarun,30,Hyd
1,Ashish,31,Hyd
2,Suri,32,Pune
3,,32,Blr


In [28]:
# 'pad' is the legacy alias of 'ffill' and gives the same result;
# DataFrame.ffill() is the supported spelling for both.
df1.ffill()

Unnamed: 0,Names,Age,City
0,Tarun,30,
1,Ashish,31,Hyd
2,Suri,31,Pune
3,Suri,32,Blr


- backfill = bfill

- ffill = pad

- One more argument is available: axis

- By default axis = 0 (values are filled down each column)

- Can you give axis = 1 and repeat bfill and ffill? (values are then filled across each row)

In [29]:
# Backward fill across each row: a missing value takes the next valid value to its right.
# DataFrame.bfill(axis=1) replaces the deprecated fillna(method='bfill', axis=1).
df1.bfill(axis=1)

Unnamed: 0,Names,Age,City
0,Tarun,30,
1,Ashish,31,Hyd
2,Suri,Pune,Pune
3,32,32,Blr


In [30]:
# Forward fill across each row: a missing value takes the last valid value to its left.
# DataFrame.ffill(axis=1) replaces the deprecated fillna(method='ffill', axis=1).
df1.ffill(axis=1)

Unnamed: 0,Names,Age,City
0,Tarun,30,30
1,Ashish,31,Hyd
2,Suri,Suri,Pune
3,,32,Blr


**Method-3**

- So far we have been filling the missing values with arbitrary values

- We can instead fill them in a specific way, using a statistic of the column

- Mean

- Median 

- Mode

In [32]:
# Rebuild df1 with a purely numeric Age column (one np.nan) so that the
# column statistics used below can be computed.
dict1 = dict(
    Names=['Tarun', 'Ashish', 'Suri', None],
    Age=[30, 31, np.nan, 32],
    City=[None, 'Hyd', 'Pune', 'Blr'],
)

df1 = pd.DataFrame(data=dict1)
df1

Unnamed: 0,Names,Age,City
0,Tarun,30.0,
1,Ashish,31.0,Hyd
2,Suri,,Pune
3,,32.0,Blr


In [35]:
# Mean of the non-missing ages (pandas skips NaN by default).
mean_age = df1['Age'].mean()

In [38]:
# Fill the missing age with the column mean; this returns a new Series and leaves df1 unchanged.
df1['Age'].fillna(mean_age)


0    30.0
1    31.0
2    31.0
3    32.0
Name: Age, dtype: float64

**KNN imputer**

- KNN means: K nearest Neighbours

- Where K is a number

- Instead of taking the average of all the numbers

- It considers K neighbours

- How can we know about the neighbours

- Based on distance metric we will calculate the distances

- User will decide how many neighbours need to choose

- k = hyper parameter

- if k = 5, it will take the 5 nearest neighbours, which means

- the 5 observations that have the smallest distance

- it will calculate the average of those observations' values

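- Package name: scikit-learn (imported as `sklearn`)

    - Module name: `sklearn.preprocessing`

        - class names: StandardScaler (SS), MinMaxScaler (MMS), PowerTransformer (PT)

    - Module name: `sklearn.impute`

        - class name: KNNImputer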

In [42]:
# Impute the missing age with KNNImputer using its default settings.
# NOTE(review): only the single 'Age' column is passed, so there are no other
# features to measure distance on; the imputed 31.0 in the output equals the
# column mean. Confirm this fallback against the scikit-learn documentation.
from sklearn.impute import KNNImputer
ki = KNNImputer()
ki.fit_transform(df1[['Age']])

array([[30.],
       [31.],
       [31.],
       [32.]])

In [59]:
import numpy as np
from sklearn.impute import KNNImputer

# Toy matrix: 8 samples x 4 features, with a few entries missing (np.nan).
X = [
    [1, 2, np.nan, 4],
    [3, 4, 3, np.nan],
    [np.nan, 6, 5, 3],
    [np.nan, 8, 7, 2],
    [4, 8, 7, 2],
    [6, 8, 7, 2],
    [8, 8, 7, 2],
    [8, 8, 7, 2],
]

# Impute every missing entry from the 3 nearest rows.
imputer = KNNImputer(n_neighbors=3)
imputer.fit_transform(X)

array([[1., 2., 5., 4.],
       [3., 4., 3., 3.],
       [6., 6., 5., 3.],
       [6., 8., 7., 2.],
       [4., 8., 7., 2.],
       [6., 8., 7., 2.],
       [8., 8., 7., 2.],
       [8., 8., 7., 2.]])

In [60]:
 a = [1, 2, 3]

b = (a,)

a.append(4)

print(b)

([1, 2, 3, 4],)


In [62]:
x = 5
y = 10
z = 0

# Add y when x > y, otherwise add x.
# The older spelling `x > y and y or x` parses as `((x > y) and y) or x` and
# silently returns x whenever y is falsy (0, '', None), even if x > y.
# The conditional expression has no such trap.
z += y if x > y else x
print(z)

5


In [69]:
def decorator(func):
    """Return a wrapper that calls ``func`` and adds 1 to its result.

    Args:
        func: Any callable whose return value supports ``+ 1``.

    Returns:
        A callable that accepts the same arguments as ``func``.
    """
    import functools  # local import keeps this cell self-contained

    # functools.wraps copies __name__, __doc__, etc. from ``func`` so the
    # decorated function still identifies itself correctly.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs) + 1

    return wrapper

 

@decorator
def add(x, y):
    """Return the sum of ``x`` and ``y`` (before the decorator's wrapper adds 1)."""
    total = x + y
    return total

 

# add(2, 3) computes 5 and the decorator's wrapper adds 1, so this prints 6.
result = add(2, 3)

print(result)

6


In [67]:
def my_generator():
    """Yield the integers 1, 2 and 3, in that order."""
    for value in (1, 2, 3):
        yield value

# sum() consumes the generator: 1 + 2 + 3 = 6. After this, `gen` is exhausted.
gen = my_generator()

result = sum(gen)

print(result)

6
