### Data Cleaning

#### Data Loading and Rename

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

# Load the raw real-estate sheet into a DataFrame. As the displayed output
# shows, the sheet mixes genuine NaNs with placeholder strings such as '__'
# and 'na', which pandas does not treat as missing on load.
df = pd.read_excel('real-estate.xlsx')
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,__
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,y,1,1,
8,100009000.0,215.0,TREMONT,y,na,2,1800


#### Rename columns with inplace


In [213]:
# Rename the bedroom/bath columns in place (no new frame is created).
df.rename({"NUM_BEDROOMS": "BEDROOMS", "NUM_BATH": "BATH"}, axis="columns", inplace=True)
df.head()

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,__
2,100003000.0,,LEXINGTON,N,,1.0,850
3,100004000.0,201.0,BERKELEY,12,1.0,,700
4,,203.0,BERKELEY,Y,3.0,2.0,1600


#### Check the NaN Values

In [214]:
# True when at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

True

In [215]:
# Total number of missing cells across the whole frame.
df.isna().to_numpy().sum()

8

In [216]:
# Per-column count of missing values, restricted to columns that have any.
null_columns = df.columns[df.isna().any()]
df.isna()[null_columns].sum()

PID             1
ST_NUM          2
OWN_OCCUPIED    1
BEDROOMS        2
BATH            1
SQ_FT           1
dtype: int64

In [217]:
# First rows that contain at least one missing value, limited to the
# columns that have missing values (single .loc instead of chained indexing).
print(df.loc[df.isnull().any(axis=1), null_columns].head())

           PID  ST_NUM OWN_OCCUPIED BEDROOMS    BATH SQ_FT
2  100003000.0     NaN            N      NaN       1   850
3  100004000.0   201.0           12        1     NaN   700
4          NaN   203.0            Y        3       2  1600
5  100006000.0   207.0            Y      NaN       1   800
6  100007000.0     NaN          NaN        2  HURLEY   950


In [218]:
# Fill the missing PID with a specific value (column-wise).
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df['PID'] operates on an intermediate Series and, with pandas
# copy-on-write semantics, leaves df itself unchanged.
df['PID'] = df['PID'].fillna(100005000)
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,__
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,y,1,1,
8,100009000.0,215.0,TREMONT,y,na,2,1800


In [219]:
# Fill the two missing street numbers (rows 2 and 6) with hand-picked values.
df.loc[[2, 6], 'ST_NUM'] = [197, 208]
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,__
2,100003000.0,197.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,208.0,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,y,1,1,
8,100009000.0,215.0,TREMONT,y,na,2,1800


In [220]:
# Unwanted value treatment: an OWN_OCCUPIED entry that parses as a number
# (e.g. 12) is not a usable ownership flag, so it is replaced with NaN.
# The boolean mask shares the column's index, so the rows selected by .loc
# are correct even when the frame does not carry a default 0..n-1 index
# (a manual position counter used as a label would not be).
numeric_entries = pd.to_numeric(df['OWN_OCCUPIED'], errors='coerce').notna()
df.loc[numeric_entries, 'OWN_OCCUPIED'] = np.nan
df


Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,__
2,100003000.0,197.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,208.0,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,y,1,1,
8,100009000.0,215.0,TREMONT,y,na,2,1800


In [221]:
# Convert the numeric-looking columns to numbers; entries that cannot be
# parsed (such as 'na', '__' or 'HURLEY') become NaN.
df['BEDROOMS'] = df['BEDROOMS'].pipe(pd.to_numeric, errors='coerce')
df['BATH'] = df['BATH'].pipe(pd.to_numeric, errors='coerce')
df['SQ_FT'] = df['SQ_FT'].pipe(pd.to_numeric, errors='coerce')
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,,2.0,,950.0
7,100008000.0,213.0,TREMONT,y,1.0,1.0,
8,100009000.0,215.0,TREMONT,y,,2.0,1800.0


In [222]:
# Fill the missing OWN_OCCUPIED values with the column's mode.
# mode() returns a Series (ties are possible), so take its first entry.
# Assign back instead of chained fillna(..., inplace=True), which acts on an
# intermediate Series and does not update df under copy-on-write.
df['OWN_OCCUPIED'] = df['OWN_OCCUPIED'].fillna(df['OWN_OCCUPIED'].mode()[0])
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,y,1.0,1.0,
8,100009000.0,215.0,TREMONT,y,,2.0,1800.0


In [223]:
# Median square footage for each bedroom count.
df['SQ_FT'].groupby(df['BEDROOMS']).median()

BEDROOMS
1.0     700.0
2.0     950.0
3.0    1300.0
Name: SQ_FT, dtype: float64

In [224]:
# Fill missing SQ_FT with the median of rows that share the same BEDROOMS
# value; existing values win, gaps are taken from the group medians.
df['SQ_FT'] = df['SQ_FT'].combine_first(df.groupby('BEDROOMS')['SQ_FT'].transform('median'))
# Anything still missing falls back to the overall median, computed after
# the group-wise fill above.
df['SQ_FT'] = df['SQ_FT'].fillna(df['SQ_FT'].median())
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,1300.0
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,y,1.0,1.0,700.0
8,100009000.0,215.0,TREMONT,y,,2.0,1800.0


In [225]:
# Fill the remaining missing BEDROOMS values (rows 2, 5 and 8) by hand.
df.loc[[2, 5, 8], 'BEDROOMS'] = [1, 1, 3]
df


Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,1300.0
2,100003000.0,197.0,LEXINGTON,N,1.0,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,1.0,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,y,1.0,1.0,700.0
8,100009000.0,215.0,TREMONT,y,3.0,2.0,1800.0


In [226]:
# Forward fill: each missing BATH value takes the last valid value above it.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases.
df['BATH'] = df['BATH'].ffill()
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,1300.0
2,100003000.0,197.0,LEXINGTON,N,1.0,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,1.0,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,1.0,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,1.0,950.0
7,100008000.0,213.0,TREMONT,y,1.0,1.0,700.0
8,100009000.0,215.0,TREMONT,y,3.0,2.0,1800.0


In [227]:
# Type conversion: cast the cleaned numeric columns to int64.
# NOTE(review): astype('int64') drops the fractional part, so the BATH value
# 1.5 becomes 1 in the output below; confirm that this is intended.
for int_column in ('PID', 'ST_NUM', 'BEDROOMS', 'BATH', 'SQ_FT'):
    df[int_column] = df[int_column].astype('int64')
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,PUTNAM,Y,3,1,1000
1,100002000,197,LEXINGTON,N,3,1,1300
2,100003000,197,LEXINGTON,N,1,1,850
3,100004000,201,BERKELEY,Y,1,1,700
4,100005000,203,BERKELEY,Y,3,2,1600
5,100006000,207,BERKELEY,Y,1,1,800
6,100007000,208,WASHINGTON,Y,2,1,950
7,100008000,213,TREMONT,y,1,1,700
8,100009000,215,TREMONT,y,3,2,1800


### Data Transformation

#### Why Data Transformation Is Needed

In [228]:
# Without transformation, the model may be biased
# Scaling the data increases the performance of the model
# The run time of the model is decreased

#### Label Encoding

In [229]:
# Label encoding is a way of processing categorical data: every unique value
# of the categorical variable is assigned its own integer code.
# Note: this differs from dummy variables, which are what one-hot encoding
# produces (one 0/1 column per category).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df['ST_NAME'])
df['ST_NAME'] = le.transform(df['ST_NAME'])
df


Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,Y,3,1,1000
1,100002000,197,1,N,3,1,1300
2,100003000,197,1,N,1,1,850
3,100004000,201,0,Y,1,1,700
4,100005000,203,0,Y,3,2,1600
5,100006000,207,0,Y,1,1,800
6,100007000,208,4,Y,2,1,950
7,100008000,213,3,y,1,1,700
8,100009000,215,3,y,3,2,1800


#### Mapping Function

In [230]:
# A mapping (lookup table) converts categorical data to numerical data, and
# can equally be used to go from numerical to categorical.
# Both 'Y' and 'y' map to 1; values missing from the table become NaN.
mapping = dict(Y=1, N=2, y=1)
df['OWN_OCCUPIED'] = df['OWN_OCCUPIED'].map(mapping)
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000
1,100002000,197,1,2,3,1,1300
2,100003000,197,1,2,1,1,850
3,100004000,201,0,1,1,1,700
4,100005000,203,0,1,3,2,1600
5,100006000,207,0,1,1,1,800
6,100007000,208,4,1,2,1,950
7,100008000,213,3,1,1,1,700
8,100009000,215,3,1,3,2,1800


#### One hot Encoder

In [231]:
# One-hot encoding: replace OWN_OCCUPIED with one indicator column per
# distinct value (OWN_OCCUPIED_1, OWN_OCCUPIED_2).
# dtype=int pins the indicators to 0/1 integers; pandas >= 2.0 would
# otherwise emit True/False booleans instead of the 0/1 shown below.
df1 = pd.get_dummies(df, columns=['OWN_OCCUPIED'], dtype=int)
df1

Unnamed: 0,PID,ST_NUM,ST_NAME,BEDROOMS,BATH,SQ_FT,OWN_OCCUPIED_1,OWN_OCCUPIED_2
0,100001000,104,2,3,1,1000,1,0
1,100002000,197,1,3,1,1300,0,1
2,100003000,197,1,1,1,850,0,1
3,100004000,201,0,1,1,700,1,0
4,100005000,203,0,3,2,1600,1,0
5,100006000,207,0,1,1,800,1,0
6,100007000,208,4,2,1,950,1,0
7,100008000,213,3,1,1,700,1,0
8,100009000,215,3,3,2,1800,1,0


#### Scaling
##### Min Max Normalizer

In [233]:
# Min-max normalisation: rescale every column to the [0, 1] range.
# NOTE(review): this also scales PID, which looks like an identifier rather
# than a feature; confirm whether it should be excluded.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
scaled

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,0.0,0.0,0.5,0.0,1.0,0.0,0.272727
1,0.125,0.837838,0.25,1.0,1.0,0.0,0.545455
2,0.25,0.837838,0.25,1.0,0.0,0.0,0.136364
3,0.375,0.873874,0.0,0.0,0.0,0.0,0.0
4,0.5,0.891892,0.0,0.0,1.0,1.0,0.818182
5,0.625,0.927928,0.0,0.0,0.0,0.0,0.090909
6,0.75,0.936937,1.0,0.0,0.5,0.0,0.227273
7,0.875,0.981982,0.75,0.0,0.0,0.0,0.0
8,1.0,1.0,0.75,0.0,1.0,1.0,1.0


#### Standard Scaler

In [238]:
# Standardisation: centre every column on its mean and divide by its
# standard deviation.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(df)
df_scaledstd = pd.DataFrame(ss.transform(df), columns=df.columns)
df_scaledstd



Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,-1.549193,-2.779199,0.312348,-0.534522,1.06066,-0.534522,-0.206195
1,-1.161895,0.09619,-0.390434,1.870829,1.06066,-0.534522,0.589128
2,-0.774597,0.09619,-0.390434,1.870829,-1.06066,-0.534522,-0.603856
3,-0.387298,0.219862,-1.093216,-0.534522,-1.06066,-0.534522,-1.001517
4,0.0,0.281699,-1.093216,-0.534522,1.06066,1.870829,1.38445
5,0.387298,0.405371,-1.093216,-0.534522,-1.06066,-0.534522,-0.73641
6,0.774597,0.43629,1.717911,-0.534522,0.0,-0.534522,-0.338748
7,1.161895,0.59088,1.015129,-0.534522,-1.06066,-0.534522,-1.001517
8,1.549193,0.652717,1.015129,-0.534522,1.06066,1.870829,1.914665


#### Robust Scaler

In [239]:
# Robust scaling: centre every column on its median and scale by its
# interquartile range, which makes the result less sensitive to outliers.
from sklearn.preprocessing import RobustScaler
robust = RobustScaler()
robust_scaled_df = pd.DataFrame(robust.fit_transform(df), columns=df.columns)
robust_scaled_df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,-1.0,-9.0,0.333333,0.0,0.5,0.0,0.1
1,-0.75,-0.545455,0.0,1.0,0.5,0.0,0.7
2,-0.5,-0.545455,0.0,1.0,-0.5,0.0,-0.2
3,-0.25,-0.181818,-0.333333,0.0,-0.5,0.0,-0.5
4,0.0,0.0,-0.333333,0.0,0.5,1.0,1.3
5,0.25,0.363636,-0.333333,0.0,-0.5,0.0,-0.3
6,0.5,0.454545,1.0,0.0,0.0,0.0,0.0
7,0.75,0.909091,0.666667,0.0,-0.5,0.0,-0.5
8,1.0,1.090909,0.666667,0.0,0.5,1.0,1.7


#### Transpose Function

In [240]:
# Transposing swaps the axes: rows become columns and columns become rows.
transpose_df = df.T
transpose_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
PID,100001000,100002000,100003000,100004000,100005000,100006000,100007000,100008000,100009000
ST_NUM,104,197,197,201,203,207,208,213,215
ST_NAME,2,1,1,0,0,0,4,3,3
OWN_OCCUPIED,1,2,2,1,1,1,1,1,1
BEDROOMS,3,3,1,1,3,1,2,1,3
BATH,1,1,1,1,2,1,1,1,2
SQ_FT,1000,1300,850,700,1600,800,950,700,1800


In [2]:
# Inspect the current contents of the frame.
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,__
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,y,1,1,
8,100009000.0,215.0,TREMONT,y,na,2,1800


In [3]:
# Shorten the bedroom/bath column names in place.
df.rename(columns=dict(NUM_BEDROOMS='BEDROOMS', NUM_BATH='BATH'), inplace=True)

In [5]:
# Inspect the frame after the column rename.
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,__
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,y,1,1,
8,100009000.0,215.0,TREMONT,y,na,2,1800


In [7]:
# Is there at least one missing cell anywhere in the frame?
df.isna().to_numpy().any()

True

In [8]:
# Count every missing cell: per-column counts first, then their total.
df.isna().sum().sum()

8

In [12]:
# Per-column flag: does the column contain any missing value?
df.isna().any(axis=0)

PID              True
ST_NUM           True
ST_NAME         False
OWN_OCCUPIED     True
BEDROOMS         True
BATH             True
SQ_FT            True
dtype: bool

In [15]:
# Columns that contain missing values, and how many each one has.
null_columns = df.columns[df.isna().any(axis=0)]
df[null_columns].isna().sum()

PID             1
ST_NUM          2
OWN_OCCUPIED    1
BEDROOMS        2
BATH            1
SQ_FT           1
dtype: int64

In [16]:
# Every row with at least one missing value, limited to the columns that
# have missing values (one .loc lookup instead of chained indexing).
print(df.loc[df.isna().any(axis=1), null_columns])

           PID  ST_NUM OWN_OCCUPIED BEDROOMS    BATH SQ_FT
2  100003000.0     NaN            N      NaN       1   850
3  100004000.0   201.0           12        1     NaN   700
4          NaN   203.0            Y        3       2  1600
5  100006000.0   207.0            Y      NaN       1   800
6  100007000.0     NaN          NaN        2  HURLEY   950
7  100008000.0   213.0            y        1       1   NaN


In [17]:
# Fill the missing PID with a numeric value. Filling with the string
# '100005000.0' would put a str into an otherwise float column, leaving
# mixed types that break later numeric casts.
# The result is assigned back because chained fillna(..., inplace=True) acts
# on an intermediate Series and does not update df under copy-on-write.
df['PID'] = df['PID'].fillna(100005000.0)
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,__
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,y,1,1,
8,100009000.0,215.0,TREMONT,y,na,2,1800


In [19]:
# Hand-fill the two missing street numbers using scalar label access.
df.at[2, 'ST_NUM'] = 197.0
df.at[6, 'ST_NUM'] = 208.0
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,__
2,100003000.0,197.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,208.0,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,y,1,1,
8,100009000.0,215.0,TREMONT,y,na,2,1800


In [23]:
# Unwanted value treatment: blank out OWN_OCCUPIED entries that parse as a
# number (e.g. 12), since they are not usable ownership flags.
# A boolean mask aligned on the column's index replaces the manual row
# counter, which only matched the .loc labels on a default 0..n-1 index.
is_numeric_entry = pd.to_numeric(df['OWN_OCCUPIED'], errors='coerce').notna()
df.loc[is_numeric_entry, 'OWN_OCCUPIED'] = np.nan
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,__
2,100003000.0,197.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,208.0,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,y,1,1,
8,100009000.0,215.0,TREMONT,y,na,2,1800


In [25]:
# Parse the three numeric-looking columns; unparseable entries become NaN.
for numeric_column in ['BEDROOMS', 'BATH', 'SQ_FT']:
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

In [26]:
# Inspect the frame after the numeric coercion.
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,,2.0,,950.0
7,100008000.0,213.0,TREMONT,y,1.0,1.0,
8,100009000.0,215.0,TREMONT,y,,2.0,1800.0


In [28]:
# Replace missing OWN_OCCUPIED values with the most frequent value.
# mode() returns a Series, hence the [0]. The result is assigned back because
# chained fillna(..., inplace=True) works on an intermediate Series and does
# not update df under copy-on-write.
df['OWN_OCCUPIED'] = df['OWN_OCCUPIED'].fillna(df['OWN_OCCUPIED'].mode()[0])
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,y,1.0,1.0,
8,100009000.0,215.0,TREMONT,y,,2.0,1800.0


In [29]:
# Median SQ_FT within each BEDROOMS group.
df.groupby('BEDROOMS')['SQ_FT'].agg('median')

BEDROOMS
1.0     700.0
2.0     950.0
3.0    1300.0
Name: SQ_FT, dtype: float64

In [33]:
# Fill nulls using a group-by parameter: keep SQ_FT where it is present,
# otherwise take the median SQ_FT of rows with the same BEDROOMS value.
df['SQ_FT'] = df['SQ_FT'].where(
    df['SQ_FT'].notna(),
    df.groupby('BEDROOMS')['SQ_FT'].transform('median'),
)
# Remaining gaps get the overall median, computed after the step above.
df['SQ_FT'] = df['SQ_FT'].fillna(df['SQ_FT'].median())
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,1300.0
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,y,1.0,1.0,700.0
8,100009000.0,215.0,TREMONT,y,,2.0,1800.0


In [34]:
# Hand-fill the missing BEDROOMS values using scalar label access.
df.at[2, 'BEDROOMS'] = 1
df.at[5, 'BEDROOMS'] = 1
df.at[8, 'BEDROOMS'] = 3
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,1300.0
2,100003000.0,197.0,LEXINGTON,N,1.0,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,1.0,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,y,1.0,1.0,700.0
8,100009000.0,215.0,TREMONT,y,3.0,2.0,1800.0


In [37]:
# Backward fill: each missing BATH value takes the next valid value below it.
# Series.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in recent pandas releases.
df['BATH'] = df['BATH'].bfill()
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,1300.0
2,100003000.0,197.0,LEXINGTON,N,1.0,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,2.0,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,1.0,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,1.0,950.0
7,100008000.0,213.0,TREMONT,y,1.0,1.0,700.0
8,100009000.0,215.0,TREMONT,y,3.0,2.0,1800.0
