In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv("property data.csv")

print(df.head(10))

           PID  ST_NUM     ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0      PUTNAM            Y            3        1  1000
1  100002000.0   197.0   LEXINGTON            N            3      1.5    --
2  100003000.0     NaN   LEXINGTON            N          NaN        1   850
3  100004000.0   201.0    BERKELEY           12            1      NaN   700
4          NaN   203.0    BERKELEY            Y            3        2  1600
5  100006000.0   207.0    BERKELEY            Y          NaN        1   800
6  100007000.0     NaN  WASHINGTON          NaN            2   HURLEY   950
7  100008000.0   213.0     TREMONT            Y            1        1   NaN
8  100009000.0   215.0     TREMONT            Y           na        2  1800


In [2]:
print(df["ST_NUM"])
print(df["ST_NUM"].isnull())


0    104.0
1    197.0
2      NaN
3    201.0
4    203.0
5    207.0
6      NaN
7    213.0
8    215.0
Name: ST_NUM, dtype: float64
0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
8    False
Name: ST_NUM, dtype: bool


In [3]:
print(df["NUM_BEDROOMS"])
print(df["NUM_BEDROOMS"].isnull())

0      3
1      3
2    NaN
3      1
4      3
5    NaN
6      2
7      1
8     na
Name: NUM_BEDROOMS, dtype: object
0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8    False
Name: NUM_BEDROOMS, dtype: bool


In [4]:
print(df["OWN_OCCUPIED"])
print(df["OWN_OCCUPIED"].isnull())

0      Y
1      N
2      N
3     12
4      Y
5      Y
6    NaN
7      Y
8      Y
Name: OWN_OCCUPIED, dtype: object
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
8    False
Name: OWN_OCCUPIED, dtype: bool


In [5]:
#Detecting wrong entry in OWN_OCCUPIED 
cnt=0
for row in df["OWN_OCCUPIED"]:
    try:
        int(row)
        df.loc[cnt,"OWN_OCCUPIED"]=np.nan
    except ValueError:
        pass
    cnt+=1

In [6]:
print(df["OWN_OCCUPIED"])

0      Y
1      N
2      N
3    NaN
4      Y
5      Y
6    NaN
7      Y
8      Y
Name: OWN_OCCUPIED, dtype: object


In [7]:
#Detecting wrong entry in NUM_BATH (non numeric)
cnt=0
for row in df["NUM_BATH"]:
    try:
        float(row)
        pass
    except ValueError:
        df.loc[cnt,"NUM_BATH"]=np.nan
    cnt+=1

In [8]:
print(df["NUM_BATH"])

0      1
1    1.5
2      1
3    NaN
4      2
5      1
6    NaN
7      1
8      2
Name: NUM_BATH, dtype: object


In [9]:
#To summarize all missing value
print(df.isnull().sum())

PID             1
ST_NUM          2
ST_NAME         0
OWN_OCCUPIED    2
NUM_BEDROOMS    2
NUM_BATH        2
SQ_FT           1
dtype: int64


In [10]:
#to detect if in your data frame has missing value
print(df.isnull().values.any())

True


In [11]:
#count how many missing value you have
print(df.isnull().sum().sum())

10


In [27]:
#REPLACING the missing VALUE
#using fillna
print(df["ST_NUM"])

0    104.0
1    197.0
2      NaN
3    201.0
4    203.0
5    207.0
6      NaN
7    213.0
8    215.0
Name: ST_NUM, dtype: float64


In [12]:
df["ST_NUM"].fillna(123,inplace=True)
print(df["ST_NUM"])

0    104.0
1    197.0
2    123.0
3    201.0
4    203.0
5    207.0
6    123.0
7    213.0
8    215.0
Name: ST_NUM, dtype: float64


In [13]:
df.loc[2,"ST_NUM"] = 124
print(df["ST_NUM"])

0    104.0
1    197.0
2    124.0
3    201.0
4    203.0
5    207.0
6    123.0
7    213.0
8    215.0
Name: ST_NUM, dtype: float64


In [31]:
print(df["NUM_BEDROOMS"])

0      3
1      3
2    NaN
3      1
4      3
5    NaN
6      2
7      1
8     na
Name: NUM_BEDROOMS, dtype: object


In [14]:
#Replace "na" with Nan
cnt=0
for row in df["NUM_BEDROOMS"]:
    if row == "na":
        df.loc[cnt,"NUM_BEDROOMS"]=np.nan
    else:
        pass
    cnt+=1

In [15]:
print(df["NUM_BEDROOMS"])

0      3
1      3
2    NaN
3      1
4      3
5    NaN
6      2
7      1
8    NaN
Name: NUM_BEDROOMS, dtype: object


In [16]:
#Replace the missing value with the median value of that column
median_NUM_BEDROOMS = df["NUM_BEDROOMS"].median().round()
df["NUM_BEDROOMS"].fillna(median_NUM_BEDROOMS,inplace = True)

In [17]:
print(df["NUM_BEDROOMS"])

0    3
1    3
2    2
3    1
4    3
5    2
6    2
7    1
8    2
Name: NUM_BEDROOMS, dtype: object


In [18]:
print(df.head(10))

           PID  ST_NUM     ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0      PUTNAM            Y            3        1  1000
1  100002000.0   197.0   LEXINGTON            N            3      1.5    --
2  100003000.0   124.0   LEXINGTON            N            2        1   850
3  100004000.0   201.0    BERKELEY          NaN            1      NaN   700
4          NaN   203.0    BERKELEY            Y            3        2  1600
5  100006000.0   207.0    BERKELEY            Y            2        1   800
6  100007000.0   123.0  WASHINGTON          NaN            2      NaN   950
7  100008000.0   213.0     TREMONT            Y            1        1   NaN
8  100009000.0   215.0     TREMONT            Y            2        2  1800


In [19]:
#treat this col SQ_FT
#Replace "--" with Nan
cnt=0
for row in df["SQ_FT"]:
    if row == "--":
        df.loc[cnt,"SQ_FT"]=np.nan
    else:
        pass
    cnt+=1
print(df.head(10))

           PID  ST_NUM     ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0      PUTNAM            Y            3        1  1000
1  100002000.0   197.0   LEXINGTON            N            3      1.5   NaN
2  100003000.0   124.0   LEXINGTON            N            2        1   850
3  100004000.0   201.0    BERKELEY          NaN            1      NaN   700
4          NaN   203.0    BERKELEY            Y            3        2  1600
5  100006000.0   207.0    BERKELEY            Y            2        1   800
6  100007000.0   123.0  WASHINGTON          NaN            2      NaN   950
7  100008000.0   213.0     TREMONT            Y            1        1   NaN
8  100009000.0   215.0     TREMONT            Y            2        2  1800


In [36]:
#changin the SQ_FT col from object to numeric
df["SQ_FT"] = pd.to_numeric(df["SQ_FT"], downcast="float") 

#calculating the mean or mode of the SQ_FT col
mean_SQ_FT = df["SQ_FT"].mean()
mode_SQ_FT = df["SQ_FT"].mode()

#to replace NaN value with mean or mode
df["SQ_FT"].fillna(mode_SQ_FT,inplace = True)

print(df["SQ_FT"])

0    1000.0
1     950.0
2     850.0
3     700.0
4    1600.0
5     800.0
6     950.0
7     950.0
8    1800.0
Name: SQ_FT, dtype: float32


In [40]:
#To replace Nan in PID col with 100005000
df['PID'].fillna(100005000,inplace = True)
print(df.head(10))

           PID  ST_NUM     ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH   SQ_FT
0  100001000.0   104.0      PUTNAM            Y            3        1  1000.0
1  100002000.0   197.0   LEXINGTON            N            3      1.5   950.0
2  100003000.0   124.0   LEXINGTON            N            2        1   850.0
3  100004000.0   201.0    BERKELEY          NaN            1      NaN   700.0
4  100005000.0   203.0    BERKELEY            Y            3        2  1600.0
5  100006000.0   207.0    BERKELEY            Y            2        1   800.0
6  100007000.0   123.0  WASHINGTON          NaN            2      NaN   950.0
7  100008000.0   213.0     TREMONT            Y            1        1   950.0
8  100009000.0   215.0     TREMONT            Y            2        2  1800.0
