In [1]:
import pandas as pd

In [2]:
df_ref = pd.read_csv("data/bengaluru_house_data.csv")
df_ref.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df_ref.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [4]:
# Checking for df_ref shape (Row, Columns) in reference DataFrame
df_ref.shape

(13320, 9)

In [5]:
df_a = df_ref.drop(labels = ['area_type', 'availability', 'society', 'balcony', 'price'], axis = 'columns')
df_a.head()

Unnamed: 0,location,size,total_sqft,bath
0,Electronic City Phase II,2 BHK,1056,2.0
1,Chikka Tirupathi,4 Bedroom,2600,5.0
2,Uttarahalli,3 BHK,1440,2.0
3,Lingadheeranahalli,3 BHK,1521,3.0
4,Kothanur,2 BHK,1200,2.0


In [6]:
# Checking for featurwise null values 
df_a.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
dtype: int64

In [7]:
# Drop rows contain NA value
df_b = df_a.dropna(axis = 'index')
df_b.head()

Unnamed: 0,location,size,total_sqft,bath
0,Electronic City Phase II,2 BHK,1056,2.0
1,Chikka Tirupathi,4 Bedroom,2600,5.0
2,Uttarahalli,3 BHK,1440,2.0
3,Lingadheeranahalli,3 BHK,1521,3.0
4,Kothanur,2 BHK,1200,2.0


In [8]:
df_b.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
dtype: int64

In [9]:
# Checking for df_ref shape (Row, Columns) in df_b DataFrame
df_b.shape

(13246, 4)

In [10]:
# Observing unique values in <size> feature of DataFrame <df_b>
df_b['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [11]:
df_b['bhk'] = df_b['size'].apply(lambda val: val.split(' ')[0])
df_b.drop(labels = ['size'], axis = 1)
df_b.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,location,size,total_sqft,bath,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,4
2,Uttarahalli,3 BHK,1440,2.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,3
4,Kothanur,2 BHK,1200,2.0,2


In [13]:
df_b['bhk'].unique()

array(['2', '4', '3', '6', '1', '8', '7', '5', '11', '9', '27', '10',
       '19', '16', '43', '14', '12', '13', '18'], dtype=object)

In [14]:
# We may observe Scalar values as well as Range Values in <total_sqft> feature
df_b['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [16]:
# Let's build a function to search for Range Values
def serch_range_values(x):
    try:
        float(x)
    except:
        return False
    
    return True        

In [60]:
df_c = df_b[~df_b['total_sqft'].apply(serch_range_values)]
df_c.head()

Unnamed: 0,location,size,total_sqft,bath,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,2
188,KR Puram,2 BHK,1015 - 1540,2.0,2


In [63]:
import numpy as np

In [78]:
def range_values_to_scalar(x):
    
    tokens = x.split('-')

    if len(tokens) == 2:
        
        tokens_float = [float(token) for token in tokens]
        tokens_mean = np.mean(tokens_float)
        
        return tokens_mean
    else:
        try:
            return float(x)
        except:
            return False

In [86]:
df_c['total_sqft_float'] = df_c['total_sqft'].apply(range_values_to_scalar)
df_c.head(n = 3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,location,size,total_sqft,bath,bhk,total_sqft_float
30,Yelahanka,4 BHK,2100 - 2850,4.0,4,2475.0
122,Hebbal,4 BHK,3067 - 8156,4.0,4,5611.5
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,2,1073.5


In [87]:
df_c[df_c['total_sqft_float'] == False]

Unnamed: 0,location,size,total_sqft,bath,bhk,total_sqft_float
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,1,False
648,Arekere,9 Bedroom,4125Perch,9.0,9,False
775,Basavanagara,1 BHK,1000Sq. Meter,2.0,1,False
872,Singapura Village,2 BHK,1100Sq. Yards,2.0,2,False
1019,Marathi Layout,1 Bedroom,5.31Acres,1.0,1,False
1086,Narasapura,2 Bedroom,30Acres,2.0,2,False
1400,Chamrajpet,9 BHK,716Sq. Meter,9.0,9,False
1712,Singena Agrahara,3 Bedroom,1500Sq. Meter,3.0,3,False
1743,Hosa Road,3 BHK,142.61Sq. Meter,3.0,3,False
1821,Sarjapur,3 Bedroom,1574Sq. Yards,3.0,3,False


In [88]:
# Examine feature <total_sqft_float> for <False> value
df_d = df_c[df_c['total_sqft_float'] != False]
df_d.head(n = 3)

Unnamed: 0,location,size,total_sqft,bath,bhk,total_sqft_float
30,Yelahanka,4 BHK,2100 - 2850,4.0,4,2475.0
122,Hebbal,4 BHK,3067 - 8156,4.0,4,5611.5
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,2,1073.5


In [89]:
# Examine feature <total_sqft_float> for <False> value, and verify no <False> value exist in <df_d>
df_d[df_d['total_sqft_float'] == False]

Unnamed: 0,location,size,total_sqft,bath,bhk,total_sqft_float
