# Hyderabad House Price Predictions

In [1]:
import pandas as pd

In [2]:
housing = pd.read_csv("data.csv")

In [3]:
 housing.head()

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
0,6968000,1340,Nizampet,,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
1,29000000,3498,Hitech City,4.0,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
2,6590000,1318,Manikonda,2.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5739000,1295,Alwal,3.0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,5679000,1145,Kukatpally,2.0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [4]:
# Drop the free-text 'Location' column before the numeric analysis below.
# errors='ignore' keeps this cell re-runnable: `housing` is reassigned here, so
# a second run without it would raise KeyError (the column is already gone).
housing = housing.drop(columns='Location', errors='ignore')
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518 entries, 0 to 2517
Data columns (total 39 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Price                2518 non-null   int64  
 1   Area                 2518 non-null   int64  
 2   No. of Bedrooms      2513 non-null   float64
 3   Resale               2518 non-null   int64  
 4   MaintenanceStaff     2518 non-null   int64  
 5   Gymnasium            2518 non-null   int64  
 6   SwimmingPool         2518 non-null   int64  
 7   LandscapedGardens    2518 non-null   int64  
 8   JoggingTrack         2518 non-null   int64  
 9   RainWaterHarvesting  2518 non-null   int64  
 10  IndoorGames          2518 non-null   int64  
 11  ShoppingMall         2518 non-null   int64  
 12  Intercom             2518 non-null   int64  
 13  SportsFacility       2518 non-null   int64  
 14  ATM                  2518 non-null   int64  
 15  ClubHouse            2518 non-null   i

#### housing.describe()

In [5]:
# housing['Sofa'].value_counts()


In [6]:
# %matplotlib inline

In [7]:
# import matplotlib.pyplot as plt
# housing.hist(bins=50, figsize=(20,15))

In [8]:
# plt.show()

# Train-Test Splitting

In [9]:
import numpy as np


def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a train part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Rows are selected positionally with ``.iloc``.
    test_ratio : float
        Fraction of rows (between 0 and 1) that goes into the test part; the
        test size is ``int(len(data) * test_ratio)``.
    seed : int, optional
        Seed for the private random generator. The default (42) produces the
        same permutation as seeding the global generator with 42.

    Returns
    -------
    tuple
        ``(train, test)`` as ``data.iloc[...]`` selections.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")
    # A local RandomState yields the same shuffle as np.random.seed(seed) would,
    # but leaves NumPy's global random state untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
    

In [10]:
#  train_set, test_set = split_train_test(housing, 0.2)

In [11]:
# print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

In [12]:
from sklearn.model_selection import train_test_split
# Plain random 80/20 split with a fixed seed. The later cells build on the
# stratified split (strat_train_set / strat_test_set) rather than on these two.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")


Rows in train set: 2014
Rows in test set: 504



In [13]:
from sklearn.model_selection import StratifiedShuffleSplit
# Stratify on 'Sofa' so the train and test sets keep the same mix of its values.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['Sofa']):
    # split() yields positional row numbers, so select with .iloc; .loc is only
    # correct while the index happens to be the default 0..n-1 range.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [14]:
strat_train_set['Sofa'].value_counts()

0    1862
1      85
9      67
Name: Sofa, dtype: int64

In [15]:
# strat_test_set['Sofa'].value_counts()

In [16]:
# Share of each 'Sofa' value in the stratified training set. These proportions
# should match the ones in strat_test_set if the stratification worked.
# (A chained division such as 1862/85/67 evaluates as (1862/85)/67 and does not
# express the class mix.)
strat_train_set['Sofa'].value_counts(normalize=True)

In [17]:
# Share of each 'Sofa' value in the stratified test set. These proportions
# should match the ones in strat_train_set if the stratification worked.
# (A chained division such as 466/21/17 evaluates as (466/21)/17 and does not
# express the class mix.)
strat_test_set['Sofa'].value_counts(normalize=True)

In [18]:
housing=strat_train_set.copy()

# Looking for correlations

In [19]:
corr_matrix=housing.corr()
corr_matrix['Price'].sort_values(ascending=False)

Price                  1.000000
Area                   0.876548
No. of Bedrooms        0.652015
Resale                 0.228216
SwimmingPool           0.080278
Gymnasium              0.072973
JoggingTrack           0.066517
ClubHouse              0.064404
Children'splayarea     0.064325
24X7Security           0.062949
LandscapedGardens      0.062268
IndoorGames            0.060208
CarParking             0.058665
MultipurposeRoom       0.051368
RainWaterHarvesting    0.051135
VaastuCompliant        0.048665
Intercom               0.045336
PowerBackup            0.040013
LiftAvailable          0.031290
SportsFacility         0.018273
StaffQuarter           0.016691
MaintenanceStaff       0.008594
Cafeteria              0.004425
ATM                    0.002491
AC                     0.000116
TV                    -0.008281
DiningTable           -0.009037
Refrigerator          -0.009492
BED                   -0.010304
Sofa                  -0.010726
Gasconnection         -0.011228
WashingM

In [20]:
# from pandas.plotting import scatter_matrix
# attributes =["Area", "No. of Bedrooms", "Price", "GolfCourse"]
# scatter_matrix(housing[attributes], figsize=(12,8))
    

In [21]:
# housing.plot(kind="scatter", x="No. of Bedrooms", y="Price", alpha=1)

In [22]:
housing=strat_train_set.drop("Price", axis=1)
housing_labels=strat_train_set["Price"].copy()

# Missing Attributes



In [23]:
# To take care of missing attributes, you have three options:
#     1. Get rid of the rows with missing values
#     2. Get rid of the whole attribute (column)
#     3. Set the missing values to some value (0, the mean or the median)

In [24]:
# Option 1: drop the rows whose "No. of Bedrooms" is missing.
# dropna() returns a new frame; `housing` itself is not modified.
a=housing.dropna(subset=["No. of Bedrooms"])
a.shape

(2009, 38)

In [25]:
median=housing["No. of Bedrooms"].median()
median

3.0

In [26]:
# Option 3: fill the gaps with the median. The result is only displayed, not
# assigned back, so `housing` still contains the missing values afterwards.
housing["No. of Bedrooms"].fillna(median)

2172    3.0
1937    2.0
1710    3.0
1229    3.0
2427    3.0
       ... 
860     1.0
295     3.0
2089    2.0
2508    2.0
1686    1.0
Name: No. of Bedrooms, Length: 2014, dtype: float64

In [27]:
housing.shape

(2014, 38)

In [28]:
housing.dropna()

Unnamed: 0,Area,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,RainWaterHarvesting,IndoorGames,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
2172,1800,3.0,1,0,1,1,1,0,0,0,...,1,0,0,0,0,0,1,0,1,0
1937,1200,2.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1710,1740,3.0,0,0,1,1,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
1229,1808,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2427,2260,3.0,0,0,1,1,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
860,1020,1.0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
295,2000,3.0,0,1,1,1,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
2089,1200,2.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2508,1550,2.0,0,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9


In [29]:
housing.shape


(2014, 38)

In [30]:
housing.describe()

Unnamed: 0,Area,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,RainWaterHarvesting,IndoorGames,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
count,2014.0,2009.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,...,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0
mean,1643.760675,2.619711,0.223436,0.413605,0.737339,0.701589,0.61569,0.558093,0.609235,0.580933,...,0.966733,0.357498,0.60576,0.341112,0.330189,0.321251,0.332175,0.341609,0.339126,0.32423
std,756.38198,0.686908,0.416652,1.624524,1.609407,1.614277,1.6227,1.625783,1.623148,1.624806,...,1.558397,1.620565,1.623378,1.61904,1.61793,1.616966,1.618137,1.619088,1.618843,1.617293
min,500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1160.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1490.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1821.5,3.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9400.0,8.0,1.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0


In [31]:
from sklearn.impute import SimpleImputer
# Learn the per-column median of the training features; transform() then uses
# these values to fill NaN entries.
# NOTE(review): only NaN is treated as missing here. The amenity columns also
# contain the value 9 (see the max row of housing.describe()) — presumably a
# "not recorded" marker; confirm and consider handling it as missing too.
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

In [32]:
imputer.statistics_

array([1.49e+03, 3.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00])

In [33]:
X=imputer.transform(housing)

In [34]:
# Rebuild a DataFrame from the imputed array. Pass the original index as well:
# without it the rows get a fresh 0..n-1 index and no longer line up with
# `housing` / `housing_labels`, which keep the shuffled row labels.
housing_tr = pd.DataFrame(X, columns=housing.columns, index=housing.index)

In [35]:
housing_tr.describe()

Unnamed: 0,Area,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,RainWaterHarvesting,IndoorGames,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
count,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,...,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0
mean,1643.760675,2.620655,0.223436,0.413605,0.737339,0.701589,0.61569,0.558093,0.609235,0.580933,...,0.966733,0.357498,0.60576,0.341112,0.330189,0.321251,0.332175,0.341609,0.339126,0.32423
std,756.38198,0.686315,0.416652,1.624524,1.609407,1.614277,1.6227,1.625783,1.623148,1.624806,...,1.558397,1.620565,1.623378,1.61904,1.61793,1.616966,1.618137,1.619088,1.618843,1.617293
min,500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1160.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1490.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1821.5,3.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9400.0,8.0,1.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0


## Scikit-learn design

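Scikit-learn objects primarily come in three types:
1. Estimators – estimate parameters from a dataset via `fit()` (e.g. `SimpleImputer`)
2. Transformers – convert data via `transform()`; `fit_transform()` fits and transforms in one step
3. Predictors – make predictions via `predict()` (e.g. `DecisionTreeRegressor`); `score()` evaluates them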

## Feature Scaling

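There are primarily two types of feature scaling:
1. Min-max scaling (normalization): `(value - min) / (max - min)`, so values end up between 0 and 1; scikit-learn provides `MinMaxScaler`
2. Standardization: `(value - mean) / std`, giving zero mean and unit variance; scikit-learn provides `StandardScaler`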

## Creating Pipeline

In [36]:
# from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Preprocessing chain applied to every feature column:
#   1. 'imputer'    - fill missing values with the column median
#   2. 'std_scaler' - standardize each column with StandardScaler
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])


In [37]:
housing_num_tr=my_pipeline.fit_transform(housing)

In [38]:
housing_num_tr.shape

(2014, 38)

## Selecting a model for predictions

In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# Candidate models; uncomment exactly one assignment to switch.
#model=LinearRegression()
# Fix random_state so the fitted tree (and every score derived from it) is the
# same on each run, consistent with the seeded train/test splits.
model=DecisionTreeRegressor(random_state=42)
#model=RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)


In [40]:
some_data=housing.iloc[:5]

In [41]:
some_labels=housing_labels.iloc[:5]

In [42]:
prepared_data=my_pipeline.transform(some_data)

In [43]:
model.predict(prepared_data)

array([10500000.        ,  6317142.85714286, 12000000.        ,
        7593000.        , 17000000.        ])

In [44]:
list(some_labels)

[10500000, 4680000, 12000000, 7593000, 17000000]

# Evaluating the model

In [45]:
from sklearn.metrics import mean_squared_error
# RMSE on the same rows the model was trained on, so this is a training error,
# not an estimate of how well the model generalizes.
# NOTE: the lin_ prefix is a misnomer here — `model` is the DecisionTreeRegressor
# fitted earlier, not a linear model.
housing_predictions=model.predict(housing_num_tr)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse=np.sqrt(lin_mse)

In [46]:
lin_rmse

624504.4362819868

# Using a better evaluation technique: cross-validation

In [47]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation. The scorer returns negated MSE values, hence the
# sign flip before taking the square root.
# NOTE(review): housing_num_tr comes from my_pipeline.fit_transform on the whole
# training set, so the imputer/scaler statistics include each fold's validation
# rows; cross-validating a Pipeline that ends in the model would avoid that.
scores=cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores=np.sqrt(-scores)

In [48]:
list(rmse_scores)

[5749616.423535466,
 4286307.2853050595,
 4829892.933333275,
 4335232.548468733,
 3551564.0615383955,
 4990710.612358387,
 4363044.525054381,
 3664142.6920468076,
 3923382.2748644133,
 5386091.717611292]

In [49]:
def print_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    report = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard Deviation:", scores.std()),
    )
    for label, value in report:
        print(label, value)

In [50]:
print_scores(rmse_scores)

Scores: [5749616.42353547 4286307.28530506 4829892.93333327 4335232.54846873
 3551564.0615384  4990710.61235839 4363044.52505438 3664142.69204681
 3923382.27486441 5386091.71761129]
Mean: 4507998.507411621
Standard Deviation: 686872.0470659263


# Testing the model on test data

In [51]:
# Final check on the held-out stratified test set.
X_test=strat_test_set.drop("Price", axis=1)
Y_test=strat_test_set["Price"].copy()
# transform(), not fit_transform(): reuse the imputer/scaler statistics that
# my_pipeline learned from the training features.
X_test_prepared=my_pipeline.transform(X_test)
final_predictions=model.predict(X_test_prepared)
final_mse=mean_squared_error(Y_test, final_predictions)
final_rmse=np.sqrt(final_mse)
# Prints every prediction followed by every true price (long output).
print(final_predictions, list(Y_test))

[ 4400000.         35000000.          4364416.66666667  3900000.
  8749000.          4093684.21052632  7800000.         35200000.
 17000000.          4204200.          6000000.          6266666.66666667
 15300000.          8325000.         10100000.          7200000.
  4400000.          5500000.          6675000.         10800000.
  7400000.          4299000.          3500000.          8140000.
  8775000.         10700000.         16000000.          4400000.
  9499000.          5777000.         10100000.          8000000.
  2104000.         11000000.          5720000.         11000000.
  3830000.          5363500.          8903000.         10800000.
  4216000.         12100000.          5500000.          3000000.
  6900000.          6323000.          5800000.         11299999.
 45000000.          8140000.         20500000.         11500000.
  5600000.         35200000.          8426000.          8600000.
  3900000.          4400000.          8186666.66666667 31000000.
  8070000.       

In [52]:
final_rmse

6768713.548176292