In [1]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb


from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from warnings import filterwarnings
# NOTE(review): this silences every warning category for the rest of the
# notebook, including any pandas or scikit-learn warnings raised by later
# cells. Consider narrowing it to specific categories.
filterwarnings('ignore')

In [2]:
# Read the test CSV into a DataFrame.
test_df = pd.read_csv("../dataset/test.csv")

In [3]:
# Preview the first rows to sanity-check the load.
test_df.head()

Unnamed: 0,Property_ID,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review
0,0x6e93,Apartment,293,3.0,1,Unfurnished,0.0,No,Once in a day - Morning,7.28,Well above average,Medium,152.0,2.52
1,0x8787,Apartment,586,4.0,1,Semi_Furnished,0.0,No,Once in a day - Evening,7.63,Well below average,Medium,92.0,4.16
2,0x6c17,Container Home,305,1.0,2,Semi_Furnished,1.0,No,All time,5.39,Slightly above average,Medium,90.0,2.92
3,0x9dbd,Apartment,258,2.0,1,Semi_Furnished,1.0,No,All time,7.53,Slightly below average,Medium,158.0,3.45
4,0xbfde,Bungalow,3031,12.0,4,Fully Furnished,0.0,No,All time,8.79,Well above average,High,186.0,2.72


In [4]:
# (number of rows, number of columns) of the loaded frame.
test_df.shape

(10500, 14)

In [5]:
def nan_check(df):
    """Return the percentage of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is the share of null cells in
        that column, in percent, rounded to two decimals.
    """
    missing_per_column = df.isnull().sum()
    row_count = len(df)
    percent_missing = (100 * missing_per_column) / row_count
    return percent_missing.round(2)

In [6]:
# Percentage of missing values per column in the freshly loaded data.
nan_check(test_df)

Property_ID               0.00
Property_Type             0.00
Property_Area             0.00
Number_of_Windows         4.24
Number_of_Doors           0.00
Furnishing                2.45
Frequency_of_Powercuts    3.49
Power_Backup              0.00
Water_Supply              0.00
Traffic_Density_Score     0.00
Crime_Rate                2.02
Dust_and_Noise            3.14
Air_Quality_Index         0.00
Neighborhood_Review       0.00
dtype: float64

In [7]:
# Work on a copy so the loaded test_df itself is not modified by the
# preprocessing steps that follow.
df2 = test_df.copy()

## Preprocessing Test data

In [8]:
# List the distinct Power_Backup labels and how often each occurs.
df2.Power_Backup.value_counts()

No               7974
Yes              2305
NOT MENTIONED     221
Name: Power_Backup, dtype: int64

In [9]:
 df2.Power_Backup =df2.Power_Backup.apply(lambda x : None if x=="NOT MENTIONED" else x)

In [10]:
# Re-check missing percentages: Power_Backup should now report the rows that
# previously held the "NOT MENTIONED" placeholder.
nan_check(df2)

Property_ID               0.00
Property_Type             0.00
Property_Area             0.00
Number_of_Windows         4.24
Number_of_Doors           0.00
Furnishing                2.45
Frequency_of_Powercuts    3.49
Power_Backup              2.10
Water_Supply              0.00
Traffic_Density_Score     0.00
Crime_Rate                2.02
Dust_and_Noise            3.14
Air_Quality_Index         0.00
Neighborhood_Review       0.00
dtype: float64

In [11]:
# Ordinal encodings: every text label is replaced by an integer code.
# Series.map yields NaN for any value that has no entry in its table
# (existing NaN included) rather than raising.
ordinal_codes = {
    "Furnishing": {"Unfurnished": 0, "Semi_Furnished": 1, "Fully Furnished": 2},
    "Crime_Rate": {
        "Well below average": 0,
        "Slightly below average": 1,
        "Slightly above average": 2,
        "Well above average": 3,
    },
    "Dust_and_Noise": {"Low": 0, "Medium": 1, "High": 2},
    "Power_Backup": {"Yes": 1, "No": 0},
}
for column_name, label_to_code in ordinal_codes.items():
    df2[column_name] = df2[column_name].map(label_to_code)

In [12]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [13]:
# Imputer with scikit-learn's default settings; it is re-fitted for each
# column it is applied to below.
imputer = IterativeImputer()

In [14]:
# Fill the missing values one column at a time, each with its own fit.
# NOTE(review): IterativeImputer receives a single column per call, so it has
# no other features to model from — presumably this reduces to its initial
# (mean) fill and can produce fractional category codes; confirm against the
# scikit-learn documentation.
# NOTE(review): the imputer is fitted on the test data itself rather than
# reusing statistics from training — confirm this matches the training setup.
columns_to_impute = [
    "Furnishing",
    "Crime_Rate",
    "Dust_and_Noise",
    "Number_of_Windows",
    "Frequency_of_Powercuts",
    "Power_Backup",
]
for column_name in columns_to_impute:
    df2[column_name] = imputer.fit_transform(df2[[column_name]])

In [15]:
# Confirm that imputation left no missing values behind.
nan_check(df2)

Property_ID               0.0
Property_Type             0.0
Property_Area             0.0
Number_of_Windows         0.0
Number_of_Doors           0.0
Furnishing                0.0
Frequency_of_Powercuts    0.0
Power_Backup              0.0
Water_Supply              0.0
Traffic_Density_Score     0.0
Crime_Rate                0.0
Dust_and_Noise            0.0
Air_Quality_Index         0.0
Neighborhood_Review       0.0
dtype: float64

In [16]:
# Load the table whose columns define the feature set used below.
# NOTE(review): presumably the frame the model was trained on — verify.
model_df = pd.read_csv("../dataframes/model_data.csv")

In [17]:
# Remove the Habitability_score column so only the remaining columns are used
# as the feature list. Rebinding (instead of inplace=True) together with
# errors="ignore" makes this cell safe to re-run: the in-place version raised
# KeyError on a second execution because the column was already gone.
model_df = model_df.drop(columns="Habitability_score", errors="ignore")

In [18]:
# Feature names, in the column order of model_df. The order is carried over
# to the feature matrix, which is later turned into an unlabeled NumPy array.
selected_features = model_df.columns

In [19]:
# Take an explicit copy of the selected feature columns: later cells assign
# new dtypes into df3, and writing into a selection of df2 would otherwise
# trigger pandas' SettingWithCopyWarning (hidden by the global warning filter).
df3 = df2[selected_features].copy()
# Keep the identifiers aside so they can be attached to the predictions.
property_id = df2["Property_ID"]

In [20]:
# Preview the selected feature columns before the dtype casts.
df3.head()

Unnamed: 0,Neighborhood_Review,Furnishing,Crime_Rate,Power_Backup,Frequency_of_Powercuts,Property_Area
0,2.52,0.0,3.0,0.0,0.0,293
1,4.16,1.0,0.0,0.0,0.0,586
2,2.92,1.0,2.0,0.0,1.0,305
3,3.45,1.0,1.0,0.0,1.0,258
4,2.72,2.0,3.0,0.0,0.0,3031


In [21]:
# Cast the encoded features from float to integer in a single call.
# NOTE(review): astype("int") truncates toward zero, so a fractional imputed
# value such as 1.6 becomes 1 — confirm that truncation (not rounding) is
# what the training data used.
integer_features = [
    "Frequency_of_Powercuts",
    "Power_Backup",
    "Crime_Rate",
    "Furnishing",
]
df3 = df3.astype({feature: "int" for feature in integer_features})

In [22]:
# Preview again to confirm the four cast columns now hold integers.
df3.head()

Unnamed: 0,Neighborhood_Review,Furnishing,Crime_Rate,Power_Backup,Frequency_of_Powercuts,Property_Area
0,2.52,0,3,0,0,293
1,4.16,1,0,0,0,586
2,2.92,1,2,0,1,305
3,3.45,1,1,0,1,258
4,2.72,2,3,0,0,3031


In [23]:
# Convert the feature frame to a plain NumPy array for prediction. Column
# labels are dropped here, so only the column order identifies each feature.
X = np.array(df3)

### The data is now ready for prediction

In [24]:
import pickle

In [25]:
# Load the pickled model from disk (presumably an XGBoost regressor, going by
# the file name).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load model files that come from a trusted source.
with open("../models/xgboost_model.pickle", mode="rb") as model_file:
    xg_model = pickle.load(model_file)

In [26]:
# Run the loaded model on the test feature matrix.
# NOTE(review): "predictons" is a misspelling of "predictions"; the next cell
# reads the variable under this spelling, so rename both together.

predictons = xg_model.predict(X)

In [27]:
# Wrap the predictions and the property IDs in single-column frames so they
# can be placed side by side.
pred = pd.DataFrame(data=predictons, columns=["Habitability_score"])
prop_id = pd.DataFrame(data=property_id, columns=["Property_ID"])

In [29]:
# Pair each ID with its prediction by row position. concat(axis=1) aligns on
# index labels, so both frames get a fresh 0..n-1 index first; without this,
# any non-default index on prop_id would yield NaN-padded, misaligned rows.
final_df = pd.concat(
    [prop_id.reset_index(drop=True), pred.reset_index(drop=True)],
    axis=1,
)

In [30]:
# Preview the ID / predicted-score pairs before writing them out.
final_df.head()

Unnamed: 0,Property_ID,Habitability_score
0,0x6e93,28.661592
1,0x8787,81.252563
2,0x6c17,67.057922
3,0x9dbd,71.661812
4,0xbfde,81.864555


## Saving

In [31]:
# Write the predictions to CSV without the DataFrame index column.
final_df.to_csv("../dataframes/predictions.csv", index=False)