In [25]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [26]:
# Read the preprocessed training and test tables from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("traindata_preprocessed.csv", "testdata_preprocessed.csv")
)
# Preview the first rows of the training table.
df_train.head()

Unnamed: 0,Id,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,Habitability_score,Property_Type__#R%$G&867,Property_Type__Apartment,Property_Type__Bungalow,Property_Type__Container Home,Property_Type__Duplex,Property_Type__Single-family home
0,0x21e3,106,3.926254,1,1.0,0.0,0.0,1.0,5.89,1.0,1.0,90.0,3.86,71.98,0,1,0,0,0,0
1,0x68d4,733,2.0,2,0.0,1.0,0.0,1.0,4.37,0.0,1.0,96.0,3.55,71.2,0,1,0,0,0,0
2,0x7d81,737,4.0,2,2.0,0.0,0.0,1.0,7.45,1.0,1.0,121.0,3.81,71.39,0,1,0,0,0,0
3,0x7a57,900,3.0,2,0.0,2.0,1.0,1.0,6.16,3.0,1.0,100.0,1.34,31.46,0,1,0,0,0,0
4,0x9409,2238,14.0,6,2.0,0.0,0.0,2.0,5.46,0.0,1.0,116.0,4.77,93.7,0,0,1,0,0,0


In [27]:
# Separate the "Id" column from the model inputs.
# The test Ids are kept in `df_id` so they can be re-attached to the predictions later.
# The guards make this cell safe to re-run: once "Id" has been dropped, a second
# execution leaves the frames (and the previously captured `df_id`) untouched
# instead of raising KeyError.
if "Id" in df_test.columns:
    df_id = df_test["Id"]
    df_test = df_test.drop(columns=["Id"])
if "Id" in df_train.columns:
    df_train = df_train.drop(columns=["Id"])
df_id.head()

0    0x6808
1    0x6a98
2    0xacc0
3    0x8225
4    0xaee8
Name: Id, dtype: object

In [28]:
# Split the labelled data into training and validation parts.
# 20% of the rows are held out; random_state=42 keeps the split repeatable.
features = df_train.drop(columns=["Habitability_score"])
target = df_train["Habitability_score"]
X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Baseline: ordinary least-squares linear regression, scored on the validation split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred_valid = regressor.predict(X_valid)

# Report the three regression metrics for the validation predictions.
for label, metric in (
    ("R2 score: ", r2_score),
    ("Mean squared error: ", mean_squared_error),
    ("Mean absolute error: ", mean_absolute_error),
):
    print(label, metric(y_valid, y_pred_valid))

R2 score:  0.5656558346602765
Mean squared error:  82.59157374184574
Mean absolute error:  7.306189793267787


In [30]:
# Ridge regression (L2 penalty, alpha=10), scored on the validation split.
rdg = Ridge(alpha=10)
rdg.fit(X_train, y_train)
y_pred_valid_ridge = rdg.predict(X_valid)

# Report the three regression metrics for the validation predictions.
for label, metric in (
    ("R2 score: ", r2_score),
    ("Mean squared error: ", mean_squared_error),
    ("Mean absolute error: ", mean_absolute_error),
):
    print(label, metric(y_valid, y_pred_valid_ridge))

R2 score:  0.5657110760525326
Mean squared error:  82.58106946001992
Mean absolute error:  7.302646917166539


In [31]:
# Score the fitted ridge model on the data it was trained on.
y_pred_train_ridge = rdg.predict(X_train)

for label, metric in (
    ("R2 score: ", r2_score),
    ("Mean squared error: ", mean_squared_error),
    ("Mean absolute error: ", mean_absolute_error),
):
    print(label, metric(y_train, y_pred_train_ridge))
# The R2 score is low even though we tested on the training data itself,
# so we need a more complex model.

R2 score:  0.5804275814601968
Mean squared error:  85.00586316742232
Mean absolute error:  7.451833481344195


In [32]:
# Gradient-boosted trees with XGBoost's default hyper-parameters.
XGB_model = XGBRegressor()
# Fit on the training split, then predict the validation split.
XGB_model.fit(X_train, y_train)
yhat = XGB_model.predict(X_valid)

# Report the three regression metrics for the validation predictions.
for label, metric in (
    ("R2 score: ", r2_score),
    ("Mean squared error: ", mean_squared_error),
    ("Mean absolute error: ", mean_absolute_error),
):
    print(label, metric(y_valid, yhat))

R2 score:  0.7934902622049638
Mean squared error:  39.26831669113729
Mean absolute error:  4.796103580993942


In [None]:
# Random forest regression with 10 trees and a fixed seed.
# The out-of-bag score is not requested: this cell never reads `oob_score_`,
# and with so few trees scikit-learn warns that some samples have no OOB estimate.
# Dropping the flag does not change the fitted trees or the predictions below.
ran_for_regressor = RandomForestRegressor(n_estimators=10, random_state=0)

# Fit on the training split, then predict the validation split.
ran_for_regressor.fit(X_train, y_train)
predict = ran_for_regressor.predict(X_valid)

# Report the three regression metrics for the validation predictions.
print("R2 score: ", r2_score(y_valid, predict))
print("Mean squared error: ", mean_squared_error(y_valid, predict))
print("Mean absolute error: ", mean_absolute_error(y_valid, predict))

In [34]:
# Predict the test set with the fitted XGBoost model.
y_pred = XGB_model.predict(df_test)
# Wrap the predictions in a single-column frame (column label 0) for later joining.
df_y = pd.DataFrame(data=y_pred)
# Preview up to the first 20 predictions.
df_y.head(n=20)

Unnamed: 0,0
0,77.358063
1,77.280182
2,76.363525
3,80.225777
4,73.936134
5,87.747879
6,74.926468
7,72.005341
8,67.116898
9,76.587799


In [23]:
# Build the submission table: test Ids side by side with their predicted scores.
ids_with_scores = pd.concat([df_id, df_y], axis=1)
# The prediction column arrives with df_y's label; give it the target's name.
df = ids_with_scores.rename(
    columns={ids_with_scores.columns[1]: 'Habitability_score'}
)
# Round every predicted score to two decimal places.
df["Habitability_score"] = df["Habitability_score"].apply(
    lambda score: round(score, 2)
)
df.head()

Unnamed: 0,Id,Habitability_score
0,0x6808,77.36
1,0x6a98,77.28
2,0xacc0,76.36
3,0x8225,80.23
4,0xaee8,73.94


In [24]:
# Write the submission table to disk without the row index.
df.to_csv(path_or_buf='final.csv', index=False)