__Start with importing the necessary packages__

In [None]:
import pandas as pd
from numpy import percentile,quantile
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the train and test datasets to dataframes 
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [None]:
# Let pandas show up to 500 columns when it displays a DataFrame
pd.set_option("display.max_columns",500)

In [None]:
#Read the first 5 rows of df
df.head(5)

In [None]:
#Read the first 2 rows of test dataset
df_test.head(2)

In [None]:
# Read the columns present in the dataset
df.columns

In [None]:
#Print the datatypes for feature values in the dataset
df.dtypes

In [None]:
df.shape

In [None]:
# Split the feature names by dtype: object ('O') columns are treated as
# categorical, every other column as numeric.
column_dtypes = df.dtypes
num_cols = column_dtypes[column_dtypes != 'O'].index.tolist()
cat_cols = column_dtypes[column_dtypes == 'O'].index.tolist()

In [None]:
# Report which columns were classified as numeric and which as categorical
print(f"The numeric columns are : {num_cols}\n\nThe categorical columns are : {cat_cols}")

*Describe the numerical columns using summary statistics (count, mean, std, min, quartiles, max)*

In [None]:
df[num_cols].describe().T

*Describe the categorical variables*

In [None]:
df[cat_cols].describe().T

*Check for any NaN values present in the dataset*

In [None]:
# Heatmap of the boolean null mask: shows where missing values sit in the frame
plt.figure(figsize=(10,10))
sns.heatmap(df.isnull(),cmap="viridis")

In [None]:
df.isnull().sum()

_Print the number of unique values in each numeric variable_

In [None]:
# Tabulate how many distinct values each numeric column holds
l1 = list(num_cols)
l2 = [len(df[col].unique()) for col in num_cols]
dict1 = {'Numeric_column':l1,"Count_of_unique_vals":l2}
num_uniq = pd.DataFrame(dict1)
num_uniq

In [None]:
# Print the unique values of every numeric feature that has at most 10 of them
my_dict = {col: list(df[col].unique()) for col in num_cols}
new_dict = {col: values for col, values in my_dict.items() if len(values) <= 10}
print(new_dict)

In [None]:
# Print the frequency table of every numeric column
for ele in num_cols:
    print(f"The no. of unique value counts in\n{ele} are : {df[ele].value_counts()}")

*Some of the numerical features appear to contain outlier values*

In [None]:
#EDA
def plot_distplot(val):
    """Draw a histogram with a KDE overlay for ``val`` on its own 10x10 figure.

    Parameters
    ----------
    val : the values handed to ``sns.distplot`` (called below with one
        DataFrame column at a time).

    The figure is shown immediately; nothing is returned.
    """
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11
    # (histplot / displot are the replacements).
    figure_size = (10, 10)
    plot_options = {"hist": True, "kde": True, "color": "blue"}
    plt.figure(figsize=figure_size)
    sns.distplot(val, **plot_options)
    plt.show()

In [None]:
for ele in num_cols:
    plot_distplot(df[ele])

**After analysing the value counts of each numerical feature, we drop the rows
with unrealistic values for those features**

In [None]:
new_df = df.copy()

In [None]:
# Remove listings with more than 3 bathrooms, then those with more than 4 bedrooms
for room_col, max_count in (("bathrooms", 3), ("bedroom", 4)):
    too_many = new_df[new_df[room_col] > max_count].index
    new_df.drop(list(too_many), axis=0, inplace=True)

In [None]:
new_df.shape

In [None]:
# Distribution of `area` in new_df (histogram with KDE overlay), x tick labels rotated
plt.figure(figsize=(20,10))
sns.distplot(new_df.area,color="red",kde=True)
plt.xticks(rotation=90)

In [None]:
# Extreme point 3 std away from mean considered as upper limit
# NOTE(review): both limits are computed from the raw `df`, not from the
# already-filtered `new_df`, and the lower limit is the 45th percentile of
# area -- confirm that this is intended.
upper_limit = np.mean(df.area)+3*np.std(df.area)
lower_limit = np.percentile(df.area,45)
print(lower_limit,upper_limit)

In [None]:
# Keep only the rows whose area lies strictly between the two limits
area_in_range = new_df["area"].gt(lower_limit) & new_df["area"].lt(upper_limit)
new_df = new_df[area_in_range]
new_df.shape

In [None]:
# Keep aggDur values from the lower limit (inclusive) up to the 95th percentile
# of the raw data (exclusive)
lower_limit = 1
upper_limit = np.percentile(df.aggDur,95)
agg_dur_in_range = new_df.aggDur.ge(lower_limit) & new_df.aggDur.lt(upper_limit)
new_df = new_df[agg_dur_in_range]
new_df.shape

In [None]:
# Keep deposits strictly above the fixed floor and strictly below mean + 3 std
lower_limit = 1100
upper_limit = np.mean(new_df.deposit_amt) + 3*np.std(new_df.deposit_amt)
print(lower_limit,upper_limit)
deposit_in_range = new_df.deposit_amt.gt(lower_limit) & new_df.deposit_amt.lt(upper_limit)
new_df = new_df[deposit_in_range]
new_df.shape

In [None]:
# Collapse the individual room-count columns into one `total_rooms` feature and
# drop every other column whose name contains "room".
rooms = [col for col in new_df.columns if "room" in col and col != "total_rooms"]
new_df["total_rooms"] = (
    new_df["bedroom"] + new_df["bathrooms"] + new_df["no_room"] + new_df["pooja_room"]
    + new_df["study_room"] + new_df["store_room"] + new_df["servant_room"]
)
new_df = new_df.drop(columns=rooms)
new_df.shape

In [None]:
# Rent distribution for each total_rooms value
plt.figure(figsize=(20,10))
sns.boxplot(x=new_df.total_rooms,y=new_df.rent,orient="v",palette="rainbow")


In [None]:
# Rent distribution for each aggDur value
plt.figure(figsize=(20,10))
sns.boxplot(x=new_df.aggDur,y=new_df.rent,orient="v",palette="winter")


In [None]:
# Box plot of deposit_amt on a 20x20 figure, x tick labels rotated
plt.figure(figsize=(20,20))
sns.boxplot(x=new_df.deposit_amt,orient="v",palette="magma")
plt.xticks(rotation=80)

In [None]:
# Histogram of deposit_amt with 100 bins
plt.figure(figsize=(10,10))
plt.hist(new_df.deposit_amt,bins=100,color="red")

In [None]:
# Vertical box plot of the deposit amount.
# The series is passed by keyword as `y`: since seaborn 0.12 the first positional
# argument is read as `data`, not as the plotted vector, and `y=` with
# orient="v" draws a vertical box in old and new releases alike.
plt.figure(figsize=(5,5))
sns.boxplot(y=new_df.deposit_amt,orient="v",color="orange")

In [None]:
# Histogram of mnt_amt with 50 bins
plt.figure(figsize=(10,10))
plt.hist(new_df["mnt_amt"],bins=50,color="green")

In [None]:
# Box plot of mnt_amt, x tick labels rotated
plt.figure(figsize=(20,10))
sns.boxplot(x=new_df.mnt_amt,orient="v",color="purple")
plt.xticks(rotation=60)

In [None]:
# Row counts per furnishing value, split by petfacility
plt.figure(figsize=(20,20))
sns.countplot(x=new_df.furnishing,hue=new_df.petfacility,orient="v",palette="rainbow")
plt.xticks(rotation=30)

In [None]:
# Distribution of rent: 100-bin histogram with a KDE overlay
plt.figure(figsize=(10,10))
sns.distplot(new_df.rent,color="violet",bins=100,kde=True)

In [None]:
# Trim rent outliers: keep rows strictly within two standard deviations of the mean rent
rent_mean, rent_std = np.mean(new_df.rent), np.std(new_df.rent)
lower_limit = rent_mean - 2*rent_std
upper_limit = rent_mean + 2*rent_std
print(lower_limit,upper_limit)
rent_in_range = new_df.rent.gt(lower_limit) & new_df.rent.lt(upper_limit)
new_df = new_df[rent_in_range]
new_df.shape

In [None]:
# Correlation heatmap of the numeric features.
# Only numeric/bool columns are passed to corr(): new_df still holds string-valued
# columns here, and pandas >= 2.0 raises on them instead of silently skipping them.
plt.figure(figsize=(10,10))
sns.heatmap(new_df.select_dtypes(include=["number","bool"]).corr(),vmax=1,vmin=-1,annot=True,cmap="viridis")

In [None]:
# Pairwise correlations of the numeric/bool columns only; calling corr() on the
# whole frame raises on its string-valued columns under pandas >= 2.0.
corrmat = new_df.select_dtypes(include=["number","bool"]).corr()
corrmat

In [None]:
# Tabulate how many distinct values each remaining column of new_df holds
l1 = list(new_df.columns)
l2 = [len(new_df[col].unique()) for col in l1]
dict1 = {'Numeric_column':l1,"Count_of_unique_vals":l2}
num_uniq = pd.DataFrame(dict1)
num_uniq

## for ele in cat_cols:
    print("\n{} has these many values\n {}\n".format(ele,new_df[ele].value_counts()))

In [None]:
# Derive a `location` feature from the second comma-separated field of each
# address (leading whitespace removed), then discard the raw address column.
address_list = new_df["address"].tolist()
address_list[0]  # leftover peek at the first address; does not affect the result
loc_list = [address.split(",")[1].lstrip() for address in address_list]
new_df["location"] = np.array(loc_list)
new_df = new_df.drop(columns=["address"])

In [None]:
# Replace the placeholder categories with the most frequent value of the raw data.
# The result is assigned back to the column: calling replace(..., inplace=True) on a
# column selection is chained assignment, which can act on a temporary copy and
# leave new_df unchanged (always the case under pandas copy-on-write).
new_df["facing"] = new_df["facing"].replace(to_replace="No Direction",value=df["facing"].mode()[0])
new_df["propertyage"] = new_df["propertyage"].replace(to_replace="NO age",value=df["propertyage"].mode()[0])

In [None]:
#Graph based statistics for categorical features
print(cat_cols)

In [None]:
# Row counts per furnishing value, split by avalable_for.
# The column is passed as `x=`: since seaborn 0.12 the first positional argument
# is read as `data`, not as the x vector.
plt.figure(figsize=(10,10))
sns.countplot(x=new_df.furnishing,palette="viridis",hue=new_df.avalable_for)

In [None]:
# Row counts per facing value, split by avalable_for.
# The column is passed as `x=`: since seaborn 0.12 the first positional argument
# is read as `data`, not as the x vector.
plt.figure(figsize=(10,10))
sns.countplot(x=new_df.facing,palette="winter",hue=new_df.avalable_for)

In [None]:
# Row counts per floor_type value, split by furnishing.
# The column is passed as `x=`: since seaborn 0.12 the first positional argument
# is read as `data`, not as the x vector.
plt.figure(figsize=(10,10))
sns.countplot(x=new_df.floor_type,palette="copper",orient="v",hue=new_df.furnishing)
plt.xticks(rotation=45)

In [None]:
# Row counts per propertyage value, split by furnishing.
# The column is passed as `x=`: since seaborn 0.12 the first positional argument
# is read as `data`, not as the x vector.
plt.figure(figsize=(10,10))
sns.countplot(x=new_df.propertyage,palette="summer",hue=new_df.furnishing)

In [None]:
# Row counts per location.
# The column is passed as `x=`: since seaborn 0.12 the first positional argument
# is read as `data`, not as the x vector.
plt.figure(figsize=(20,20))
sns.countplot(x=new_df.location,palette="spring",orient="h")
plt.xticks(rotation=80)

In [None]:
# Row counts per avalable_for value, split by furnishing.
# The column is passed as `x=`: since seaborn 0.12 the first positional argument
# is read as `data`, not as the x vector.
plt.figure(figsize=(10,10))
sns.countplot(x=new_df.avalable_for,palette="spring",hue=new_df.furnishing,orient="h")
plt.xticks(rotation=45)

In [None]:
# Row counts per avalable_for value, split by gate_community.
# The column is passed as `x=`: since seaborn 0.12 the first positional argument
# is read as `data`, not as the x vector.
plt.figure(figsize=(10,10))
sns.countplot(x=new_df.avalable_for,palette="magma",hue=new_df.gate_community)
plt.xticks(rotation=45)

In [None]:
# Row counts per avalable_for value, split by petfacility.
# The column is passed as `x=`: since seaborn 0.12 the first positional argument
# is read as `data`, not as the x vector.
plt.figure(figsize=(10,10))
sns.countplot(x=new_df.avalable_for,palette="cividis",hue=new_df.petfacility)
plt.xticks(rotation=30)

In [None]:
# Collapse the intercardinal directions into plain North / South.
# replace() is called WITHOUT inplace=True: with inplace=True it returns None, and
# assigning that result back overwrote the whole `facing` column with missing values.
new_df["facing"] = new_df["facing"].replace({
    "South-East": "South",
    "South-West": "South",
    "North-East": "North",
    "North-West": "North",
})

In [None]:
new_df.facing.value_counts()

In [None]:
# Replace the "Not provided" placeholder with the most frequent floor type.
# No inplace=True here: replace(..., inplace=True) returns None, so assigning its
# result back wiped the whole `floor_type` column.
new_df["floor_type"] = new_df["floor_type"].replace(to_replace="Not provided",
                                                    value=new_df["floor_type"].mode()[0])

In [None]:
new_df.location.value_counts()[:15]

In [None]:
# Target (mean) encoding: represent each location by the mean rent of its rows.
# Series.map looks every location up in the per-location means directly, which is
# cheaper than a list-to-list replace() and always yields a float column.
# NOTE(review): the means are computed on all rows, including those that are
# later held out by train_test_split.
loc_mean = new_df.groupby("location")["rent"].mean()
new_df["location_encoded"] = new_df["location"].map(loc_mean)
new_df.location_encoded.head(5)

In [None]:
new_df.drop("location",axis=1,inplace=True)
new_df_copy = new_df.copy()

In [None]:
# Integer-encode the facing direction; values that are not keys of `directions`
# become NaN after map()
directions ={"East":1,"West":2,"North":3,"South":4}
new_df_copy["facing"] = new_df_copy["facing"].map(directions)

In [None]:
# Frequency encoding for Monthly Amount Feature: every value is replaced by the
# share of rows that carry it, and the raw column is dropped
fe = new_df_copy.groupby('mnt_amt').size().div(len(new_df_copy))
new_df_copy["mnt_amt_encoded"] = new_df_copy["mnt_amt"].map(fe)
new_df_copy = new_df_copy.drop(columns=["mnt_amt"])
new_df_copy.head(4)

In [None]:
# Import label encoder
from sklearn import preprocessing

# One encoder object, re-fitted for each column in turn, turns the values of
# 'floor_type' and 'maintenance_amt' into integer labels.
label_encoder = preprocessing.LabelEncoder()
for encoded_col in ('floor_type', 'maintenance_amt'):
    new_df_copy[encoded_col] = label_encoder.fit_transform(new_df_copy[encoded_col])

In [None]:
# Ordinal-encode furnishing: Unfurnished=0, Semifurnished=1, Furnished=2
dict2 = {"Unfurnished":0,"Semifurnished":1,"Furnished":2}
new_df_copy["furnishing"] = new_df_copy["furnishing"].map(dict2)
new_df_copy.furnishing.head(3)

In [None]:
# Frequency encoding for Available_for Feature: every value is replaced by the
# share of rows that carry it, and the raw column is dropped
fe = new_df_copy.groupby('avalable_for').size().div(len(new_df_copy))
new_df_copy["avalable_for_encoded"] = new_df_copy["avalable_for"].map(fe)
new_df_copy = new_df_copy.drop(columns=["avalable_for"])
new_df_copy.head(4)

In [None]:
# Binary encoding (Yes -> 1, No / None -> 0) for gate_community, corner_pro,
# wheelchairadption and petfacility
dict3 = {'Yes':1,"No":0,"None":0}
for flag_col in ("gate_community", "corner_pro", "wheelchairadption", "petfacility"):
    new_df_copy[flag_col] = new_df_copy[flag_col].map(dict3)
new_df_copy.head(3)

In [None]:
#new_df_copy.propertyage.value_counts()
# Ordinal-encode property age, from "Under Construction" (0) to "10+ Year Old" (4)
dict1 = {
    "Under Construction":0,
    "0 to 1 Year Old":1,
    "1 to 5 Year Old":2,
    "5 to 10 Year Old":3,
    "10+ Year Old":4
}
new_df_copy.propertyage = new_df_copy.propertyage.map(dict1)
new_df_copy.head(5)

In [None]:
# Log-transform the deposit amount: computed once, plotted, then stored back
log_deposit = np.log(new_df_copy.deposit_amt)
sns.distplot(log_deposit)
new_df_copy["deposit_amt"] = log_deposit
new_df_copy.deposit_amt.head(5)

In [None]:
new_df_copy.reset_index(inplace=True)
new_df_copy.head()

In [None]:
new_df_copy.drop(["index"],inplace=True,axis=1)

In [None]:
# Snapshot the encoded frame and write it to Final_processed_dataset.csv
# NOTE(review): to_csv returns None when it is given a path, so `final_csv` holds None.
final_df = new_df_copy.copy()
final_csv =final_df.to_csv("Final_processed_dataset.csv")

In [None]:
# MinMax Scaler is used for normalizing the feature space
# NOTE(review): the scaler is fitted on every row and every column of new_df_copy
# (the `rent` target included) before any train/test split, so held-out rows
# influence the scaling and the reported errors are in scaled-rent units.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(new_df_copy)

In [None]:
scaled_df = pd.DataFrame(data=scaled_data,columns=new_df_copy.columns)
scaled_df.head()

In [None]:
X_train = scaled_df.drop(["rent"],axis=1)
X_train.head(3)

In [None]:
y_train = scaled_df.loc[:,"rent"]
y_train.head()

In [None]:
#Faking the train_test_split on the training data itself and perform model creation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,r2_score,mean_squared_error,mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

In [None]:
# Hold out 30% of the rows for evaluation (fixed seed keeps the split repeatable)
X_train1,X_test1,y_train1,y_test1 = train_test_split(X_train,y_train,test_size=0.3,random_state=0)

# Seed the tree as well: scikit-learn randomly permutes the candidate features at
# every split, so an unseeded tree can score differently from run to run.
dt_reg = DecisionTreeRegressor(random_state=0)

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(X_train1,y_train1)

In [None]:
y_lin_pred = lin_reg.predict(X_test1)
print(r2_score(y_test1,y_lin_pred))
print(mean_squared_error(y_test1,y_lin_pred))
print(mean_absolute_error(y_test1,y_lin_pred))

*Ridge regressor with hyperparameter tuning*

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid_params ={
    "alpha":[0.1,0.001,0.25,0.5,0.005],
    "solver":["auto","svd","cholesky","lsqr","sparse_cg","sag","saga"]
}


In [None]:
ridge_model = Ridge()
grid_ridge = GridSearchCV(ridge_model,param_grid=grid_params,scoring="neg_mean_squared_error",cv=10,verbose=2)
grid_ridge.fit(X_train1,y_train1)

In [None]:
# Evaluate the tuned ridge model on the hold-out set (R^2, MSE, MAE).
# NOTE: this reuses `y_lin_pred`, replacing the plain linear-regression
# predictions that an earlier cell stored under the same name.
y_lin_pred = grid_ridge.predict(X_test1)
print(r2_score(y_test1,y_lin_pred))
print(mean_squared_error(y_test1,y_lin_pred))
print(mean_absolute_error(y_test1,y_lin_pred))

*Random forest regressor*

In [None]:
# Baseline random forest with default hyper-parameters.
# random_state pins the bootstrap samples and the per-split feature shuffling so
# the scores printed below are repeatable across runs.
rf_reg = RandomForestRegressor(random_state=0)
rf_reg.fit(X_train1,y_train1)
y_rf_pred = rf_reg.predict(X_test1)
# Hold-out R^2, MSE and MAE
print(r2_score(y_test1,y_rf_pred))
print(mean_squared_error(y_test1,y_rf_pred))
print(mean_absolute_error(y_test1,y_rf_pred))

*Hyperparameter tuning for Random Forest Regressor*

In [None]:
# Hyper-parameter grid for the random forest.
# scikit-learn 1.0 renamed the regression criteria ("mse" -> "squared_error",
# "mae" -> "absolute_error") and 1.2 removed the old spellings, so pick the names
# that the installed release understands.
import sklearn

if int(sklearn.__version__.split(".")[0]) >= 1:
    rf_criteria = ["squared_error", "absolute_error"]
else:
    rf_criteria = ["mse", "mae"]

grid_params = {
    "n_estimators":[100],
    "criterion":rf_criteria,
    "max_depth": [2,5,10],
    "min_samples_split" : [2,5,10],
    "min_samples_leaf": [2,5,10]
}

In [None]:
grid_rf = GridSearchCV(rf_reg,param_grid=grid_params,verbose=2,cv=10,scoring="neg_median_absolute_error")
grid_rf.fit(X_train1,y_train1)

In [None]:
grid_rf.best_params_

In [None]:
y_rf_pred = grid_rf.predict(X_test1)
print(r2_score(y_test1,y_rf_pred))
print(mean_squared_error(y_test1,y_rf_pred))
print(mean_absolute_error(y_test1,y_rf_pred))

*Decision Tree Regressor*

In [None]:
dt_reg.fit(X_train1,y_train1)
y_dt_pred = dt_reg.predict(X_test1)
print(r2_score(y_test1,y_dt_pred))
print(mean_squared_error(y_test1,y_dt_pred))
print(mean_absolute_error(y_test1,y_dt_pred))

*Hyperparameter tuning for Decision Tree Regressor*

In [None]:
# Grid search over the decision-tree hyper-parameters (10-fold CV, scored by
# negative MSE).
# scikit-learn 1.0 renamed the regression criteria ("mse" -> "squared_error",
# "mae" -> "absolute_error") and 1.2 removed the old spellings, so pick the names
# that the installed release understands.
import sklearn
from sklearn.model_selection import GridSearchCV

if int(sklearn.__version__.split(".")[0]) >= 1:
    dt_criteria = ["squared_error", "absolute_error", "friedman_mse"]
else:
    dt_criteria = ["mse", "mae", "friedman_mse"]

grid_params = {
    "criterion":dt_criteria,
    "max_depth": [2,5,10],
    "min_samples_split" : [2,5],
    "min_samples_leaf": [2,5]
}
grid_dt = GridSearchCV(dt_reg,param_grid=grid_params,scoring="neg_mean_squared_error",
                       cv=10,verbose=2)
grid_dt.fit(X_train1,y_train1)


In [None]:
grid_dt.best_params_

In [None]:
y_dt_pred = grid_dt.predict(X_test1)
print(r2_score(y_test1,y_dt_pred))
print(mean_squared_error(y_test1,y_dt_pred))
print(mean_absolute_error(y_test1,y_dt_pred))

*Use the XGBoost ensemble technique for model creation as well (fitted here with fixed, hand-picked hyperparameters)*

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted tree regressor with hand-picked hyper-parameters
# (no parameter search is run in this cell), fitted on the training split
xg_reg = xgb.XGBRegressor(colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10,booster="gbtree")
xg_reg.fit(X_train1,y_train1)

In [None]:
y_xg_pred = xg_reg.predict(X_test1)
print(r2_score(y_test1,y_xg_pred))
print(mean_squared_error(y_test1,y_xg_pred))
print(mean_absolute_error(y_test1,y_xg_pred))

*SVR Regressor with hyperparameter tuning*

In [None]:
from sklearn.svm import SVR
grid_params ={
    "kernel":["rbf","linear","poly"], 
    "degree":[3],
    "C":[1,2,5,10,50,100], 
    "epsilon":[0.1]
    
}
svr_model = SVR()
grid_svr = GridSearchCV(svr_model,param_grid=grid_params,scoring="neg_mean_squared_error",cv=10,
                        verbose=2)
grid_svr.fit(X_train1,y_train1)

In [None]:
grid_svr.best_params_

In [None]:
y_svr_pred = grid_svr.predict(X_test1)
print(r2_score(y_test1,y_svr_pred))
print(mean_squared_error(y_test1,y_svr_pred))
print(mean_absolute_error(y_test1,y_svr_pred))

In [None]:
sns.distplot(y_test1-y_xg_pred)

In [None]:
plt.figure(figsize=(10,10))
plt.hist([y_test1,y_xg_pred],bins=50,label=["target","target_pred_xg"],histtype="bar")
plt.legend(loc='upper right')

In [None]:
sns.distplot(y_test1-y_rf_pred)

In [None]:
sns.distplot(y_test1-y_dt_pred)

In [None]:
sns.distplot(y_test1-y_lin_pred)

In [None]:
plt.figure(figsize=(10,10))
plt.hist([y_test1,y_lin_pred],bins=50,label=["target","target_pred_ridge"],histtype="bar")
plt.legend(loc='upper right')

In [None]:
# Persist the fitted random-forest grid search to disk.
# The context manager closes the file handle (flushing the pickle to disk) even
# if dump() raises; the original left the handle open.
import pickle
with open("rent_rf_grid_regressor_model.pkl","wb") as file:
    pickle.dump(grid_rf,file)

In [None]:
df.propertyage.value_counts()