##### Importing required libraries 

In [None]:
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
import re
import numpy as np

##### Reading the house data CSV file

In [None]:
# Load the listings file into a DataFrame and preview its first five rows.
df1 = pd.read_csv(filepath_or_buffer="House_Data_20.csv")
df1.head(5)

In [None]:
# Number of rows loaded from the CSV.
df1.shape[0]

In [None]:
# Summarise the raw frame: column names, dtypes and non-null counts.
df1.info()

##### Cleaning the Area column and keeping only its numeric value

In [None]:
# Keep only the digits of the area text and convert the result to a number.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched
# and make `pd.to_numeric` fail on the remaining non-numeric characters.
# Note: `\D` also removes decimal points, so "1200.5" would become 12005.
df1['Area(sqft)'] = pd.to_numeric(
    df1['Area(sqft)'].str.replace(r'\D', '', regex=True)
)
df1['Area(sqft)']

##### Removing the rupee symbol (₹) and the "," separators from the Rent column

In [None]:

# Strip the currency symbol and the thousands separators from the rent text.
# The vectorised `.str` methods pass missing values through unchanged, whereas
# `apply(lambda x: x.strip(...))` raises AttributeError on NaN (nulls are only
# filled in a later cell). `regex=False` makes the comma a literal on every
# pandas version.
df1["Rent(Rs)"] = (
    df1["Rent(Rs)"]
    .str.strip("₹")
    .str.replace(",", "", regex=False)
)
df1["Rent(Rs)"]

In [None]:
# Preview the frame after the Area and Rent columns were cleaned.
df1.head()

In [None]:
# Replace every missing value in the dataset with 0
# (rebinding the name instead of mutating the frame in place).
df1 = df1.fillna(value=0)

In [None]:
# Re-check dtypes and non-null counts after the nulls were filled with 0.
df1.info()

In [None]:
# Remove listings whose Description is not unique.
# NOTE(review): keep=False discards *every* copy of a repeated description
# (no representative row is kept) — confirm this is intended rather than
# keep="first".
df1 = df1.drop_duplicates(subset="Description", keep=False)
df1.shape[0]

In [None]:
# Order rows by latitude, then by rent, and preview the result.
# NOTE(review): "Rent(Rs)" still holds text at this point (it is converted to
# integers in a later cell), so the secondary sort is presumably lexicographic
# rather than numeric — confirm.
df1 = df1.sort_values(by=["Latitude", "Rent(Rs)"])
df1.head(5)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

### BHK

In [None]:
# Bar chart of listing counts per BHK value, most frequent first.
bhk_order = df1["BHK"].value_counts().index
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(x=df1["BHK"], order=bhk_order, ax=ax)
plt.show()


In [None]:
# Percentage share of each BHK value.
df1["BHK"].value_counts(normalize=True).mul(100)

In [None]:
# Summary statistics of the BHK column.
df1["BHK"].describe()

### Area

In [None]:
# Histogram of the cleaned area values.
fig, ax = plt.subplots(figsize=(15, 7))
sns.histplot(x=df1["Area(sqft)"], ax=ax)
plt.show()

##### The distribution appears to be right-skewed.

In [None]:
# Widen the column display so full descriptions are readable, then show the
# descriptions of the listings whose area is 0.
pd.set_option("display.max_colwidth", 2000)
zero_area_mask = df1["Area(sqft)"] == 0
df1.loc[zero_area_mask, "Description"]

In [None]:
# Keep only listings with a non-zero area (0 is the placeholder written by the
# earlier fillna). The explicit .copy() makes df2 an independent frame, so the
# later column assignment on df2 does not act on a slice of df1 and does not
# raise SettingWithCopyWarning.
df2 = df1.loc[df1["Area(sqft)"] != 0].copy()

In [None]:
# Histogram of the natural log of the area.
log_area = np.log(df2["Area(sqft)"])
fig, ax = plt.subplots(figsize=(15, 7))
sns.histplot(log_area, ax=ax)
plt.show()

### Rent

In [None]:
# Histogram of the rent column.
# NOTE(review): "Rent(Rs)" is still text here (it is converted to integers in
# a later cell), so this histogram may be drawn over categories rather than a
# numeric axis — confirm.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(df2["Rent(Rs)"], ax=ax)
plt.show()

##### The distribution is right-skewed, so we apply a log transformation to bring it closer to normal.

In [None]:
# Inspect the distinct rent values to spot non-numeric formats.
pd.unique(df2["Rent(Rs)"])

In [None]:
# Convert the rent text to integer rupees. A value carrying the " Lac" suffix
# is a multiple of 1e5 (e.g. "1.2 Lac" -> 120000); anything else is parsed as
# a plain number.
# Changes from the previous pd.eval-based version:
#   * the text is parsed with vectorised string operations, so file content
#     is never evaluated as an expression; unparseable text raises ValueError
#     from pd.to_numeric
#   * the product is rounded before the integer cast, so float error cannot
#     truncate a value (1.15 * 1e5 is 114999.99999999999 in floating point)
#   * df2 is rebound through assign() instead of assigning a column on a
#     filtered slice, which avoids SettingWithCopyWarning
#   * the cell can be re-run on an already converted column
rent_text = df2["Rent(Rs)"].astype(str).str.strip()
rent_in_lac = rent_text.str.contains(" Lac", regex=False)
rent_number = pd.to_numeric(
    rent_text.str.replace(" Lac", "", regex=False).str.strip()
)
rent_rupees = rent_number.where(~rent_in_lac, rent_number * 1e5)
df2 = df2.assign(**{"Rent(Rs)": rent_rupees.round().astype(int)})

In [None]:
# Histogram of the natural log of the rent.
log_rent = np.log(df2["Rent(Rs)"])
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(log_rent, ax=ax)
plt.show()

### Checking how the rent of flats varies with the number of rooms

In [None]:
# Bar chart of rent against BHK.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=df2["BHK"], y=df2["Rent(Rs)"], ax=ax)
plt.show()

##### The data shows that 4 BHK flats are costlier than 5 BHK flats 😱 — something many of us would not have expected 🤔

### Checking how the price varies with Area of the house

In [None]:
# Bar chart of rent against area (one bar per distinct area value).
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=df2["Area(sqft)"], y=df2["Rent(Rs)"], ax=ax)
plt.show()

In [None]:
# Scatter plot of rent against area, before outlier removal.
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(x=df2["Area(sqft)"], y=df2["Rent(Rs)"], ax=ax)
plt.show()

##### We can see that there are many outliers: a 400 sqft flat costs nearly 2.5 Lakh 😨, while a 500 sqft flat costs approximately 75K 😐. We will use some methods to remove these outliers.



##### Treating outliers in Rent with the IQR method.

In [None]:
# Quartiles and interquartile range of the rent column.
rent_values = df2["Rent(Rs)"]
Q1, Q3 = rent_values.quantile(0.25), rent_values.quantile(0.75)
IQR = Q3 - Q1
print("Q1: ", Q1, "Q3:", Q3, "IQR:", IQR)

In [None]:
# Collect the rent values lying more than 1.5 * IQR outside the quartiles.
rent_lower_fence = Q1 - (IQR * 1.5)
rent_upper_fence = Q3 + (IQR * 1.5)
outliers = [
    rent
    for rent in df2["Rent(Rs)"]
    if rent < rent_lower_fence or rent > rent_upper_fence
]
len(outliers)

##### We have found 20 values that are outliers. Now we'll remove them.

In [None]:
# `removed` lists the rent values inside the IQR fences; df3 keeps the rows
# whose rent is not among the previously collected outlier values.
removed = [
    rent
    for rent in df2["Rent(Rs)"]
    if (Q1 - (IQR * 1.5)) <= rent <= (Q3 + (IQR * 1.5))
]
is_rent_outlier = df2["Rent(Rs)"].isin(outliers)
df3 = df2[~is_rent_outlier]

##### Treating outliers in Area

In [None]:
# IQR-based outlier removal on the area column.
# The quartiles are computed on df3 — the same frame that is filtered below —
# so the fences describe the data that remains after the rent outliers were
# dropped (previously they were taken from df2 while the filter ran on df3).
Q1 = df3["Area(sqft)"].quantile(0.25)
Q3 = df3["Area(sqft)"].quantile(0.75)
IQR = Q3 - Q1
print("Q1: ",Q1, "Q3:",Q3,"IQR:",IQR)

outliers_ar = [x for x in df3["Area(sqft)"] if x<(Q1 - (IQR*1.5)) or x >(Q3+ (IQR*1.5))]
df4= df3[~df3["Area(sqft)"].isin(outliers_ar)]
# Report the number of *area* outliers as the cell output. The old
# `len(outliers)` counted the rent outliers and, sitting mid-cell, was never
# displayed.
len(outliers_ar)

In [None]:
# Scatter plot of rent against area after both outlier filters.
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(x=df4["Area(sqft)"], y=df4["Rent(Rs)"], ax=ax)
plt.show()

#### Now we can see that the graph looks quite decent and the rent varies with area in a sensible manner.

In [None]:
# Drop listings whose longitude or latitude is 0
# (0 is the placeholder written by the earlier fillna).
df5 = df4.loc[df4["Longitude"].ne(0)]
df6 = df5.loc[df5["Latitude"].ne(0)]

In [None]:
# Scatter of listing coordinates, coloured by rent.
# NOTE(review): the x axis is "Longitude" but is limited to 12.6-13.40 —
# confirm these limits match the values stored in that column.
ax = df6.plot.scatter(
    x="Longitude",
    y="Latitude",
    c="Rent(Rs)",
    cmap="jet",
    colorbar=True,
    alpha=0.4,
    figsize=(12, 12),
    sharex=False,
)
ax.set_xlim(12.6, 13.40)


##### We can see from the data that listings at the outer coordinates have lower rents, whereas those at the central coordinates are more costly. 🤔 🤗

##### Exporting the above data to CSV file

In [None]:
# Write the cleaned listings to disk without the index column.
df6.to_csv(path_or_buf="Final CSV for Banglore Rental.csv", index=False)