# Data loading and Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Zomato locations CSV from the Kaggle input directory.
zomato_csv = '/kaggle/input/zomato-restaurant-info-bengaluru/zomato_locations.csv'
locations = pd.read_csv(zomato_csv)
# Preview the first rows to check that the file parsed as expected.
locations.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
locations.info()
# (rows, columns) of the loaded table.
print(locations.shape)

# Checking Missing values

In [None]:
# Visual null check: every cell of the heatmap is one value of `locations`,
# coloured by whether it is missing.
sns.set()
null_mask = locations.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)

We see from the heatmap above that there are no missing values.

In [None]:
# Number of missing values per column (isna is the same check as isnull).
locations.isna().sum()

# Exploratory Analysis

In [None]:
# Correlation heatmap of the numeric columns only.
# The frame still contains the `Name` column (presumably text), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the
# numeric columns are selected explicitly before computing correlations.
numeric_corr = locations.select_dtypes(include='number').corr()
plt.figure(figsize=(8,4))
sns.heatmap(numeric_corr, cmap='Greens', annot=True)

We see that there is no correlation between the given features.

In [None]:
# Rows ordered from the highest avg_rating to the lowest.
data_sort = locations.sort_values('avg_rating', ascending=False)
data_sort.head()

In [None]:
# Histogram of avg_rating split into 10 bins.
ratings = data_sort['avg_rating']
plt.hist(ratings, bins=10)

i) We see that there are a total of 50 locations where restaurants received the highest rating.
ii) There is only one restaurant that got a zero rating.
iii) Most restaurants have a rating between 3 and 4.5.

In [None]:
# Sum of the `count` column for each distinct avg_rating value, drawn as bars.
k = data_sort.pivot_table(index='avg_rating', values='count', aggfunc='sum')
k.plot.bar()

i) We see that a total of 12000 restaurants have a rating of 3.8 and approximately 9000 restaurants have a rating of 4.1.

In [None]:
# Box plot of avg_rating across all rows of `locations`.
sns.catplot(data=locations, x='avg_rating', kind='box')

In [None]:
# Scatter of the rows by longitude/latitude, coloured by avg_rating.
sns.relplot(data=locations, x='lon', y='lat', hue='avg_rating')

# Linear Regression

In [None]:
import statsmodels.api as sm

In [None]:
# Regression inputs, selected by column position:
#   x -> columns 1, 2 and 3 as predictors; y -> column 4 as the response.
# NOTE(review): presumably count/lat/lon predicting avg_rating — confirm the column order.
predictor_positions = [1, 2, 3]
x = locations.iloc[:, predictor_positions].values
y = locations.iloc[:, 4].values

In [None]:
# Ordinary least squares of y on x. No constant column is added to `x` here,
# so the fitted model has no intercept term.
ols_spec = sm.OLS(endog=y, exog=x)
lin_model = ols_spec.fit()
lin_model.summary()

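We see that all regressors x1, x2 and x3 have p-values less than 0.05. Hence all three variables are significant.
Also, the R-squared and Adj. R-squared values are close to 1. Here R-squared is 0.93, so we say that the model explains 93% of the variation in the data. Note, however, that the model was fitted without an intercept term, so this is the uncentered R-squared, which is not comparable to the usual (centered) R-squared of a model with an intercept.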

# Residual Analysis

In [None]:
from scipy.stats import shapiro

# In-sample predictions of the fitted OLS model and their residuals.
y_pred = lin_model.predict(exog=x)
residual = np.subtract(y, y_pred)
# Shapiro-Wilk test; the null hypothesis is that the residuals are normally distributed.
shapiro(residual)

We see that the p-value is less than 0.05, hence we reject the null hypothesis: the residuals are not normally distributed.

In [None]:
# Distribution of the OLS residuals, using matplotlib's default binning.
plt.hist(x=residual)

In [None]:
# Residual analysis: print the mean residual, then draw the residuals in row order.
residual_mean = residual.mean()
print(residual_mean)
plt.plot(residual)

From the above analysis we conclude that the model does not satisfy the normality assumption. Hence the model is not suitable for prediction.

In [None]:
from sklearn.metrics import mean_squared_error

# In-sample mean squared error of the OLS predictions.
MSE = mean_squared_error(y_true=y, y_pred=y_pred)
print('The MSE is:', MSE)

# K-Means Clustering

In [None]:
# Copy of the table without the `Name` column.
data_c = locations.drop('Name', axis=1)
# Feature matrix for clustering: every column after the first one, as an array.
clustering_frame = locations.iloc[:, 1:]
x = clustering_frame.values

In [None]:
#feature scaling
# NOTE(review): every scaling step below is commented out, so this cell does
# nothing and the clustering cells run on the unscaled values of `x`.
# Distance-based clustering is sensitive to feature scale — confirm that
# leaving the features unscaled is intended.
#from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import MinMaxScaler
#sc=StandardScaler()
#sc=MinMaxScaler()
#x=sc.fit_transform(x)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method: fit k-means for k = 1..14 and record the within-cluster sum of
# squares (the fitted model's inertia) for each k.
cluster_counts = range(1, 15)
wcss = []
for i in cluster_counts:
    km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=300, random_state=0).fit(x)
    wcss.append(km.inertia_)

# Plot inertia against k; the bend in the curve suggests the cluster count.
fig, ax = plt.subplots()
ax.plot(cluster_counts, wcss)
ax.set_title('Elbow Method')
ax.set_xlabel('Number of cluster')
ax.set_ylabel('wcss')
plt.show()

We see that the optimal number of clusters is 5.

In [None]:
# Final k-means fit on `x`.
# NOTE(review): the elbow discussion in this notebook names 5 as the optimal
# cluster count, while 4 clusters are fitted here — confirm which is intended.
kmeans = KMeans(n_clusters=4, init="k-means++", n_init=10, max_iter=300, random_state=0)
kmeans.fit(x)
# Cluster index assigned to each row of `x`.
y_kmeans = kmeans.labels_

In [None]:
# Attach the k-means label to the clustered features and draw one line per
# row, coloured by cluster.
feature_names = ['count', 'lat', 'lon', 'avg_rating']
dd = pd.DataFrame(x, columns=feature_names).assign(cluster=y_kmeans)
pd.plotting.parallel_coordinates(dd, class_column='cluster')

In [None]:
#for visualization
plt.scatter(x[y_kmeans==0,0],x[y_kmeans==0,1],s=100,c="red",label="cluster-1")
plt.scatter(x[y_kmeans==1,0],x[y_kmeans==1,1],s=100,c="blue",label="cluster-2")
plt.scatter(x[y_kmeans==2,0],x[y_kmeans==2,1],s=100,c="green",label="cluster-3")
plt.scatter(x[y_kmeans==3,0],x[y_kmeans==3,1],s=100,c="cyan",label="cluster-4")
plt.scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],s=300,c="yellow",label="Centroids")
plt.title("clusters of resturants")
plt.legend()
plt.show()

We see that the data is clustered into 4 groups.

# AgglomerativeClustering

In [None]:
#for dendogram 
import scipy.cluster.hierarchy as sch
dendogram=sch.dendrogram(sch.linkage(x,method="ward"))
plt.title("Dendrogram")
plt.xlabel("data")
plt.ylabel("Euclidean Distances")
plt.show()

In [None]:
# Fit agglomerative (hierarchical) clustering with Ward linkage on `x`.
# The `affinity="euclidean"` argument is intentionally omitted: scikit-learn
# deprecated `affinity` in favour of `metric` and later removed it, so passing
# it raises a TypeError on current versions. Euclidean distance is the default
# and the only distance Ward linkage accepts, so the result is unchanged.
# NOTE(review): this cell was annotated "here we get 5 clusters" although
# n_clusters=4 is requested — confirm the intended value.
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=4, linkage="ward")
# Cluster index assigned to each row of `x`.
y_hc = hc.fit_predict(x)

In [None]:
#for Visualization
plt.scatter(x[y_hc==0,0],x[y_hc==0,1],s=100,c="red",label="cluster-1")
plt.scatter(x[y_hc==1,0],x[y_hc==1,1],s=100,c="blue",label="cluster-2")
plt.scatter(x[y_hc==2,0],x[y_hc==2,1],s=100,c="green",label="cluster-3")
plt.scatter(x[y_hc==3,0],x[y_hc==3,1],s=100,c="cyan",label="cluster-4")
plt.title("clusters of Resturants")
plt.legend()
plt.show()

We see that the data is clustered into 4 groups.