In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Default folder holding one raw CSV per city; pass `data_dir` to combined_df() to read from elsewhere.
_DEFAULT_DATA_DIR = Path("D:/DATA SCIENCE ZONE/Projects/guvi_final_project/Zomato-Data-Analysis-and-Visualization/datasets/city_wise_restaurants_data_set/")

# Column layout of every cleaned per-city frame.
_CITY_COLUMNS = ['Restaurant_Name','City','Locality','Address','Cuisines','Dining_Ratings','Dining_Votes','Delivery_Ratings','Delivery_Votes','Average_Cost_for_two','Contact_Number','Restaurant_Url']

# Column layout of the combined frame (before 'Average_Cost_for_two' is renamed).
_FINAL_COLUMNS = ['Restaurant_Name','City','Locality','Address','Cuisines','Dining_Ratings','Dining_Rating_Text','Dining_Votes','Delivery_Ratings','Delivery_Rating_Text','Delivery_Votes',"Has_Online_Delivery",'Average_Cost_for_two','Cost_Category','Contact_Number','Restaurant_Url']


def _votes_to_number(value):
    """Turn a vote count such as '1,234' or '2.5K' into a float; non-strings pass through unchanged."""
    if not isinstance(value, str):
        return value
    text = value.replace(',', '').strip().upper()
    if 'K' in text:
        return float(text.replace('K', '')) * 1000
    return float(text)


def _votes_to_int(votes):
    """Convert a votes column to int64, treating missing values as 0."""
    numeric = pd.to_numeric(votes.apply(_votes_to_number))
    # Round before casting: astype('int64') truncates, so a product that lands
    # just below a whole number due to float error would otherwise lose a vote.
    return numeric.fillna(0).round().astype('int64')


def _cost_to_float(cost):
    """Extract the first run of digits (after removing '₹' and ',') from each cost value as a float."""
    return (
        cost.astype(str)  # lets .str work even when a file's column was parsed as numeric or is all-NaN
        .str.replace('₹', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.extract(r'(\d+)', expand=False)
        .astype(float)
    )


def _ratings_to_float(ratings):
    """Map the placeholders '-' and 'New' to 0 and cast the ratings column to float."""
    return ratings.replace(['-', 'New'], 0).astype(float)


def _rating_text(rating):
    """Return the text label for a numeric rating; 0, negative and NaN ratings are 'Not rated'."""
    if 0 < rating < 2.5:
        return "Poor"
    if 2.5 <= rating < 3.5:
        return "Average"
    if 3.5 <= rating < 4:
        return "Good"
    if 4 <= rating < 4.5:
        return "Very Good"
    if rating >= 4.5:
        return "Excellent"
    return "Not rated"


def _cost_category(cost):
    """Return the price-band label for an average cost for two."""
    if cost <= 300:
        return "Lower-Priced"
    if cost <= 600:
        return "Budget-Friendly"
    if cost <= 900:
        return "Affordable"
    if cost <= 1500:
        return "Moderate"
    if cost <= 3000:
        return "Semi-Expensive"
    if cost <= 6000:
        return "Expensive"
    return "Luxurious"


def _clean_city_frame(raw):
    """Clean one raw per-city frame and return it with the `_CITY_COLUMNS` layout.

    The input frame is not modified. Rows whose cost cannot be parsed are dropped.
    """
    # 'Available_Time' is not used downstream; tolerate files that lack it.
    frame = raw.drop(columns=["Available_Time"], errors="ignore")
    frame = frame.assign(
        Average_Cost_for_two=_cost_to_float(frame['Average_Cost_for_two']),
        Dining_Ratings=_ratings_to_float(frame['Dining_Ratings']),
        Delivery_Ratings=_ratings_to_float(frame['Delivery_Ratings']),
        Dining_Votes=_votes_to_int(frame['Dining_Votes']),
        Delivery_Votes=_votes_to_int(frame['Delivery_Votes']),
        Address=frame['Address'].fillna("Not Available"),
        Contact_Number=frame['Contact_Number'].fillna("Not Available"),
    )
    return (
        frame.dropna(subset=['Average_Cost_for_two'])
        .reset_index(drop=True)
        .reindex(columns=_CITY_COLUMNS)
    )


def combined_df(data_dir=None):
    """Load, clean and combine the per-city restaurant CSV files into one DataFrame.

    Parameters
    ----------
    data_dir : str or Path, optional
        Folder containing one CSV per city. Defaults to the project's
        ``city_wise_restaurants_data_set`` folder.

    Returns
    -------
    pandas.DataFrame
        De-duplicated restaurants with numeric ratings, votes and cost, plus the
        derived columns ``Dining_Rating_Text``, ``Delivery_Rating_Text``,
        ``Cost_Category`` and ``Has_Online_Delivery``. The cost column is named
        ``Average_Cost_For_Two``.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` does not exist.
    ValueError
        If ``data_dir`` contains no CSV files.
    """
    source_dir = _DEFAULT_DATA_DIR if data_dir is None else Path(data_dir)

    # Read only CSV files, in sorted order, so stray files or sub-folders do not
    # break the load and the row order is reproducible across runs.
    csv_files = sorted(
        entry for entry in source_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".csv"
    )
    if not csv_files:
        raise ValueError(f"No CSV files found in {source_dir}")

    combined = pd.concat(
        [_clean_city_frame(pd.read_csv(csv_file)) for csv_file in csv_files],
        ignore_index=True,
    )

    combined = combined.assign(
        Delivery_Rating_Text=combined['Delivery_Ratings'].apply(_rating_text),
        Dining_Rating_Text=combined['Dining_Ratings'].apply(_rating_text),
        Cost_Category=combined['Average_Cost_for_two'].apply(_cost_category),
        # A restaurant with zero delivery votes is treated as not offering online delivery.
        Has_Online_Delivery=combined['Delivery_Votes'].apply(lambda votes: "No" if votes == 0 else "Yes"),
    )

    return (
        combined.drop_duplicates()
        .reset_index(drop=True)
        .reindex(columns=_FINAL_COLUMNS)
        .rename(columns={"Average_Cost_for_two": "Average_Cost_For_Two"})
    )
 

In [3]:
# Build the cleaned, combined restaurant table by calling combined_df() with its defaults.
df = combined_df()

In [71]:
# Write the cleaned table to disk as CSV, leaving out the DataFrame index.
cleaned_csv_path = Path(r"D:\DATA SCIENCE ZONE\Projects\guvi_final_project\Zomato-Data-Analysis-and-Visualization\datasets\cleaned_final_df.csv")
df.to_csv(cleaned_csv_path, index=False)