In [1]:
import pandas as pd

# Load the dataset
file_path = 'corpus.csv'
data = pd.read_csv(file_path)

# Print the column summary. DataFrame.info() writes to stdout and returns None,
# so it is called as a statement rather than placed in the cell's output
# expression (where it would only contribute a stray `None`).
data.info()

# Preview the first rows as the cell's (rich) output
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        325 non-null    int64  
 1   Zone                              325 non-null    object 
 2   State                             325 non-null    object 
 3   City                              325 non-null    object 
 4   Name                              325 non-null    object 
 5   Type                              325 non-null    object 
 6   Establishment Year                325 non-null    object 
 7   time needed to visit in hrs       325 non-null    float64
 8   Google review rating              325 non-null    float64
 9   Entrance Fee in INR               325 non-null    int64  
 10  Airport with 50km Radius          325 non-null    object 
 11  Weekly Off                        32 non-null     object 
 12  Signific

(   Unnamed: 0      Zone  State   City                  Name          Type  \
 0           0  Northern  Delhi  Delhi            India Gate  War Memorial   
 1           1  Northern  Delhi  Delhi        Humayun's Tomb          Tomb   
 2           2  Northern  Delhi  Delhi     Akshardham Temple        Temple   
 3           3  Northern  Delhi  Delhi  Waste to Wonder Park    Theme Park   
 4           4  Northern  Delhi  Delhi         Jantar Mantar   Observatory   
 
   Establishment Year  time needed to visit in hrs  Google review rating  \
 0               1921                          0.5                   4.6   
 1               1572                          2.0                   4.5   
 2               2005                          5.0                   4.6   
 3               2019                          2.0                   4.1   
 4               1724                          2.0                   4.2   
 
    Entrance Fee in INR Airport with 50km Radius Weekly Off   Significan

In [2]:
# 'Unnamed: 0' is just the index that was saved into the CSV; discard it
data_cleaned = data.drop('Unnamed: 0', axis=1)

# Per-column count of missing entries
missing_values = data_cleaned.isna().sum()

# Show a preview of the cleaned frame together with the missing-value counts
data_cleaned.head(), missing_values


(       Zone  State   City                  Name          Type  \
 0  Northern  Delhi  Delhi            India Gate  War Memorial   
 1  Northern  Delhi  Delhi        Humayun's Tomb          Tomb   
 2  Northern  Delhi  Delhi     Akshardham Temple        Temple   
 3  Northern  Delhi  Delhi  Waste to Wonder Park    Theme Park   
 4  Northern  Delhi  Delhi         Jantar Mantar   Observatory   
 
   Establishment Year  time needed to visit in hrs  Google review rating  \
 0               1921                          0.5                   4.6   
 1               1572                          2.0                   4.5   
 2               2005                          5.0                   4.6   
 3               2019                          2.0                   4.1   
 4               1724                          2.0                   4.2   
 
    Entrance Fee in INR Airport with 50km Radius Weekly Off   Significance  \
 0                    0                      Yes        NaN     Hi

In [3]:
# Inspect the cleaned frame; the rendered output elides the middle rows.
data_cleaned

Unnamed: 0,Zone,State,City,Name,Type,Establishment Year,time needed to visit in hrs,Google review rating,Entrance Fee in INR,Airport with 50km Radius,Weekly Off,Significance,DSLR Allowed,Number of google review in lakhs,Best Time to visit
0,Northern,Delhi,Delhi,India Gate,War Memorial,1921,0.5,4.6,0,Yes,,Historical,Yes,2.60,Evening
1,Northern,Delhi,Delhi,Humayun's Tomb,Tomb,1572,2.0,4.5,30,Yes,,Historical,Yes,0.40,Afternoon
2,Northern,Delhi,Delhi,Akshardham Temple,Temple,2005,5.0,4.6,60,Yes,,Religious,No,0.40,Afternoon
3,Northern,Delhi,Delhi,Waste to Wonder Park,Theme Park,2019,2.0,4.1,50,Yes,Monday,Environmental,Yes,0.27,Evening
4,Northern,Delhi,Delhi,Jantar Mantar,Observatory,1724,2.0,4.2,15,Yes,,Scientific,Yes,0.31,Morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,Western,Gujarat,Gandhinagar,Akshardham,Temple,1992,3.0,4.6,0,Yes,Monday,Religious,No,0.18,All
321,Central,Uttar Pradesh,Agra,Agra Fort,Fort,1565,2.0,4.5,40,Yes,,Historical,Yes,1.30,Afternoon
322,Central,Madhya Pradesh,Bhopal,Madhya Pradesh Tribal Museum,Museum,2013,2.0,4.7,10,Yes,Monday,Cultural,Yes,0.15,All
323,Northern,Rajasthan,Jaipur,City Palace,Palace,1727,2.0,4.4,200,Yes,,Historical,Yes,0.51,Morning


In [4]:
# Placeholder year used for rows whose 'Establishment Year' is not a number.
# NOTE(review): this is an arbitrary stand-in, not real data - keep that in
# mind before using the column for analysis or as a model feature.
DEFAULT_ESTABLISHMENT_YEAR = 1800

# Convert 'Establishment Year' to an integer column in one assignment:
# non-numeric entries become NaN, NaN becomes the placeholder, then cast.
# The result is assigned back to the frame instead of calling
# `.fillna(..., inplace=True)` on a column selection, which is chained
# assignment: it warns on recent pandas and does not update the frame under
# Copy-on-Write, leaving NaN behind for the integer cast to fail on.
data_cleaned['Establishment Year'] = (
    pd.to_numeric(data_cleaned['Establishment Year'], errors='coerce')
    .fillna(DEFAULT_ESTABLISHMENT_YEAR)
    .astype(int)
)

# Normalize text data: convert all text columns to lowercase.
# This runs after the year conversion so that column is no longer selected.
text_columns = data_cleaned.select_dtypes(include=['object']).columns
data_cleaned[text_columns] = data_cleaned[text_columns].apply(lambda x: x.str.lower())

# Display the updated dataset to confirm changes
data_cleaned.head(), data_cleaned['Establishment Year'].dtype


(       Zone  State   City                  Name          Type  \
 0  northern  delhi  delhi            india gate  war memorial   
 1  northern  delhi  delhi        humayun's tomb          tomb   
 2  northern  delhi  delhi     akshardham temple        temple   
 3  northern  delhi  delhi  waste to wonder park    theme park   
 4  northern  delhi  delhi         jantar mantar   observatory   
 
    Establishment Year  time needed to visit in hrs  Google review rating  \
 0                1921                          0.5                   4.6   
 1                1572                          2.0                   4.5   
 2                2005                          5.0                   4.6   
 3                2019                          2.0                   4.1   
 4                1724                          2.0                   4.2   
 
    Entrance Fee in INR Airport with 50km Radius Weekly Off   Significance  \
 0                    0                      yes        NaN 

In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature groups: numeric columns are standardised, categorical columns are
# one-hot encoded.
numerical_columns = [
    'time needed to visit in hrs',
    'Google review rating',
    'Entrance Fee in INR',
    'Number of google review in lakhs',
    'Establishment Year',
]
categorical_columns = [
    'Zone',
    'State',
    'City',
    'Type',
    'Airport with 50km Radius',
    'Weekly Off',
    'Significance',
    'DSLR Allowed',
    'Best Time to visit',
]

# One transformer per feature group
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

# Route each column group to its transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# Learn the scaling/encoding from the cleaned frame and apply it in one step
preprocessed_data = preprocessor.fit_transform(data_cleaned)

# Inspect the resulting matrix and its dimensions
preprocessed_data.shape, preprocessed_data


((325, 368),
 <325x368 sparse matrix of type '<class 'numpy.float64'>'
 	with 4293 stored elements in Compressed Sparse Row format>)

In [6]:
def recommend_places(data, preferred_type=None, preferred_zone=None, max_budget=None, max_duration=None, top_n=10):
    """Return the best-rated places that satisfy the given preferences.

    Every filter is optional; a filter left at ``None`` (or, for the text
    filters, an empty string) is skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Type', 'Zone', 'Entrance Fee in INR',
        'time needed to visit in hrs', 'Google review rating' and
        'Number of google review in lakhs'.
    preferred_type : str, optional
        Substring to look for in 'Type'. Matched literally (not as a regular
        expression) and case-insensitively.
    preferred_zone : str, optional
        Substring to look for in 'Zone'. Matched like ``preferred_type``.
    max_budget : number, optional
        Keep rows whose 'Entrance Fee in INR' is at most this value.
    max_duration : number, optional
        Keep rows whose 'time needed to visit in hrs' is at most this value.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` matching rows, sorted by 'Google review rating' and
        then 'Number of google review in lakhs', both descending. The input
        frame is not modified.
    """
    # Text filters: regex=False so characters such as '(' or '.' in the user's
    # input are taken literally instead of raising or over-matching;
    # case=False so the frame does not have to be lowercased beforehand;
    # na=False so rows with a missing value are excluded rather than putting
    # NaN into the boolean mask (which boolean indexing rejects).
    if preferred_type:
        type_match = data['Type'].str.contains(preferred_type, case=False, regex=False, na=False)
        data = data[type_match]

    if preferred_zone:
        zone_match = data['Zone'].str.contains(preferred_zone, case=False, regex=False, na=False)
        data = data[zone_match]

    # Numeric filters use `is not None` so that 0 is honoured as a real limit
    if max_budget is not None:
        data = data[data['Entrance Fee in INR'] <= max_budget]

    if max_duration is not None:
        data = data[data['time needed to visit in hrs'] <= max_duration]

    # Best rating first; the review count breaks ties between equal ratings
    ranked = data.sort_values(
        by=['Google review rating', 'Number of google review in lakhs'],
        ascending=False,
    )

    return ranked.head(top_n)

# Try the recommender with sample preferences: parks in the Northern zone,
# entrance fee up to 50 INR, visit time up to 2 hours
example_recommendations = recommend_places(
    data_cleaned,
    preferred_type="Park", preferred_zone="Northern",
    max_budget=50, max_duration=2.0,
)

example_recommendations


Unnamed: 0,Zone,State,City,Name,Type,Establishment Year,time needed to visit in hrs,Google review rating,Entrance Fee in INR,Airport with 50km Radius,Weekly Off,Significance,DSLR Allowed,Number of google review in lakhs,Best Time to visit
9,northern,delhi,delhi,sunder nursery,park,1600,2.0,4.6,0,yes,,botanical,yes,0.16,afternoon
11,northern,delhi,delhi,lodhi garden,park,1500,1.0,4.5,0,yes,,botanical,yes,0.48,all
3,northern,delhi,delhi,waste to wonder park,theme park,2019,2.0,4.1,50,yes,monday,environmental,yes,0.27,evening
10,northern,delhi,delhi,garden of five senses,park,2003,2.0,4.1,35,yes,,botanical,yes,0.23,morning
