In [80]:
#Importing the necessary libraries
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [81]:
# Step 1: Load the JSON data
file_path = '/content/information.json'  # Replace with your actual file path

with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# The top-level 'information' object maps each city name to its description.
city_data = data.get('information', {})
if not city_data:
    # Fail here with a clear message instead of carrying an empty frame into
    # the later TF-IDF fitting step.
    raise ValueError(
        f"No city descriptions found under the 'information' key in {file_path}"
    )

city_names = list(city_data.keys())
descriptions = list(city_data.values())

# Build a DataFrame with one row per city and the columns 'city' and 'description'
df = pd.DataFrame({'city': city_names, 'description': descriptions})



In [82]:
# Preview the first five rows of the city/description table
df.head(n=5)

Unnamed: 0,city,description
0,Daska,"Daska, is a city in the Punjab province of Pak..."
1,Kuala Lumpur,"Kuala Lumpur, officially the Federal Territory..."
2,Guarulhos,Guarulhos is a Brazilian municipality. It is t...
3,Sydney,Sydney is the capital city of the state of New...
4,Ho Chi Minh City,"Ho Chi Minh City, commonly referred to by its ..."


In [83]:
# Look at five randomly chosen rows (no seed is set, so the rows differ between runs)
df.sample(n=5)

Unnamed: 0,city,description
160,Berlin,Berlin is the capital and largest city of Germ...
234,Criciúma,Criciúma is a city in the Brazilian state of S...
68,Formosa,"Taiwan, officially the Republic of China (ROC)..."
496,Vasai-Virar,Vasai-Virar is an agglomeration of four previo...
497,Malaga,"Málaga is a municipality of Spain, capital of ..."


In [84]:
# Show the column dtypes, non-null counts and memory usage of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   city         704 non-null    object
 1   description  704 non-null    object
dtypes: object(2)
memory usage: 11.1+ KB


In [85]:
# Count the rows whose city name already appeared in an earlier row
df.duplicated(subset='city').sum()

0

In [86]:

# Apply basic preprocessing steps, which include lowercasing and removal of non-alphanumeric characters.
# Lowercasing is applied to the English stop words and to every word except nouns, so that city names keep their capitalisation.
# To identify the nouns, we apply part-of-speech (POS) tagging using the NLTK library.

In [87]:
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords

In [88]:
# Fetch the NLTK resources this notebook relies on: tokenizer models, the POS
# tagger and the stop-word list.
# Newer NLTK releases look up the tokenizer and tagger under the 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' ids, so both the legacy and the new ids are
# requested. An id the installed version does not know is reported in the
# download log and skipped (nltk.download returns False rather than raising).
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [89]:
# NLTK's English stop-word list, held as a set for fast membership tests
stop_words = {*stopwords.words('english')}
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [90]:
#Created a function to apply lowercasing to the text except Nouns to avoid lowercasing of city names
def preprocess_text(text, stop_words):
    """Lowercase every token of ``text`` except nouns that are not stop words.

    The text is split with ``nltk.word_tokenize`` and tagged with ``pos_tag``.
    A token keeps its original casing only when its POS tag starts with 'N'
    and its lowercase form is not in ``stop_words``; every other token is
    lowercased. The resulting tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text to normalise.
    stop_words : collection of str
        Lowercase stop words; membership is tested with ``in``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    def _normalise(word, pos):
        # The stop-word check takes precedence over the noun check.
        keep_case = pos.startswith('N') and word.lower() not in stop_words
        return word if keep_case else word.lower()

    tagged = pos_tag(nltk.word_tokenize(text))
    return ' '.join(_normalise(word, pos) for word, pos in tagged)

In [91]:
# Add a 'description_cleaned' column holding the preprocessed form of every description
df['description_cleaned'] = df['description'].apply(preprocess_text, args=(stop_words,))

In [92]:
# Show each city next to its cleaned description
df.loc[:, ['city', 'description_cleaned']]

Unnamed: 0,city,description_cleaned
0,Daska,"Daska , is a city in the Punjab province of Pa..."
1,Kuala Lumpur,"Kuala Lumpur , officially the Federal Territor..."
2,Guarulhos,Guarulhos is a brazilian municipality . it is ...
3,Sydney,Sydney is the capital city of the state of New...
4,Ho Chi Minh City,"Ho Chi Minh City , commonly referred to by its..."
...,...,...
699,Sucre,"Sucre is the de jure capital city of Bolivia ,..."
700,Gorbea,Gorbea or Gorbeia is a mountain and massif in ...
701,Krakow,"Kraków , also seen spelled Cracow or absent po..."
702,Khammam,Khammam is the city in Khammam district of the...


In [93]:
# Initialise the TF-IDF vectorizer with scikit-learn's built-in English stop-word list.
# lowercase=True is the default and is spelled out on purpose.
# NOTE(review): the vectorizer folds all text to lower case before tokenising, so the
# noun capitalisation kept by preprocess_text does not reach the vocabulary -
# confirm whether that is intended.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)

In [94]:
# Learn the vocabulary and IDF weights from the cleaned descriptions and
# build the document-term matrix in the same step
cleaned_descriptions = df['description_cleaned']
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_descriptions)

In [95]:
# Function to retrieve relevant cities based on user query
def retrieve_cities(query, df, tfidf_matrix, tfidf_vectorizer, top_n=2):
    """Return the cities whose descriptions are most similar to ``query``.

    The query is cleaned with ``preprocess_text`` (using the notebook-level
    ``stop_words``), transformed with the fitted vectorizer and compared with
    every row of ``tfidf_matrix`` by cosine similarity.

    Parameters
    ----------
    query : str
        Free-text user query.
    df : pandas.DataFrame
        Frame with 'city' and 'description' columns. Its rows are assumed to
        be in the same order as the rows of ``tfidf_matrix``.
    tfidf_matrix : array-like or sparse matrix
        Document-term matrix to compare the query against.
    tfidf_vectorizer : fitted vectorizer
        Object whose ``transform`` maps a list of texts into the same feature
        space as ``tfidf_matrix``.
    top_n : int, default 2
        Maximum number of cities to return.

    Returns
    -------
    list of tuple
        ``(city, description)`` pairs ordered from most to least similar.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be a non-negative integer")

    query_cleaned = preprocess_text(query, stop_words)  # Preprocess query
    query_vector = tfidf_vectorizer.transform([query_cleaned])  # Query as TF-IDF vector
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Positions of the top_n highest similarity scores, best first
    related_city_indices = cosine_similarities.argsort()[::-1][:top_n]

    # argsort yields row positions, so look the rows up positionally; a label
    # lookup would only be correct while df has a default 0..n-1 index.
    top_rows = df.iloc[related_city_indices]
    return list(zip(top_rows['city'], top_rows['description']))

In [96]:
# Example: a query that mentions two cities by name
query = "I want to visit Kuala Lumpur and Sydney next year."
relevant_cities = retrieve_cities(
    query=query,
    df=df,
    tfidf_matrix=tfidf_matrix,
    tfidf_vectorizer=tfidf_vectorizer,
)
print("Relevant cities based on query:", relevant_cities)

Relevant cities based on query: [('Kuala Lumpur', "Kuala Lumpur, officially the Federal Territory of Kuala Lumpur and colloquially referred to as KL, is a federal territory and the capital city of Malaysia. It is the largest city in the country, covering an area of 243\xa0km2 (94\xa0sq\xa0mi) with a census population of 2,163,000 as of 2022. Greater Kuala Lumpur, also known as the Klang Valley, is an urban agglomeration of 8.622 million people as of 2023. It is among the fastest growing metropolitan regions in Southeast Asia, both in population and economic development. Klang Valley is ASEAN's fifth largest economy after Singapore, Jakarta, Bangkok, and Manila."), ('Sydney', 'Sydney is the capital city of the state of New South Wales and the most populous city in Australia. Located on Australia\'s east coast, the metropolis surrounds Sydney Harbour and extends about 80 km from the Pacific Ocean in the east to the Blue Mountains in the west, and about 80 km from the Ku-ring-gai Chase Na

In [97]:
# Example: a query that describes a city instead of naming it
query = "Which city is called as the orange city of India?"
relevant_cities = retrieve_cities(
    query=query,
    df=df,
    tfidf_matrix=tfidf_matrix,
    tfidf_vectorizer=tfidf_vectorizer,
)
print("Relevant cities based on query:", relevant_cities)



Relevant cities based on query: [('Nagpur', 'Nagpur is the third-largest city of the Indian state of Maharashtra after Mumbai and Pune. It is called the heart of India because of its central geographical location. It is the largest and most populated city in central India. Also known as the "Orange City", Nagpur is the 13th largest city in India by population. According to an Oxford\'s Economics report, Nagpur is projected to be the fifth fastest growing city in the world from 2019 to 2035 with an average growth of 8.41%. It has been proposed as one of the Smart Cities in Maharashtra and is one of the top ten cities in India in Smart City Project execution.'), ('Bangalore', 'Bangalore, officially Bengaluru, is the capital and largest city of the southern Indian state of Karnataka. It has a population of more than 8 million and a metropolitan population of around 15 million, making it India\'s third most populous city and fourth most populous urban agglomeration. It is the most populous