# IMPORT LIBRARIES

In [396]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# READING DATA SCRAPED FROM WEBSITE

In [398]:
df = pd.read_csv("data.csv")

In [403]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   city    100 non-null    object
 1   url     100 non-null    object
 2   desc    100 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [404]:
df.isnull().sum()

city    0
url     0
desc    0
dtype: int64

In [405]:
df.duplicated().sum()

0

In [402]:
df['city'].nunique()

100

__IN OUR DATA THERE ARE 100 UNIQUE CITIES IN INDIA__

*In the future, if more cities are obtained, I will concatenate them to the same DataFrame and use the same approach for it.*

# TEXT PREPROCESSING

In [414]:
import string
from nltk.corpus import stopwords

In [416]:
# 1 . MAKING ALL TEXT IN DESCRIPTION LOWERCASE
df['desc'] = df['desc'].apply(str.lower)

# 2. REMOVING PUCUATION MARKS IF ANY
df['desc'] = df['desc'].apply(lambda x : "".join([i for i in x if i not in string.punctuation]))

# 3. REMOVING STOPWORDS
df['desc'] = df['desc'].apply(lambda x : " ".join([i for i in x.split() if i not in stopwords.words('english')]))

# 4. REMOVING ALL ALPHA-NUMERIC VALUES
df['desc'] = df['desc'].apply(lambda x : " ".join([i for i in x.split() if i.isalpha()]))

In [418]:
# PROCESSED DATAFRAME LOOKS LIKE
df.head(5)

Unnamed: 0,city,url,desc
0,Srinagar,https://www.holidify.com/images/bgImages/SRINA...,famously known heaven earth srinagar located u...
1,Leh,https://www.holidify.com/images/bgImages/LADAK...,ladakh union territory kashmir region india fo...
2,Gangtok,https://www.holidify.com/images/bgImages/GANGT...,incredibly alluring pleasantly boisterous wrea...
3,Andaman,https://www.holidify.com/images/bgImages/ANDAM...,replete turquoise blue water beaches bit histo...
4,Manali,https://www.holidify.com/images/bgImages/MANAL...,spectacular valleys breathtaking views snowcap...


# VECTORIZATION

In [423]:
from sklearn.feature_extraction.text import CountVectorizer

In [424]:
cv = CountVectorizer(max_features=1000)
vector = cv.fit_transform(df['desc']).toarray()

In [425]:
vector.shape

(100, 1000)

In [426]:
list(cv.get_feature_names_out())

['aap',
 'absolutely',
 'abu',
 'accessibility',
 'according',
 'acres',
 'across',
 'activities',
 'activity',
 'adjacent',
 'administered',
 'adorned',
 'adorns',
 'adventure',
 'aerial',
 'agartala',
 'ages',
 'agra',
 'ahmedabad',
 'ahmednagar',
 'airport',
 'ajanta',
 'ajmer',
 'alappuzha',
 'alibag',
 'alibaug',
 'alleppey',
 'allures',
 'alluring',
 'almora',
 'along',
 'alongside',
 'also',
 'although',
 'altitude',
 'always',
 'amalgamation',
 'amarnath',
 'ambarsar',
 'amidst',
 'amm',
 'among',
 'amongst',
 'amritsar',
 'amsterdam',
 'ancient',
 'andaman',
 'andhra',
 'anglong',
 'announced',
 'antiques',
 'apart',
 'apple',
 'arabian',
 'aravali',
 'aravalli',
 'architectural',
 'architecture',
 'area',
 'areas',
 'around',
 'array',
 'arrived',
 'art',
 'ashrams',
 'asia',
 'asiatic',
 'assam',
 'attained',
 'attraction',
 'attractions',
 'attracts',
 'auli',
 'aurangabad',
 'aurangzeb',
 'away',
 'ayurvedic',
 'baba',
 'back',
 'backdrop',
 'background',
 'backpackers',
 

__EACH CITY DESCRIPTION IS CONVERTED TO VECTOR WITH 1000 MOST FREQUENT FETURES__

# COSINE SIMILARITY 

In [427]:
from sklearn.metrics.pairwise import cosine_similarity

In [449]:
similarity = cosine_similarity(vector)
similarity.shape

(100, 100)

__IT GIVES COSINE SIMILARITY OF EACH CITY WITH EVERY OTHER CITY__

In [451]:
# COSINE SIMILARITY OS SRINAGAR WITH EVERY OTHER CITY
similarity[0]

array([1.        , 0.17521916, 0.        , 0.05634362, 0.06172134,
       0.08748178, 0.11461365, 0.1385685 , 0.        , 0.05730683,
       0.02608203, 0.        , 0.07715167, 0.03086067, 0.10744306,
       0.05372153, 0.02727724, 0.05939139, 0.1385685 , 0.02686077,
       0.03086067, 0.05292561, 0.        , 0.        , 0.03026138,
       0.08451543, 0.09449112, 0.12858612, 0.        , 0.05634362,
       0.        , 0.14847847, 0.03217447, 0.        , 0.0554274 ,
       0.        , 0.        , 0.02865341, 0.0277137 , 0.        ,
       0.13041013, 0.0831411 , 0.19094065, 0.05216405, 0.0554274 ,
       0.05939139, 0.        , 0.05832118, 0.08478078, 0.06299408,
       0.08451543, 0.02326211, 0.10910895, 0.09869275, 0.0554274 ,
       0.07319251, 0.05634362, 0.        , 0.        , 0.08908708,
       0.05939139, 0.04600437, 0.        , 0.05372153, 0.04941662,
       0.        , 0.        , 0.02865341, 0.10432811, 0.        ,
       0.0277137 , 0.05143445, 0.12104551, 0.05730683, 0.08451

# CREATING FUNCTION TO TAKE INPUT

In [452]:
def simillarCity(name):
    data = pd.DataFrame(columns=['city', 'url', 'desc'])                                         # CREATING EMPTY DATAFRAME TO STORE RECOMMENDATIONS
    cityIndex  = df[df['city'] == name].index[0]                                                 # FINDING INDEX OF SEARCED CITY IN DATAFRAME
    cosineSimillarity = list(enumerate(similarity[cityIndex]))                                   # FINDING COSINE SIMILARITY OF SEARCHED CITY WITH OTHER
    relatedCityDistance = sorted(cosineSimillarity, reverse=True, key= lambda x : x[1])[0:15]    # SORTING(DESCENDING) COSINE SIMILLARITY BASED 
    for city in relatedCityDistance:
        j = city[0]
        data = pd.concat((data, pd.DataFrame([originalData.iloc[j]])))                           # STORING TO DATAFRAME
    return(data)

In [453]:
simillarCity('Mahabaleshwar')

Unnamed: 0,city,url,desc
59,Mahabaleshwar,https://www.holidify.com/images/bgImages/MAHAB...,Mahabaleshwar is a hill station located in th...
13,Munnar,https://www.holidify.com/images/bgImages/MUNNA...,"Famous for the tea estates, greenery, winding..."
28,Lonavala,https://www.holidify.com/images/bgImages/LONAV...,Situated in the Sahyadri range of the Western...
21,Nainital,https://www.holidify.com/images/bgImages/NAINI...,Nainital is a charming hill station located a...
76,Khandala,https://www.holidify.com/images/bgImages/KHAND...,Khandala is a popular hill station in Maharas...
49,Alibaug,https://www.holidify.com/images/bgImages/ALIBA...,Alibaug (also spelled as Alibag) is a small c...
79,Matheran,https://www.holidify.com/images/bgImages/MATHE...,Nestled amidst the Sahyadri range on the West...
25,Mussoorie,https://www.holidify.com/images/bgImages/MUSSO...,Mussoorie is one of the most popular hill sta...
74,Dharamshala,https://www.holidify.com/images/bgImages/DHARA...,Dharamshala is a beautiful hill town in the D...
18,Rishikesh,https://www.holidify.com/images/bgImages/RISHI...,Located in the foothills of the Himalayas alo...


In [454]:
simillarCity('Haridwar')

Unnamed: 0,city,url,desc
44,Haridwar,https://www.holidify.com/images/bgImages/HARID...,Haridwar is one of the seven holiest cities i...
18,Rishikesh,https://www.holidify.com/images/bgImages/RISHI...,Located in the foothills of the Himalayas alo...
64,Mathura,https://www.holidify.com/images/bgImages/MATHU...,"One of Hinduism's seven sacred cities, Mathur..."
12,Varanasi,https://www.holidify.com/images/bgImages/VARAN...,"World's oldest living city, Varanasi - also k..."
38,Ujjain,https://www.holidify.com/images/bgImages/UJJAI...,"Ujjain, considered to be one of the holiest c..."
82,Bhubaneswar,https://www.holidify.com/images/bgImages/BHUBA...,"Bhubaneswar, the temple city of India, once k..."
48,Tirupati,https://www.holidify.com/images/bgImages/TIRUP...,Situated in the Chittoor district of Andhra P...
88,Puri,https://www.holidify.com/images/bgImages/PURI.jpg,Puri in Odisha is one of the four must-visit ...
87,Dehradun,https://www.holidify.com/images/bgImages/DEHRA...,Dehradun is the capital city and the largest ...
15,Amritsar,https://www.holidify.com/images/bgImages/AMRIT...,"Amritsar, colloquially known as Ambarsar, is ..."


In [455]:
simillarCity('Pune')

Unnamed: 0,city,url,desc
69,Pune,https://www.holidify.com/images/bgImages/PUNE.jpg,"Pune is a bustling metropolis of Maharashtra,..."
50,Ahmedabad,https://www.holidify.com/images/bgImages/AHMED...,"A rapidly growing metropolis, an industrial h..."
38,Ujjain,https://www.holidify.com/images/bgImages/UJJAI...,"Ujjain, considered to be one of the holiest c..."
12,Varanasi,https://www.holidify.com/images/bgImages/VARAN...,"World's oldest living city, Varanasi - also k..."
71,Ranthambore,https://www.holidify.com/images/bgImages/RANTH...,This is one of the best tiger reserves of the...
88,Puri,https://www.holidify.com/images/bgImages/PURI.jpg,Puri in Odisha is one of the four must-visit ...
51,Kanha,https://www.holidify.com/images/bgImages/KANHA...,Located in a central region of Madhya Pradesh...
70,Vrindavan,https://www.holidify.com/images/bgImages/VRIND...,One of the oldest cities on the banks of Yamu...
11,Jaipur,https://www.holidify.com/images/bgImages/JAIPU...,"Jaipur, the capital city of Rajasthan, stands..."
56,Madurai,https://www.holidify.com/images/bgImages/MADUR...,"Madurai, the cultural capital of Tamil Nadu, ..."


# STORING COSINE SIMILLARITY VALUES TO PICKLE FILE

In [457]:
import pickle

In [465]:
pickle.dump(similarity, open("cosineSimmilarity.pkl", "wb"))