In [1]:
import pandas as pd 
from fuzzywuzzy import fuzz 

In [43]:
# Load the two source tables: the city/coordinate table and the reviews table.
locations = pd.read_csv('./ds/in.csv')
reviews = pd.read_csv('./ds/reviews.csv')

# Distinct place names as spelled in each table; these are the two vocabularies
# that get reconciled by fuzzy matching.
cities = {
    'locations': list(locations['city'].unique()),
    'reviews': list(reviews['location'].unique()),
}

# Replace city names with their closest fuzzy match from the other dataset
def pmatch(word, matchlist, threshold = 80):
    """Return the entry of ``matchlist`` most similar to ``word``, or None.

    Similarity is ``fuzz.ratio`` computed on lowercased, whitespace-stripped
    copies of both strings, so the comparison ignores case and surrounding
    blanks. The candidate is returned exactly as it appears in ``matchlist``.

    Args:
        word: String to look up. A non-string value (for example the NaN that
            pandas ``unique()`` keeps for missing cells) yields None instead
            of raising.
        matchlist: Iterable of candidate strings. Non-string entries are
            skipped.
        threshold: Score the best candidate must strictly exceed to be
            accepted.

    Returns:
        The highest-scoring candidate (the first one wins on ties) when its
        score is strictly greater than ``threshold``; otherwise None.
    """
    # Missing values cannot be normalised or scored; treat them as "no match".
    if not isinstance(word, str):
        return None

    # Normalise the query once rather than on every loop iteration.
    needle = word.lower().strip()

    best_match = None
    best_score = 0

    for match in matchlist:
        # Skip NaN/None candidates rather than crashing on `.lower()`.
        if not isinstance(match, str):
            continue

        score = fuzz.ratio(needle, match.lower().strip())
        # Strict comparison keeps the earliest candidate when scores tie.
        if score > best_score:
            best_score = score
            best_match = match

    return best_match if best_score > threshold else None

    
# Lookup table from each `locations` city name to its closest `reviews` name;
# the value is None when no candidate scores above the threshold of 73.
mapper = {
    city: pmatch(city, cities['reviews'], threshold = 73)
    for city in cities['locations']
}

mapper

{'Delhi': 'delhi',
 'Mumbai': 'mumbai',
 'Kolkāta': 'kolkata',
 'Bangalore': 'bangalore',
 'Chennai': 'chennai',
 'Hyderābād': 'hyderabad',
 'Pune': 'pune',
 'Ahmedabad': 'ahmedabad',
 'Sūrat': 'surat',
 'Lucknow': 'lucknow',
 'Jaipur': 'jaipur',
 'Cawnpore': None,
 'Mirzāpur': None,
 'Nāgpur': 'nagpur',
 'Ghāziābād': None,
 'Indore': 'indore',
 'Vadodara': 'vadodara',
 'Vishākhapatnam': 'visakhapatnam',
 'Bhopāl': 'bhopal',
 'Chinchvad': None,
 'Patna': 'patna',
 'Ludhiāna': 'ludhiana',
 'Āgra': 'agra',
 'Kalyān': None,
 'Madurai': 'madurai',
 'Jamshedpur': 'jamshedpur',
 'Nāsik': None,
 'Farīdābād': None,
 'Aurangābād': 'aurangabad',
 'Rājkot': 'rajkot',
 'Meerut': 'meerut',
 'Jabalpur': 'jabalpur',
 'Thāne': 'thane',
 'Dhanbād': 'dhanbad',
 'Allahābād': 'allahabad',
 'Vārānasi': 'varanasi',
 'Srīnagar': 'srinagar',
 'Amritsar': 'amritsar',
 'Alīgarh': 'aligarh',
 'Bhiwandi': None,
 'Gwalior': 'gwalior',
 'Bhilai': 'bhilai',
 'Hāora': None,
 'Rānchi': 'ranchi',
 'Bezwāda': None,
 'Ch

In [44]:
# Translate every city name to the spelling used in `reviews`; names without a
# mapped value become null.
# NOTE: this overwrites `city` in place, so re-running the cell without
# reloading `locations` would push the already-mapped names through the mapper
# a second time.
mapped_city = locations['city'].map(mapper)
locations['city'] = mapped_city

In [45]:
# Summarise the frame after mapping; the non-null count of `city` shows how
# many locations received a matched name.
locations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   city               124 non-null    object 
 1   lat                406 non-null    float64
 2   lng                406 non-null    float64
 3   country            406 non-null    object 
 4   iso2               406 non-null    object 
 5   admin_name         406 non-null    object 
 6   capital            38 non-null     object 
 7   population         404 non-null    float64
 8   population_proper  404 non-null    float64
dtypes: float64(4), object(5)
memory usage: 28.7+ KB


In [47]:
# Attach the coordinates/metadata of the matched city to every review.
# Rows of `locations` whose `city` is null (no fuzzy match) are removed first:
# pandas matches null join keys to each other, so they would otherwise pair
# with any review that has a missing `location`.
dataset = (
    pd.merge(
        left = reviews,
        right = locations.dropna(subset = ['city']),
        left_on = 'location',
        right_on = 'city',
        how = 'inner',
    )
    # `city` duplicates `location` after the join; 'Unnamed: 0' is dropped as
    # in the original pipeline.
    .drop(columns = ['Unnamed: 0', 'city'])
    .drop_duplicates()
)

# Written as a separate statement so `dataset` keeps the merged frame:
# `to_csv` returns None when it is given a path.
dataset.to_csv('./ds/PlaceRecommender.csv', index = False)