In [270]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Display settings: floats are rendered with thousands separators and four
# decimals, and wide frames are printed without being wrapped across lines.
pd.set_option('display.float_format', '{:,.4f}'.format)
pd.options.display.expand_frame_repr = False

# Input workbook and the names of the sheets read from it.
input_file_path = './mappinghotelsdataset.xlsx'
p1_sheet_name, p2_sheet_name, example_sheet_name = 'Partner1', 'Partner2', 'examples'



In [296]:
# # # # # # HELPER FUNCTIONS # # # # # # 

def colNameListByDType(df, numericCols=True):
    # # # # # # # # # #
    # Lists the columns of a dataframe whose dtype is numeric (or, alternatively, not numeric)
    # Args:
    #       df - (pandas dataframe)
    #       numericCols - (bool), True - return the numeric columns, False - return the other columns
    # Return:
    #       (list of strings), the matching column names, in dataframe column order
    # # # # # # # # # #
    from pandas.api.types import is_numeric_dtype

    # Collapse the flag to a plain bool once so it can be compared per column.
    want_numeric = bool(numericCols)
    return [name for name in df.columns if is_numeric_dtype(df[name]) == want_numeric]

def convertToAscii(df):
    # # # # # # # # # #
    # Transliterate the known text columns of a dataframe (city name, hotel
    # address and hotel name of either partner) to ASCII.
    # Note: the dataframe is modified in place; listed columns it lacks are skipped.
    # Args:
    #       df - (pandas dataframe)
    # Return:
    #       df - (pandas dataframe), the same object after conversion
    # # # # # # # # # #
    import unidecode #conda install -c anaconda unidecode

    text_cols = {'p1.city_name', 'p1.hotel_address', 'p1.hotel_name',
                 'p2.city_name', 'p2.hotel_address', 'p2.hotel_name'}

    def to_ascii(value):
        # Only str values are transliterated; anything else passes through unchanged.
        return unidecode.unidecode(value) if isinstance(value, str) else value

    for name in text_cols.intersection(df.columns):
        df.loc[:, name] = df.loc[:, name].apply(to_ascii)
    return df

def minMaxScaling(df, colsToScale):
    # # # # # # # # # #
    # Rescale the given columns of a dataset with sklearn's MinMaxScaler
    # (default settings, i.e. each column is mapped to the [0,1] range)
    # Note: the values are written back into the df that was passed in, no copy is made!
    # Args:
    #       df - (pandas dataframe), the dataset
    #       colsToScale - (list of strings), names of the columns to rescale.
    # Return:
    #       df - (pandas dataframe), the same dataset with the columns rescaled
    # # # # # # # # # #
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler()
    # Fit and transform in one pass, then overwrite the source columns.
    scaled_values = scaler.fit_transform(df.loc[:, colsToScale])
    df.loc[:, colsToScale] = scaled_values
    return df

def getDfSliceRowIdx(df, colName, val):
    # # # # # # # # # #
    # Returns the index labels of the rows whose value in colName equals val
    # Args:
    #       df - (pandas dataframe)
    #       colName - (string)
    #       val - (string / int)
    # Return:
    #       (pandas index), labels of the matching rows, in dataframe order
    # # # # # # # # # #
    # Boolean mask (one entry per row) used to pick the matching index labels.
    is_match = (df[colName] == val).values
    return df.index[is_match]

def removePattern(sr, pattern="-"):
    # # # # # # # # # #
    # Delete every occurrence of a substring from the string entries of a series
    # (entries that are not strings are returned unchanged)
    # Args:
    #       sr - (pandas series)
    #       pattern - (string), literal text to delete
    # Return:
    #       (pandas series), a new series without the pattern
    # # # # # # # # # #
    return sr.apply(lambda item: item.replace(pattern, "") if isinstance(item, str) else item)
    

In [297]:
# Load the data: one dataframe per sheet (Partner1, Partner2, examples).
# The workbook is opened in a `with` block so its file handle is released as
# soon as the three sheets are parsed. The former `encoding='utf-8'` argument
# was dropped because it is not a documented ExcelFile.parse parameter.
with pd.ExcelFile(input_file_path) as xls:
    p1_df = xls.parse(p1_sheet_name)
    p2_df = xls.parse(p2_sheet_name)
    match_example_df = xls.parse(example_sheet_name)

In [298]:
# # # # # # DATA ANALYSIS # # # # # #

# (label, dataframe) pairs; the label is used for the printed headings.
input_list = [('p1_df', p1_df), ('p2_df', p2_df), ('match_example_df', match_example_df)]
# Columns excluded from the per-column value-count listing at the end of the loop.
ignore_col_set = set(['p1.hotel_address', 'p2.hotel_address',
                   'p1.hotel_name', 'p2.hotel_name',
                   'p1.key', 'p2.key'])
for (df_name, df) in input_list:
    print(("\n\n%s analysis:") % df_name)
    print("\nData shape : " + str(df.shape))
    # Column dtypes and non-null counts. df.info() prints its report directly and
    # returns None, so it is called as a statement; formatting its return value
    # into a string only ever printed "None".
    print("\nData info:")
    df.info()
    print(("\nMissing values: \n%s") % df.isnull().sum())
    # Basic statistics:
    print(("\nSummary of numeric features: \n%s") % df.describe(include=[np.number]))
    print(("\nSummary of non-numeric features: \n%s") % df.describe(include=['O']))

    non_numeric_cols = list(set(colNameListByDType(df, numericCols=False)) - ignore_col_set)
    for col in non_numeric_cols:
        print(("\nTop unique value (normed) count of column : %s\n%s") %
              (col, df[col].value_counts(normalize=True)[:5]))

p1_df[0:4]




p1_df analysis:

Data shape : (10000, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
p1.key              10000 non-null object
p1.hotel_name       10000 non-null object
p1.city_name        10000 non-null object
p1.country_code     9995 non-null object
p1.hotel_address    9999 non-null object
p1.star_rating      10000 non-null float64
p1.postal_code      8250 non-null object
dtypes: float64(1), object(6)
memory usage: 547.0+ KB

Data info: 
None

DMissing values: 
p1.key                 0
p1.hotel_name          0
p1.city_name           0
p1.country_code        5
p1.hotel_address       1
p1.star_rating         0
p1.postal_code      1750
dtype: int64

Summary of numeric features: 
       p1.star_rating
count     10,000.0000
mean           2.8527
std            1.2348
min            0.0000
25%            2.0000
50%            3.0000
75%            4.0000
max            5.0000

Summary of non-numeric features: 
               

Unnamed: 0,p1.key,p1.hotel_name,p1.city_name,p1.country_code,p1.hotel_address,p1.star_rating,p1.postal_code
0,5E876BFEA81A39E42E3019FE17303D52,Elite Grande Hotel,Manama,BH,"Bldg 3378, Road 2845, Area 428",4.0,5458
1,4F315989358CC0F3F7869F569887743D,Quality Inn West Chester,West Chester (OH),US,8567 Cincinnati Dayton Road,3.0,45069
2,A4EEBCBB9932DADE591248DFFFBDC068,MAP5 Village Resort,Goa,IN,Vithaldas Wadoo,3.0,403512
3,2833BE9FD49A063A36D3DE1E5E28ABC4,Hampton Inn & Suites San Jose Hotel,San Jose (CA),US,55 Old Tully Road,3.0,95111


In [299]:
# # # # # # DATA PREPROCESSING # # # # # #

# Transliterate the text columns of every frame to ASCII
# (convertToAscii edits the frame it receives in place):
for frame in (p1_df, p2_df, match_example_df):
    convertToAscii(frame)

print(match_example_df.iloc[20])

# Replace missing values with '' (in place) and print the null counts left per column:
for (frame_name, frame) in input_list:
    frame.fillna('', inplace=True)
    print(frame.isnull().sum())

# Per frame: the star-rating columns to rescale and the postal-code columns to clean.
frame_cols = [
    (p1_df, ['p1.star_rating'], ['p1.postal_code']),
    (p2_df, ['p2.star_rating'], ['p2.postal_code']),
    (match_example_df, ['p1.star_rating', 'p2.star_rating'], ['p1.postal_code', 'p2.postal_code']),
]

# Scale star_rating to the [0,1] range (minMaxScaling writes into the frame it receives):
for frame, rating_cols, _postal_cols in frame_cols:
    minMaxScaling(frame, rating_cols)

# Remove '-' occurrences in postal_code:
for frame, _rating_cols, postal_cols in frame_cols:
    for col in postal_cols:
        frame.loc[:, col] = removePattern(frame.loc[:, col], pattern="-")

match_example_df[0:4]


p1.key                               1A8E140E964BF7914329E25A9450E8CD
p1.hotel_name                                            Page 3 Lodge
p1.city_name                                                   Manali
p1.country_code                                                    IN
p1.hotel_address              Near Club House, Shnag Road, Old Manali
p1.star_rating                                                 1.0000
p1.postal_code                                                 175131
p2.key                               519AC2F6B5CB10AA6607E9E38672FB98
p2.hotel_name                                             Page3 Lodge
p2.city_name                                                   Manali
p2.country_code                                                    IN
p2.hotel_address    Shenag Road, 100 Mt ahead of Old Manali, Himac...
p2.star_rating                                                 0.0000
p2.postal_code                                                 175131
Name: 20, dtype: obj

Unnamed: 0,p1.key,p1.hotel_name,p1.city_name,p1.country_code,p1.hotel_address,p1.star_rating,p1.postal_code,p2.key,p2.hotel_name,p2.city_name,p2.country_code,p2.hotel_address,p2.star_rating,p2.postal_code
0,074BF1CC1F1C150E080EBB9855D23EAC,Grand Malioboro Hotel,Jambi,ID,Jl. Iskandar Muda no. 168 Jambi,0.6,,CBEF956F35D16548C939056575C7E0C7,Grand Malioboro Hotel,Jambi,ID,"Jalan Iskandar Muda No. 168, Sei Asam, Pasar J...",0.6,36113
1,103756D573E5A0C80ED374C9637DB142,Full House Resort,Mae Suai / Wiang Pa Pao (Chiang Rai),TH,171 Moo.1 Tambol Pa Ngiu Ampor Wiangpapao Chia...,0.6,57170.0,603122E5379E7354B532B876BB149C8D,Fullhouse Resort,Ban Pa Ngiu,TH,171 Moo.1 Tambol Pangiu Chiangrai,0.6,57170
2,341522B8405C1A26EA3786E3F702AA0C,Ondas Do Mar Beach Resort Phase 1,Goa,IN,"Holiday St, Gaura Waddo",0.4,403516.0,DC03E5D79546157E53DB083C349BF03F,Ondas Do Mar Beach Resort Phase -1,Calangute,IN,"Holiday Street, Gaura Waddo, Calangute,Bardez,Goa",0.0,403516
3,85F1C44C25A14B5E918B6D2917FEF5E2,7 Days Inn Guangzhou Baiyun Yongtai Metro 2nd ...,Guangzhou,CN,"Building 3, No. 116 Tongtai Road",0.3,,739B4F2C65D4DBCE62C30CE8C18FCBFF,7Days Inn Guangzhou Baiyun Yongtai Subway Stat...,Guangzhou,CN,"No. 3 Building, No. 116 Tongtai Road, Baiyun D...",0.4,510420


In [300]:
from sklearn.feature_extraction.text import TfidfVectorizer

def getTfidfFeatures(p1_sr, p2_sr, tf_threshold):
    # # # # # # # # # #
    # Generate tfidf features (one vectorizer, fitted on both inputs together)
    # Entries that are not strings (e.g. an address that was read as an int) are
    # converted to text and missing values to '' before vectorizing; otherwise
    # the vectorizer fails with "'int' object has no attribute 'lower'".
    # Args:
    #       p1_sr - (pandas series / array-like), one document per entry
    #       p2_sr - (pandas series / array-like), one document per entry
    #       tf_threshold - (float [0,1]), passed to the vectorizer as max_df; represents the
    #                      threshold for defining the stop words
    # Return:
    #       (p1_tfidf_csr, p2_tfidf_csr) - (tuple of sparse matrices), the tf-idf matrices,
    #                                      one row per entry of p1_sr / p2_sr
    # # # # # # # # # #
    def as_text(values):
        # Coerce every entry to str so the vectorizer's lower-casing step cannot fail.
        return pd.Series(values).map(lambda v: "" if pd.isna(v) else str(v))

    p1_docs = as_text(p1_sr)
    p2_docs = as_text(p2_sr)

    vectorizer = TfidfVectorizer(max_df=tf_threshold, binary=True, analyzer='word', encoding='utf-8')

    # Fit on both inputs so the two matrices share one vocabulary:
    vectorizer = vectorizer.fit(pd.concat([p1_docs, p2_docs]))
    print(("\nVocabulary length: %d") % len(vectorizer.vocabulary_))
    print(("\nData-driven stop words - appeared in more than %d of the transactions:\n%s") %
          (tf_threshold * (len(p1_docs)+len(p2_docs)), vectorizer.stop_words_))

    p1_tfidf_csr = vectorizer.transform(p1_docs)
    p2_tfidf_csr = vectorizer.transform(p2_docs)

    return (p1_tfidf_csr, p2_tfidf_csr)


In [301]:
# # # # # # HELPER FUNCTIONS - distance measurements # # # # # # 

def calcCosineSim(vec_csr, mat_csr):
    # # # # # # # # # #
    # Similarity of one (tfidf) vector against every row of a (tfidf) matrix.
    # Only the dot products are computed, so the values are cosine similarities
    # only when the vector and the matrix rows are already normalized to unit length.
    # Args:
    #       vec_csr - (sparse vector)
    #       mat_csr - (sparse matrix)
    # Return:
    #       (1-D numpy array of floats), the dot product of the vector with each matrix row
    # # # # # # # # # #
    from sklearn.metrics.pairwise import linear_kernel

    similarities = linear_kernel(vec_csr, mat_csr)
    return similarities.flatten()

def levenshteinDist(s1, s2):
    # # # # # # # # # #
    # Levenshtein (edit) distance: the minimum number of single-character
    # insertions, deletions and substitutions that turn s1 into s2
    # Adapted from (1st version): https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
    # Args:
    #       s1 - (string)
    #       s2 - (string)
    # Return:
    #       (int), the edit distance (0 for identical strings)
    # # # # # # # # # #
    # Make s1 the longer string. The distance is symmetric, so swapping is safe.
    # (This used to recurse through the undefined name `levenshtein`, which
    # raised NameError whenever s1 was shorter than s2.)
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    # Dynamic programming over rows: previous_row[j] is the distance between the
    # s1 prefix processed so far and the first j characters of s2.
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

def levenshteinSim(s1, s2_sr):
    # # # # # # # # # #
    # Compute the similarity based on the levenshtein distance between a string and a vector of strings
    # similarity = 1 - (levenshtein(s1,s2)/max(len(s1),len(s2)))
    # Two empty strings are treated as identical (similarity 1.0); the formula
    # above would otherwise divide by zero, e.g. when both values are ''.
    # Args:
    #       s1 - (string)
    #       s2_sr - (pandas series / iterable of strings)
    # Return:
    #       sim_list - (list of floats), one similarity per entry of s2_sr, in the same order
    # # # # # # # # # #
    sim_list = []
    for s2 in s2_sr:
        longest = max(len(s1), len(s2))
        if longest == 0:
            # Nothing to compare and nothing to normalise by: identical empty strings.
            sim_list.append(1.0)
        else:
            sim_list.append(1 - (levenshteinDist(s1, s2) / longest))
    return sim_list


In [111]:
# (scratch cell: an unfinished `def` statement was removed here because it is a syntax error and stops Restart & Run All)


0


In [304]:
# Greedy one-to-one matching of partner-1 hotels to partner-2 hotels within one country.
country = 'PL'
p1_country_idx = getDfSliceRowIdx(p1_df, colName='p1.country_code', val=country)
p2_country_idx = getDfSliceRowIdx(p2_df, colName='p2.country_code', val=country)

# hotel_name tfidf matrices:
print("\nApply tf-idf on hotel_name")
p1_hotel_tfidf_csr, p2_hotel_tfidf_csr = getTfidfFeatures(p1_df['p1.hotel_name'],
                                                          p2_df['p2.hotel_name'],
                                                          tf_threshold=0.012)

# The hotel_address experiment below is disabled: it is kept as an inert string literal.
"""
# hotel_address tfidf matrices:
print("\nApply tf-idf on hotel_address")
print(type(p1_df['p1.hotel_address'].values.astype('U')))
print(p1_df['p1.hotel_address'].values.astype('U'))

#p1_add_tfidf_csr, p2_add_tfidf_csr = getTfidfFeatures(p1_df['p1.hotel_address'].values.astype('U'), 
#                                                          p2_df['p2.hotel_address'].values.astype('U'), 
#                                                          tf_threshold=0.012)    

"""

# Index labels of the p2 rows (same country) that have not been paired yet.
p2_not_matched_idx_list = list(p2_country_idx)
matched_keys_list = []
# Minimum total similarity (sum of the four per-column scores below) to accept a match.
total_sim_threshold = 2.5


for i in p1_country_idx:
    # Stop once every p2 candidate is paired: selecting from an empty candidate
    # list and calling idxmax() on the result would raise ValueError.
    if not p2_not_matched_idx_list:
        break

    # p2_cols_sim_df is a dataframe that will contain the similarity scores of each column value of p1 row (index=i)
    # with all related p2 rows (same country):
    p2_cols_sim_df = pd.DataFrame()
    p2_cols_sim_df['p2_idx'] = p2_country_idx

    # hotel_name similarity
    # NOTE(review): the tf-idf rows are selected with dataframe index labels, which
    # assumes p1_df / p2_df still have their default 0..n-1 index - confirm.
    p2_cols_sim_df['hotel_name'] = calcCosineSim(p1_hotel_tfidf_csr[i,:],
                                                 p2_hotel_tfidf_csr[p2_country_idx,:])
    # city_name similarity:
    p2_cols_sim_df['city_name'] = levenshteinSim(p1_df.loc[i,'p1.city_name'],
                                                 p2_df.loc[p2_country_idx,'p2.city_name'])
    # postal_code similarity (values are compared as strings):
    p2_cols_sim_df['postal_code'] = levenshteinSim(str(p1_df.loc[i,'p1.postal_code']),
                                                   p2_df.loc[p2_country_idx,'p2.postal_code'].apply(lambda x: str(x)))
    # star_rating similarity: 1 minus the absolute difference of the two ratings
    p2_cols_sim_df['star_rating'] = list(p2_df.loc[p2_country_idx,'p2.star_rating'].apply(
                                    lambda x: (1 - abs(x - p1_df.loc[i,'p1.star_rating']))))

    print("\n\n")
    print(i)
    # Rename p2_cols_sim_df indices (original p2_df indices)
    p2_cols_sim_df = p2_cols_sim_df.set_index('p2_idx')

    print(p2_cols_sim_df)
    print("\n")

    # Sum all similarity scores - represents the total similarity of p1 row (index=i)
    # with all related p2 rows (same country):
    p2_total_sims = p2_cols_sim_df.sum(axis=1)

    # Restrict to the p2 rows that are still available (label-based selection).
    p2_available_sims = p2_total_sims.loc[p2_not_matched_idx_list]
    print(p2_available_sims)

    # Pick the most similar available p2 row
    # (available = not matched yet with any of the previous p1 rows):
    p2_max_sim_idx = p2_available_sims.idxmax()
    print(("\np2_max_sim_idx: %s") % str(p2_max_sim_idx))
    if p2_total_sims.loc[p2_max_sim_idx] >= total_sim_threshold:
        matched_keys_list.append((p1_df.loc[i,'p1.key'], p2_df.loc[p2_max_sim_idx, 'p2.key']))
        print(("\np2_not_matched_idx_list BEFORE remove: %s") % p2_not_matched_idx_list)
        p2_not_matched_idx_list.remove(p2_max_sim_idx)
        print(("\np2_not_matched_idx_list AFTER remove: %s") % p2_not_matched_idx_list)

print(("\nMatched keys: %s") % matched_keys_list)
    
    



Apply tf-idf on hotel_name

Vocabulary length: 10843

Data-driven stop words - appeared in more than 240 of the transactions:
{'branch', 'lodge', 'airport', 'western', 'hostel', 'apartment', 'inn', 'guest', 'quality', 'spa', 'boutique', 'comfort', 'grand', 'resort', 'guesthouse', 'and', 'suites', 'best', 'road', 'villa', 'city', 'motel', 'hotel', 'house', 'park', 'by', 'the', 'apartments', 'beach'}



1747
        hotel_name  city_name  postal_code  star_rating
p2_idx                                                 
889         0.0000     0.2857       0.2000       1.0000
893         0.0000     0.4286       0.2000       1.0000
1468        0.0000     0.1250       0.4000       0.8000
4000        1.0000     1.0000       1.0000       1.0000
4250        0.0000     1.0000       0.2000       1.0000
4447        0.0000     0.2857       0.2000       1.0000
6059        0.0000     0.2857       0.0000       1.0000
7281        0.0000     1.0000       0.2000       0.6000
8408        0.0000     0.2857

In [303]:
# Partner-1 rows whose country code is 'PL'.
p1_df[p1_df['p1.country_code'].eq('PL')]

Unnamed: 0,p1.key,p1.hotel_name,p1.city_name,p1.country_code,p1.hotel_address,p1.star_rating,p1.postal_code
1747,855F5C744A5242A90371911D97178D37,Hotel Europeum,Wroclaw,PL,ul. Kazimierza Wielkiego 27A,0.6,50077
3688,D16FF76BAB825C9A31365A1F40F27991,Sheraton Poznan Hotel,Poznan,PL,Bukowska 3/9,1.0,60809
3987,B3DA14EB52B0E3E3A3484E31232B0D81,Hampton by Hilton Krakow,Krakow,PL,ul Dabska 5,0.9,31572
4398,C631CE417807424F7A80F471608095A2,Hotel Ikar,Poznan,PL,Kosciuszki 118,0.6,61717
5360,6B7623E77E5082B180F0450294532A1C,MCC Mazurkas Conference Centre Hotel,Warsaw,PL,Poznanska 177,0.8,5850
5391,C8FCED7B50C647DA1232868D36E50DA2,Hotel Platinum Palace,Wroclaw,PL,ul. Powstancow Slaskich 204,1.0,53125
8875,C163B4D369F76B8BA16D653C6A18A252,Sound Garden Hotel,Warsaw,PL,Zwirki i Wigury 18,0.6,2092
8959,E91EABECD9B37D59C96A9BD3C5EBB0E4,Angelo By Vienna House Katowice,Katowice,PL,Sokolska 24,0.8,40881
9302,0522BCC6D1E694870CFB06786AA2102E,Campanile Hotel Wroclaw Centrum,Wroclaw,PL,ul. Slezna 26,0.4,53302
9618,A07FED54CB59D6AA5E71369AC198EBC9,Hotel Europejski,Krakow,PL,Lubicz Street 5,0.6,31034


In [305]:
# Partner-2 rows whose country code is 'PL'.
p2_df[p2_df['p2.country_code'].eq('PL')]

Unnamed: 0,p2.key,p2.hotel_name,p2.city_name,p2.country_code,p2.hotel_address,p2.star_rating,p2.postal_code
889,8C26208CF9BA6A6EEB781E0F3C3DDBAE,Hotel Europejski,Krakow,PL,ul. Lubicz 5,0.6,31034
893,9FED8EF2075A8980986C9AAF3CFDAE34,Sound Garden Hotel Airport,Warsaw,PL,Zwirki i Wigury 18,0.6,2092
1468,08A9906601C81AD41D2EFCAF48B70357,angelo by Vienna House Katowice,Katowice,PL,Sokolska 24,0.8,40086
4000,2B84ADB4DD0CEAA6F250417BFFEB45AF,Europeum Hotel,Wroclaw,PL,Kazimierza Wielkiego 27a,0.6,50077
4250,80FE7BE71BE9A5AEF8C5027056199E25,Campanile Wroclaw Centrum,Wroclaw,PL,Slezna 26,0.6,53302
4447,353A2707CB8D30E9D8CD226DE3F8B1B1,Hampton by Hilton Krakow,Krakow,PL,Dabska 5,0.6,31572
6059,F8E64EF83E6F6881CECA3A3922921DDD,Hotel Ikar,Poznan,PL,ul. Solna 18,0.6,61736
7281,7EA84D4A335D05F0A7E794DD42FCAF3F,Platinum Palace Boutique Hotel,Wroclaw,PL,Powstancow Slaskich 204,1.0,53140
8408,6DCE8475195975854FF7734027F90088,Sheraton Poznan Hotel,Poznan,PL,ul. Bukowska 3/9,1.0,60809
8941,0573589C2B18BAE3FA72E2082E7FB2AE,Hotel Mazurkas,Ozarow Mazowiecki,PL,Poznanska 177,0.8,5850


In [129]:
# Absolute row counts per country code, positions 50-59 of the descending ranking.
p1_df['p1.country_code'].value_counts().iloc[50:60]

CO    13
KE    12
FJ    12
TZ    12
IE    11
HR    10
RO    10
PF    10
BE    10
PL    10
Name: p1.country_code, dtype: int64

In [202]:

    

"""
for i in range(0,len(cosine_sim)):
    sim_items_idx = cosine_sim.argsort()
    print(sim_items_idx)
    """
"""
related_docs_indices = cosine_sim.argsort()[:-5:-1]
related_docs_indices
cosine_sim[related_docs_indices]"""



  (0, 3055)	1.0
  (1, 7646)	0.836813664214
  (1, 8819)	0.547487800215
  (2, 3896)	0.467019102547
  (2, 4104)	0.428468746181
  (2, 5234)	0.773503517382
  (3, 4410)	1.0
  (4, 1823)	0.368848687425
  (4, 2231)	0.412560485684
  (4, 6133)	0.576909276626
  (4, 6137)	0.600766325604
  (5, 7195)	0.518763190193
  (5, 7560)	0.854917979985
  (6, 3481)	0.519353972633
  (6, 9114)	0.854559214514
  (7, 540)	0.577531021824
  (7, 4961)	0.650407184163
  (7, 10183)	0.493384650774
  (8, 1691)	0.491645373126
  (8, 1828)	0.56121643435
  (8, 10539)	0.665823505818
  (9, 3054)	1.0



1747
        hotel_name
p2_idx            
889         0.0000
893         0.0000
1468        0.0000
4000        1.0000
4250        0.0000
4447        0.0000
6059        0.0000
7281        0.0000
8408        0.0000
8941        0.0000


p2_idx
889    0.0000
893    0.0000
1468   0.0000
4000   1.0000
4250   0.0000
4447   0.0000
6059   0.0000
7281   0.0000
8408   0.0000
8941   0.0000
dtype: float64

max_sim_val: 4000

p2_not_matched_idx_

'\nrelated_docs_indices = cosine_sim.argsort()[:-5:-1]\nrelated_docs_indices\ncosine_sim[related_docs_indices]'

In [163]:
#Int64Index([1747, 3688, 3987, 4398, 5360, 5391, 8875, 8959, 9302, 9618], dtype='int64')


Unnamed: 0,p1.key,p1.hotel_name,p1.city_name,p1.country_code,p1.hotel_address,p1.star_rating,p1.postal_code
1747,855F5C744A5242A90371911D97178D37,Hotel Europeum,Wroclaw,PL,ul. Kazimierza Wielkiego 27A,3.0,50-077
3688,D16FF76BAB825C9A31365A1F40F27991,Sheraton Poznan Hotel,Poznan,PL,Bukowska 3/9,5.0,60809
3987,B3DA14EB52B0E3E3A3484E31232B0D81,Hampton by Hilton Krakow,Krakow,PL,ul Dabska 5,4.5,31-572
4398,C631CE417807424F7A80F471608095A2,Hotel Ikar,Poznan,PL,Kosciuszki 118,3.0,61-717
5360,6B7623E77E5082B180F0450294532A1C,MCC Mazurkas Conference Centre Hotel,Warsaw,PL,Poznanska 177,4.0,05-850
5391,C8FCED7B50C647DA1232868D36E50DA2,Hotel Platinum Palace,Wroclaw,PL,ul. Powstancow Slaskich 204,5.0,53-125
8875,C163B4D369F76B8BA16D653C6A18A252,Sound Garden Hotel,Warsaw,PL,Zwirki i Wigury 18,3.0,02-092
8959,E91EABECD9B37D59C96A9BD3C5EBB0E4,Angelo By Vienna House Katowice,Katowice,PL,Sokolska 24,4.0,40-881
9302,0522BCC6D1E694870CFB06786AA2102E,Campanile Hotel Wroclaw Centrum,Wroclaw,PL,ul. Slezna 26,2.0,53-302
9618,A07FED54CB59D6AA5E71369AC198EBC9,Hotel Europejski,Krakow,PL,Lubicz Street 5,3.0,31-034


In [164]:
# Partner-2 rows whose country code is 'PL'.
p2_df[p2_df['p2.country_code'].eq('PL')]


Unnamed: 0,p2.key,p2.hotel_name,p2.city_name,p2.country_code,p2.hotel_address,p2.star_rating,p2.postal_code
889,8C26208CF9BA6A6EEB781E0F3C3DDBAE,Hotel Europejski,Kraków,PL,ul. Lubicz 5,3.0,31-034
893,9FED8EF2075A8980986C9AAF3CFDAE34,Sound Garden Hotel Airport,Warsaw,PL,Żwirki i Wigury 18,3.0,02-092
1468,08A9906601C81AD41D2EFCAF48B70357,angelo by Vienna House Katowice,Katowice,PL,Sokolska 24,4.0,40-086
4000,2B84ADB4DD0CEAA6F250417BFFEB45AF,Europeum Hotel,Wrocław,PL,Kazimierza Wielkiego 27a,3.0,50-077
4250,80FE7BE71BE9A5AEF8C5027056199E25,Campanile Wroclaw Centrum,Wrocław,PL,Ślężna 26,3.0,53-302
4447,353A2707CB8D30E9D8CD226DE3F8B1B1,Hampton by Hilton Krakow,Kraków,PL,Dąbska 5,3.0,31-572
6059,F8E64EF83E6F6881CECA3A3922921DDD,Hotel Ikar,Poznań,PL,ul. Solna 18,3.0,61-736
7281,7EA84D4A335D05F0A7E794DD42FCAF3F,Platinum Palace Boutique Hotel,Wrocław,PL,Powstańców Śląskich 204,5.0,53-140
8408,6DCE8475195975854FF7734027F90088,Sheraton Poznan Hotel,Poznań,PL,ul. Bukowska 3/9,5.0,60-809
8941,0573589C2B18BAE3FA72E2082E7FB2AE,Hotel Mazurkas,Ożarów Mazowiecki,PL,Poznańska 177,4.0,05-850


3
889                Krakow
893                Warsaw
1468             Katowice
4000              Wroclaw
4250              Wroclaw
4447               Krakow
6059               Poznan
7281              Wroclaw
8408               Poznan
8941    Ozarow Mazowiecki
Name: p2.city_name, dtype: object

lev(Wroclaw,Krakow): 5
max len: 7
normed lev: 0.714286
sim: 0.285714
sim_list: [0.2857142857142857]

lev(Wroclaw,Warsaw): 4
max len: 7
normed lev: 0.571429
sim: 0.428571
sim_list: [0.2857142857142857, 0.4285714285714286]

lev(Wroclaw,Katowice): 7
max len: 8
normed lev: 0.875000
sim: 0.125000
sim_list: [0.2857142857142857, 0.4285714285714286, 0.125]

lev(Wroclaw,Wroclaw): 0
max len: 7
normed lev: 0.000000
sim: 1.000000
sim_list: [0.2857142857142857, 0.4285714285714286, 0.125, 1.0]

lev(Wroclaw,Wroclaw): 0
max len: 7
normed lev: 0.000000
sim: 1.000000
sim_list: [0.2857142857142857, 0.4285714285714286, 0.125, 1.0, 1.0]

lev(Wroclaw,Krakow): 5
max len: 7
normed lev: 0.714286
sim: 0.285714
sim_list:

[0.2857142857142857,
 0.4285714285714286,
 0.125,
 1.0,
 1.0,
 0.2857142857142857,
 0.2857142857142857,
 1.0,
 0.2857142857142857,
 0.23529411764705888]