In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Display configuration:
#   - floats are rendered with a thousands separator and four decimals
#   - wide DataFrames are printed on one line instead of being wrapped
pd.set_option('display.float_format', '{:,.4f}'.format)
pd.options.display.expand_frame_repr = False

# Location of the Excel workbook, relative to the notebook's working directory.
input_file_path = './mappinghotelsdataset.xlsx'

# Names of the sheets read from the workbook.
p1_sheet_name, p2_sheet_name, example_sheet_name = 'Partner1', 'Partner2', 'examples'



In [46]:
# Load the data: one DataFrame per sheet of the workbook.
# The context manager closes the workbook's file handle as soon as the three
# sheets have been parsed, instead of leaving it open for the kernel's lifetime.
# No `encoding` keyword is passed: it is not a documented option of
# ExcelFile.parse, so it is dropped here.
with pd.ExcelFile(input_file_path) as xls:
    p1_df = xls.parse(p1_sheet_name)
    p2_df = xls.parse(p2_sheet_name)
    match_example_df = xls.parse(example_sheet_name)

# Show a single row of the examples sheet as a quick sanity check.
print(match_example_df.iloc[20])


p1.key                               1A8E140E964BF7914329E25A9450E8CD
p1.hotel_name                                            Page 3 Lodge
p1.city_name                                                   Manali
p1.country_code                                                    IN
p1.hotel_address              Near Club House, Shnag Road, Old Manali
p1.star_rating                                                 1.0000
p1.postal_code                                                 175131
p2.key                               519AC2F6B5CB10AA6607E9E38672FB98
p2.hotel_name                                             Page3 Lodge
p2.city_name                                                   Manāli
p2.country_code                                                    IN
p2.hotel_address    Shenag Road, 100 Mt ahead of Old Manali, Himac...
p2.star_rating                                                 0.0000
p2.postal_code                                                 175131
Name: 20, dtype: obj

In [36]:
def colNameListByDType(df, numericCols=True):
    """Return the names of the numeric (or non-numeric) columns of a DataFrame.

    Args:
        df (pandas.DataFrame): frame whose columns are inspected.
        numericCols (bool): True selects the columns with a numeric dtype,
            False selects every other column. Any truthy/falsy value works.

    Returns:
        list: the matching column labels, in the frame's column order.
    """
    from pandas.api.types import is_numeric_dtype

    want_numeric = bool(numericCols)
    # Classify from df.dtypes (one dtype per column) rather than df[col]:
    # it avoids building a Series per column and still yields one entry per
    # column when two columns share the same label.
    return [col for col, dtype in df.dtypes.items()
            if is_numeric_dtype(dtype) == want_numeric]

In [51]:
#### DATA ANALYSIS ####
# For every loaded frame, print its shape, dtype/non-null report, missing-value
# counts, summary statistics, and the most frequent values of the remaining
# non-numeric columns.

input_list = [('p1_df', p1_df), ('p2_df', p2_df), ('match_example_df', match_example_df)]
# Columns skipped in the value-frequency listing below
# (presumably because names, addresses and keys are close to unique per row).
ignore_col_set = {'p1.hotel_address', 'p2.hotel_address',
                  'p1.hotel_name', 'p2.hotel_name',
                  'p1.key', 'p2.key'}
for (df_name, df) in input_list:
    print("\n\n%s analysis:" % df_name)
    print("\nData shape : " + str(df.shape))
    # Checking cols data type and existence of missing values.
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called on its own line; interpolating it into a string printed "None".
    print("\nData info: ")
    df.info()
    print("\nMissing values: \n%s" % df.isnull().sum())
    # Basic statistics:
    print("\nSummary of numeric features: \n%s" % df.describe(include=[np.number]))
    print("\nSummary of non-numeric features: \n%s" % df.describe(include=['O']))

    # Filter with a comprehension so the columns are reported in the frame's
    # own order (a set difference would yield an arbitrary order).
    non_numeric_cols = [name for name in colNameListByDType(df, numericCols=False)
                        if name not in ignore_col_set]
    for col in non_numeric_cols:
        print("\nTop unique value (normed) count of column : %s\n%s" %
              (col, df[col].value_counts(normalize=True)[:5]))



p1_df analysis:

Data shape : (10000, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
p1.key              10000 non-null object
p1.hotel_name       10000 non-null object
p1.city_name        10000 non-null object
p1.country_code     9995 non-null object
p1.hotel_address    9999 non-null object
p1.star_rating      10000 non-null float64
p1.postal_code      8250 non-null object
dtypes: float64(1), object(6)
memory usage: 547.0+ KB

Data info: 
None

DMissing values: 
p1.key                 0
p1.hotel_name          0
p1.city_name           0
p1.country_code        5
p1.hotel_address       1
p1.star_rating         0
p1.postal_code      1750
dtype: int64

Summary of numeric features: 
       p1.star_rating
count     10,000.0000
mean           2.8527
std            1.2348
min            0.0000
25%            2.0000
50%            3.0000
75%            4.0000
max            5.0000

Summary of non-numeric features: 
               

In [57]:
# Handle missing values:
# every missing cell, in every column, is overwritten with the '_missing_'
# sentinel. The fill is done in place, so the frames referenced by input_list
# are themselves modified. The per-column null counts are printed afterwards.
for df_name, df in input_list:
    df.fillna(value='_missing_', inplace=True)
    print(df.isna().sum())

p1.key              0
p1.hotel_name       0
p1.city_name        0
p1.country_code     0
p1.hotel_address    0
p1.star_rating      0
p1.postal_code      0
dtype: int64
p2.key              0
p2.hotel_name       0
p2.city_name        0
p2.country_code     0
p2.hotel_address    0
p2.star_rating      0
p2.postal_code      0
dtype: int64
p1.key              0
p1.hotel_name       0
p1.city_name        0
p1.country_code     0
p1.hotel_address    0
p1.star_rating      0
p1.postal_code      0
p2.key              0
p2.hotel_name       0
p2.city_name        0
p2.country_code     0
p2.hotel_address    0
p2.star_rating      0
p2.postal_code      0
dtype: int64


In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF representation of the Partner1 hotel names.
vectorizer = TfidfVectorizer(min_df=1, analyzer='word')
# The vectorizer lowercases each document with .lower(), so a cell that is not
# a str fails with "AttributeError: 'int' object has no attribute 'lower'"
# (seen when this was tried on the address column). Coercing to str keeps
# string cells unchanged and makes any non-text cell safe to vectorize.
# NOTE(review): presumably the failing address cells were purely numeric and
# were loaded from Excel as int - confirm against the data.
p1_hotel_names = p1_df['p1.hotel_name'].astype(str)
tf_idf_matrix = vectorizer.fit_transform(p1_hotel_names)
# Non-zero TF-IDF weights of the first hotel name, followed by the name itself.
print(tf_idf_matrix[0])
print(p1_df.loc[0, 'p1.hotel_name'])

        

  (0, 2671)	0.710075609052
  (0, 3361)	0.684583218959
  (0, 3855)	0.164737505594
Elite Grande Hotel
{'elite': 2671, 'grande': 3361, 'hotel': 3855, 'quality': 7118, 'inn': 4067, 'west': 9504, 'chester': 1808, 'map5': 5446, 'village': 9301, 'resort': 7349, 'hampton': 3532, 'suites': 8479, 'san': 7690, 'jose': 4342, 'favehotel': 2887, 'daeng': 2208, 'tompo': 8903, 'art': 669, 'cottage': 2110, 'leisure': 5005, 'vacations': 9194, 'brook': 1419, 'stone': 8412, 'coorg': 2075, 'malmaison': 5388, 'reading': 7265, 'chengdu': 1790, 'rongcheng': 7501, 'times': 8856, 'apartment': 572, 'chunxi': 1881, 'branch': 1368, 'comfort': 2025, 'days': 2295, 'guangzhou': 3423, 'luochongwei': 5246, 'fu': 3065, 'li': 5048, 'ban': 925, 'dao': 2261, 'matsumoto': 5542, 'tourist': 8936, '1999': 35, 'bed': 1076, 'and': 475, 'breakfast': 1382, 'grand': 3359, 'luley': 5236, 'shwe': 8122, 'pyi': 7084, 'the': 8783, 'lotus': 5201, 'terraces': 8752, 'hoang': 3759, 'minh': 5774, 'chau': 1774, 'la': 4833, 'flora': 2969, 'pav