## Data Cleaning and Feature Engineering  
Author: Anne Chen  
2016

### Import Modules

In [1]:
import pandas as pd
import numpy as np
import json
import re
import nltk
from nltk import *
from textblob import TextBlob
from fancyimpute import KNN
# NOTE:
# if this is the first time a user is using nltk
# he/she must run the following code: nltk.download('all')

# command line for installing textblob 
# conda install -c https://conda.anaconda.org/sloria textblob

Using Theano backend.


### Functions
- Define Utility Functions

In [2]:
def read_json(filename):
    '''read json file'''
    return json.loads(open(filename).read()) 

def drop_first_column(us_museum, w_museum):
    '''drop the first column'''
    if us_museum.columns.values[0] != 'Address':
        us_museum = us_museum.drop(us_museum.columns[0], axis = 1)
        w_museum = w_museum.drop(w_museum.columns[0], axis = 1)
    return us_museum, w_museum

def merge_usa_world(us_museum, w_museum):    
    '''merge us and world dataframe'''
    merged_df = pd.merge(us_museum, w_museum, how='outer')
    merged_df = merged_df.drop_duplicates('MuseumName')
    return merged_df

def fix_fee_value(df):
    '''fix the fee value and replace yes/no with none in column "LengthOfVisit" '''    
    idx_no = np.where(df['LengthOfVisit'] == 'No ')[0].tolist()
    idx_yes = np.where(df['LengthOfVisit'] == 'Yes ')[0].tolist()
    df1 = df.copy(deep=True)
    for idx in idx_no:
        df1.loc[idx,'LengthOfVisit'] = None
        df1.loc[idx,'Fee'] = 'No '
    for idx in idx_yes:
        df1.loc[idx,'LengthOfVisit'] = None
        df1.loc[idx,'Fee'] = 'Yes '
    df = df1.copy(deep = True)
    return df

def string_to_num(df, column_name_lst):
    '''convert string in to number, e.g. 1,354 --> 1345'''
    for col_name in column_name_lst:
        df[col_name] = df[col_name].apply(lambda x: int(x.replace(',', '')))
    return df

def append_two_dict(dict1, dict2):
    '''append two dictionaries based on keys'''
    new_dict = dict1
    for key, val in dict2.items():
        if key not in dict1.keys():
            new_dict[key] = val
    return new_dict

def assign_0_or_1(df, target, dic):
    '''assign value as 1 if the values of the dictionary in the musuem match the target'''
    df = df.copy(deep=True)
    # initialize column with 0
    for sub_item in target:
        df[sub_item] = 0

    for museum_name, value in dic.items():
        # get index
        idx = df[df['MuseumName'] == museum_name].index.tolist()[0]

        for sub_item in target:
            if sub_item in value:
                df.loc[idx, sub_item] = 1  
    return df

def unicode_to_ascii(lst):
    '''convert unicode to ascii'''
    # avoid raising errors later on while writing data into csv files
    return [item.encode('ascii', 'ignore') for item in lst]

def encode_whole_dictionary(dic):
    '''convert whole dictionaty from unicode to ascii'''
    keys = dic.keys()
    values = dic.values()
    encode_key = unicode_to_ascii(keys)
    encode_val = [unicode_to_ascii(val) for val in values]
    
    # create new dictionary with encoded kay and values
    new_dic ={}
    for i in range(len(encode_key)):
        new_dic[encode_key[i]] = encode_val[i]

    return new_dic

def clean_dic(dic):
    '''clean the dictionary by removing the key-value pair with NaN'''
    return {k: dic[k] for k in dic if not isinstance(k, float)}

def dic_str_to_num(dic):
    '''convert list of strings in a dictionary into numbers'''
    new_dic = {}
    for key, val_lst in dic.items():
        new_dic[key] = [int(x.replace(',', '')) for x in val_lst]
    return new_dic

- Define Feature Engineering Related Functions

In [3]:
def add_country_state(df, state_name_lst):
    '''add new feature Country: USA or Other'''
    df['Country'] = 'Other'
    df['State'] = 'Not_in_USA'
    for idx, address in enumerate(df['Address']):
        try:
            state = address.split(',')[-1].split()[0]
            if state in state_name_lst:
                df.loc[idx,'Country'] = 'USA'
                df.loc[idx,'State'] = state
        except:
            pass

def add_category_feature(df, category):
    '''add several museum type as new features'''
    category_lst = reduce(lambda x,y: x + y, category.values(),[])
#     target_category = [i for i in set(category_lst) if 'Museum' in i \
#                        or 'Galleries' in i or 'Historic Sites' in i or 'Landmarks' in i]
#     # print target_category
#     df = assign_0_or_1(df, target_category, category)
    lst = list(set(category_lst))
    lst.sort()
    df = assign_0_or_1(df, lst, category)
    return df

def get_tag_cloud_lst(tag_cloud):
    '''get tag cloud list: [(tag name, its frequency)]'''
    tag_cloud_lst = reduce(lambda x,y: x + y, tag_cloud.values(),[])
    freq_dic = {}
    for i in tag_cloud_lst:
        if i not in freq_dic.keys(): 
            freq_dic[i] = 1
        else:
            freq_dic[i] +=1
    tag_freq_lst = freq_dic.items()
    tag_freq_lst.sort(key = lambda x: x[1], reverse=True)
    return tag_freq_lst

def add_tags_feature(df, tag_cloud, n):
    '''add top n tags as features'''
    tag_freq_lst = get_tag_cloud_lst(tag_cloud)
    target_tags = map(lambda x: x[0], tag_freq_lst[0:n])
    df = assign_0_or_1(df, target_tags, tag_cloud)    
    return df    

def add_traveler_type_count(df):
    '''add review count for each traveler type as new feature'''
    df = df.copy(deep=True)
    type_of_traveler = ['Families_Count', 'Couples_Count', 'Solo_Count', 'Business_Count', 'Friends_Count']
    for idx, item in enumerate(type_of_traveler):    
        for museum_name, value in traveler_type.items():
            # get index
            row_idx = df[df['MuseumName'] == museum_name].index.tolist()[0]
            # assign review count for this traveler type 
            df.loc[row_idx, item] = traveler_type[museum_name][idx]
    return df

def create_description_dic(df):
    '''create dictionary --> {museum name: museum description}'''
    description_dic = {}
    for i in df.index:
        description_dic[df['MuseumName'][i]] = df['Description'][i]
    return description_dic

def get_nested_sentiment(dic):
    '''get polarity and subjectivity score for each text in nested list'''
    polarity_dic = {}
    subjectivity_dic = {}
    for museum_name, lst in dic.items():
        polarity_lst = []
        subjectivity_lst = []
        for sentence in lst:
            blob = TextBlob(sentence)
            polarity_lst.append(blob.sentiment.polarity)
            subjectivity_lst.append(blob.sentiment.subjectivity)
        polarity_dic[museum_name] = polarity_lst
        subjectivity_dic[museum_name] = subjectivity_lst
    return polarity_dic, subjectivity_dic

def get_text_sentiment(dic):
    '''get polarity and subjectivity score for text passing in'''
    polarity_dic = {}
    subjectivity_dic = {}
    for museum_name, text in dic.items():
        try:
            blob = TextBlob(text)
            polarity_dic[museum_name] = blob.sentiment.polarity
            subjectivity_dic[museum_name] = blob.sentiment.subjectivity
        except:
            polarity_dic[museum_name] = float('NaN')
            subjectivity_dic[museum_name] = float('NaN')
    return polarity_dic, subjectivity_dic

def add_multiple_score_feature(df, header, dic):
    '''add max, min, mean, and var of sentiment scores as new features'''
    df = df.copy(deep=True)   
    colnames = [header+'var', header+'mean', header+'max', header+'min']
    for museum_name, score_lst in dic.items():
        # get index
        row_idx = df[df['MuseumName'] == museum_name].index.tolist()[0]
        # create new features for scores: max, min, mean, and var of score_lst
        df.loc[row_idx, colnames[0]] = np.var(score_lst)
        df.loc[row_idx, colnames[1]] = np.mean(score_lst)
        df.loc[row_idx, colnames[2]] = max(score_lst)
        df.loc[row_idx, colnames[3]] = min(score_lst)
    return df

def add_one_feature(df, colname, dic):
    '''add the value in dic as new feature to df with colname as new column name'''
    df = df.copy(deep=True) 
    for museum_name, value in dic.items():
        # get index
        row_idx = df[df['MuseumName'] == museum_name].index.tolist()[0]
        # assign score
        df.loc[row_idx, colname] = value
    return df

def get_precise_rating(df, traveler_rating):
    '''create a dictionary with precise rating score'''
    precise_rating = {}
    for museum_name, rating_lst in traveler_rating.items():
        # if the rating list is [13, 10, 3, 4, 2]
        # then the rating sum is 13*5 + 10*4 + 3*3 + 4*2 + 2*1
        rating_sum = reduce(lambda x,y: x+y , [float(val)*(5-idx) for idx, val in enumerate(rating_lst)])
        # #get review count of the museum
        # review_count = merged_df.loc[df[df['MuseumName'] == museum_name].index.tolist()[0], 'ReviewCount']
        # omg.. the ReviewCount is not the same as number of traveler_rating...
        review_count = reduce(lambda x,y: x+y , [float(val) for val in rating_lst])
        # calculate precise rating
        precise_rating[museum_name] = rating_sum/review_count
    return precise_rating

### Main Code

In [11]:
#################
### Read Data ###
#################

# read musuem data (.csv)
# usa
us_museum = pd.read_csv("./Data/tripadvisor_museum_USonly.csv")
us_category = read_json('./Data/museum_categories_USonly.json')
us_review = read_json('./Data/review_content_USonly.json')
us_quote = read_json('./Data/review_quote_USonly.json')
us_tag_cloud = read_json('./Data/tag_clouds_USonly.json')
us_traveler_type = read_json('./Data/traverler_type_USonly.json')
us_traveler_rating = read_json('./Data/traverler_rating_USonly.json')

# world
w_museum = pd.read_csv("./Data/tripadvisor_museum_world.csv")
w_category = read_json('./Data/museum_categories_world.json')
w_review = read_json('./Data/review_content_world.json')
w_quote = read_json('./Data/review_quote_world.json')
w_tag_cloud = read_json('./Data/tag_clouds_world.json')
w_traveler_type = read_json('./Data/traverler_type_world.json')
w_traveler_rating = read_json('./Data/traverler_rating_world.json')

# read USA state full/abbr name
state_name = pd.read_csv("./Data/states.csv")
state_name_lst = state_name['Abbreviation'].tolist()

print 'museum columns names:', us_museum.columns.values 
# an unknown first column is in the dataframe
# thus we drop the first column later on

# discover that....the value in lengthOfVisit has value should be in column 'Fee'....
print 'Unique lengthOfVisit:', set(us_museum['LengthOfVisit'])

######################################################
### Data Merging, Cleaning and Adding New Features ###
######################################################

# drop the first column
us_museum, w_museum = drop_first_column(us_museum, w_museum)

# merge usa and world museum
merged_df = merge_usa_world(us_museum, w_museum)
nrows = merged_df.shape[0]

# fix the lengthOfVsiit and fee value
# merged_df = fix_fee_value(merged_df)
# merged_df = merged_df.drop(merged_df.index[range(nrows, merged_df.shape[0])]) # drop exrta rows created
# not sure why the function "fix_fee_value" wont work 
idx_no = merged_df[merged_df['LengthOfVisit'] == 'No '].index.tolist()
idx_yes = merged_df[merged_df['LengthOfVisit'] == 'Yes '].index.tolist()
df = merged_df.copy(deep = True)
for idx in idx_no:
    df.loc[idx,'LengthOfVisit'] = None
for idx in idx_yes:
    df.loc[idx,'LengthOfVisit'] = None
merged_df = df.copy(deep = True)
print 'Unique LengthOfVisit:', set(merged_df['LengthOfVisit'])
print 'df shape:',merged_df.shape

# convert string to number
merged_df = string_to_num(merged_df, ['ReviewCount','TotalThingsToDo'])

# add new feature "Country": USA or Other & "State": state name or Not_in_USA
add_country_state(merged_df, state_name_lst)
merged_df = merged_df.drop(merged_df.index[range(nrows, merged_df.shape[0])]) # drop exrta rows created

# add feature 'RankPercentage'
merged_df['RankPercentage'] = merged_df['Rank']*100/merged_df['TotalThingsToDo'] 
print 'df shape:', merged_df.shape # check the shape

print 'handling nested lists...'
# combind USA and world dictionaries
category = append_two_dict(us_category, w_category)
review = append_two_dict(us_review, w_review)
quote = append_two_dict(us_quote, w_quote)
tag_cloud = append_two_dict(us_tag_cloud, w_tag_cloud)
traveler_type = append_two_dict(us_traveler_type, w_traveler_type)
traveler_rating = append_two_dict(us_traveler_rating, w_traveler_rating)

# convert all dictionaries from unicode to ascii
category = encode_whole_dictionary(category)
review = encode_whole_dictionary(review)
quote = encode_whole_dictionary(quote)
tag_cloud = encode_whole_dictionary(tag_cloud)
traveler_type = encode_whole_dictionary(traveler_type)
traveler_rating = encode_whole_dictionary(traveler_rating)

# convert strings in dictionary to number
traveler_type = dic_str_to_num(traveler_type)
traveler_rating = dic_str_to_num(traveler_rating)

# add text-based features by assigning 1 or 0 
# (if a museum matches or doesn't match the criteria)
print 'adding museum category features...'
merged_df = add_category_feature(merged_df, category)
print 'adding museum tag features...'
merged_df = add_tags_feature(merged_df, tag_cloud, 100)
print 'adding traveler type features...'
merged_df = add_traveler_type_count(merged_df)
# merged_df = string_to_num(merged_df, ['Families_Count','Couples_Count', 'Solo_Count', 'Business_Count', 'Friends_Count'])

# calculate precise rating and add it as new column
# yet to decide whether I should predict the displayed rating for precise rating...
print 'adding precise rating feature...'
precise_rating_dic = get_precise_rating(merged_df, traveler_rating)
merged_df = add_one_feature(merged_df, 'PreciseRating', precise_rating_dic)

#########################################################
### Add More New Features - Sentiment Analysis Scores ###
#########################################################
print 'getting sentiment analysis scores...'
# create description dictionary {museum name: description}
description_dic = create_description_dic(merged_df)

# get sentiment scores for quote/review/description
quote_polarity_dic, quote_subjectivity_dic = get_nested_sentiment(quote)
review_polarity_dic, review_subjectivity_dic = get_nested_sentiment(review)
des_polarity_dic, des_subjectivity_dic = get_text_sentiment(description_dic)

print 'adding sentiment analysis scores as new features...'
# add sentiment scores of quote/review/description as new features
merged_df = add_multiple_score_feature(merged_df, 'quote_pol_', quote_polarity_dic)
merged_df = add_multiple_score_feature(merged_df, 'quote_sub_', quote_subjectivity_dic)
merged_df = add_multiple_score_feature(merged_df, 'review_pol_', review_polarity_dic)
merged_df = add_multiple_score_feature(merged_df, 'review_sub_', review_subjectivity_dic)

# somehow the dictionary has a float('NaN') as key... remove the key from dictionary
des_polarity_dic = clean_dic(des_polarity_dic)
des_subjectivity_dic = clean_dic(des_subjectivity_dic)

# add sentiment score of museum description as new features
merged_df = add_one_feature(merged_df, 'descri_pol', des_polarity_dic)
merged_df = add_one_feature(merged_df, 'descri_sub', des_subjectivity_dic)
merged_df = merged_df.drop(merged_df.index[range(nrows, merged_df.shape[0])]) # drop exrta rows created
print 'df shape:', merged_df.shape
print 'Columns of merged features:', merged_df.columns.values

#########################
### write file to csv ###
#########################
print 'writing merged dataframe into csv...'
merged_df.to_csv('./app/data/tripadvisor_merged.csv')
print 'done! :D'

museum columns names: ['Unnamed: 0' 'Address' 'Description' 'FeatureCount' 'Fee' 'Langtitude'
 'Latitude' 'LengthOfVisit' 'MuseumName' 'PhoneNum' 'Rank' 'Rating'
 'ReviewCount' 'TotalThingsToDo']
Unique lengthOfVisit: set([nan, '2-3 hours ', 'No ', 'Yes ', 'More than 3 hours ', '<1 hour ', '1-2 hours '])
Unique LengthOfVisit: set([nan, '2-3 hours ', '<1 hour ', None, 'More than 3 hours ', '1-2 hours '])
df shape: (1603, 13)
df shape: (1603, 16)
handling nested lists...
adding museum category features...
adding museum tag features...
adding traveler type features...
adding precise rating feature...
getting sentiment analysis scores...
adding sentiment analysis scores as new features...
df shape: (1603, 223)
Columns of merged features: ['Address' 'Description' 'FeatureCount' 'Fee' 'Langtitude' 'Latitude'
 'LengthOfVisit' 'MuseumName' 'PhoneNum' 'Rank' 'Rating' 'ReviewCount'
 'TotalThingsToDo' 'Country' 'State' 'RankPercentage' 'Ancient Ruins'
 'Architectural Buildings' 'Arenas & Stadiums

### Dummify Categorical Variables

In [12]:
museum = pd.read_csv("./app/data/tripadvisor_merged.csv")
# Dummify categorical variables
length = pd.get_dummies(museum['LengthOfVisit'], prefix='LengthOfVisit_')
contry = pd.get_dummies(museum['Country'], prefix='Country_')
fee = pd.get_dummies(museum['Fee'], prefix='Fee_')
m_dum = pd.concat([museum,length,contry,fee],axis=1)
m_dum.to_csv("dummified_df.csv", index = False)
m_dum.head()

Unnamed: 0.1,Unnamed: 0,Address,Description,FeatureCount,Fee,Langtitude,Latitude,LengthOfVisit,MuseumName,PhoneNum,...,descri_pol,descri_sub,LengthOfVisit__1-2 hours,LengthOfVisit__2-3 hours,LengthOfVisit__<1 hour,LengthOfVisit__More than 3 hours,Country__Other,Country__USA,Fee__No,Fee__Yes
0,0,"555 Pennsylvania Ave NW, Washington DC, DC 200...",Find out for yourself why everyone is calling ...,3.0,Yes,-77.019235,38.893138,2-3 hours,Newseum,+1 888-639-7386,...,0.49,0.506667,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,4,"1000 5th Ave, New York City, NY 10028-0198",At New York City's most visited museum and att...,12.0,Yes,-73.962928,40.779166,2-3 hours,The Metropolitan Museum of Art,1 212-535-7710,...,0.318182,0.477273,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,5,"945 Magazine Street, New Orleans, LA 70130-3813","Founded by historian and author, Stephen Ambro...",11.0,,-90.070086,29.943004,,The National WWII Museum,+1 504-528-1944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,6,"2001 N Colorado Blvd, Denver, CO 80205-5798",The Denver Museum of Nature & Science is the R...,0.0,,-104.94102,39.769189,,Denver Museum of Nature & Science,303-370-6000,...,0.1,0.4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,7,"111 S Michigan Ave, Chicago, IL 60603-6488","This Classical Renaissance structure, guarded ...",5.0,Yes,-87.623724,41.879547,More than 3 hours,Art Institute of Chicago,312 443 3600,...,0.3,0.3375,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


### KNN Imputation

In [4]:
museum_df = pd.read_csv("dummified_df.csv")
museum_df.columns.tolist()
idx_to_drop = [0,1,2,4,5,6,7,9,14,15]
museum_df = museum_df.drop(museum_df.columns[idx_to_drop], axis=1)
museum_df.columns.tolist()
# impute missing value using knn imputation
### k was determined by the one yielding decent prediction for classification 'Rating' in 'TripAdvisor_Rating_Prediction.ipynb'
### but I changed my mind to imput with k = sqrt(n) which is 40
no_name_df = museum_df.drop(museum_df.columns[[1]], axis = 1)
X_filled_knn = KNN(k = 40).complete(no_name_df)
length = no_name_df.shape[0]
imputed_df = pd.DataFrame(data = X_filled_knn,
                          index= range(0,length),
                          columns = no_name_df.columns)
# marge museum name back to imputed dataframe
merged_df = pd.concat([museum_df['MuseumName'], imputed_df], axis=1)
# write imputed dataframe into .csv
merged_df.to_csv('./app/data/imputed_df_with_name.csv')

Computing pairwise distances between 1603 samples
Computing distances for sample #1/1603, elapsed time: 0.442
Computing distances for sample #101/1603, elapsed time: 0.617
Computing distances for sample #201/1603, elapsed time: 0.780
Computing distances for sample #301/1603, elapsed time: 0.911
Computing distances for sample #401/1603, elapsed time: 1.065
Computing distances for sample #501/1603, elapsed time: 1.214
Computing distances for sample #601/1603, elapsed time: 1.357
Computing distances for sample #701/1603, elapsed time: 1.479
Computing distances for sample #801/1603, elapsed time: 1.619
Computing distances for sample #901/1603, elapsed time: 1.750
Computing distances for sample #1001/1603, elapsed time: 1.886
Computing distances for sample #1101/1603, elapsed time: 2.010
Computing distances for sample #1201/1603, elapsed time: 2.147
Computing distances for sample #1301/1603, elapsed time: 2.282
Computing distances for sample #1401/1603, elapsed time: 2.419
Computing distanc

In [5]:
# before imputation
museum_df.describe()



Unnamed: 0,FeatureCount,Rank,Rating,ReviewCount,TotalThingsToDo,RankPercentage,Ancient Ruins,Architectural Buildings,Arenas & Stadiums,Art Galleries,...,descri_pol,descri_sub,LengthOfVisit__1-2 hours,LengthOfVisit__2-3 hours,LengthOfVisit__<1 hour,LengthOfVisit__More than 3 hours,Country__Other,Country__USA,Fee__No,Fee__Yes
count,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,...,1161.0,1161.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0
mean,1.007486,16.555209,4.4267,1540.467249,243.429195,9.297117,0.001248,0.006862,0.001871,0.006862,...,0.179419,0.397966,0.149719,0.123518,0.021834,0.054273,0.388022,0.611978,0.032439,0.115409
std,2.149806,28.900707,0.285016,3906.633803,308.115561,10.134265,0.035311,0.082579,0.043234,0.082579,...,0.18551,0.221829,0.356907,0.329134,0.146187,0.226627,0.487452,0.487452,0.177219,0.319614
min,0.0,1.0,2.5,32.0,1.0,0.0693,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,4.5,213.0,48.0,2.941176,0.0,0.0,0.0,0.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,6.0,4.5,510.0,142.0,6.25,0.0,0.0,0.0,0.0,...,,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,1.0,19.0,4.5,1257.0,314.5,12.5,0.0,0.0,0.0,0.0,...,,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,27.0,397.0,5.0,63112.0,2279.0,100.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# after imputation
merged_df.describe()

Unnamed: 0,FeatureCount,Rank,Rating,ReviewCount,TotalThingsToDo,RankPercentage,Ancient Ruins,Architectural Buildings,Arenas & Stadiums,Art Galleries,...,descri_pol,descri_sub,LengthOfVisit__1-2 hours,LengthOfVisit__2-3 hours,LengthOfVisit__<1 hour,LengthOfVisit__More than 3 hours,Country__Other,Country__USA,Fee__No,Fee__Yes
count,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,...,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0
mean,1.007486,16.555209,4.4267,1540.467249,243.429195,9.297117,0.001248,0.006862,0.001871,0.006862,...,0.17722,0.39355,0.149719,0.123518,0.021834,0.054273,0.388022,0.611978,0.032439,0.115409
std,2.149806,28.900707,0.285016,3906.633803,308.115561,10.134265,0.035311,0.082579,0.043234,0.082579,...,0.159632,0.190507,0.356907,0.329134,0.146187,0.226627,0.487452,0.487452,0.177219,0.319614
min,0.0,1.0,2.5,32.0,1.0,0.0693,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,4.5,213.0,48.0,2.941176,0.0,0.0,0.0,0.0,...,0.095239,0.310554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,6.0,4.5,510.0,142.0,6.25,0.0,0.0,0.0,0.0,...,0.166667,0.38926,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,1.0,19.0,4.5,1257.0,314.5,12.5,0.0,0.0,0.0,0.0,...,0.24556,0.484623,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,27.0,397.0,5.0,63112.0,2279.0,100.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
merged_df.describe().to_csv('summary_after_knn_imputation.csv')

### Miscellaneous Note/Code

In [63]:
'Feature Name'.replace(' ', '_')

'Feature_Name'

In [30]:
# data exploratory use
tag_freq_lst = get_tag_cloud_lst(tag_cloud)
target_tags = tag_freq_lst[0:30]
target_tags

[('on display', 792),
 ('gift shop', 415),
 ('rainy day', 300),
 ('couple of hours', 289),
 ('all ages', 287),
 ('special exhibits', 186),
 ('few hours', 180),
 ('two hours', 179),
 ('exhibits', 166),
 ('well worth a visit', 165),
 ('permanent collection', 164),
 ('worth a visit', 161),
 ('free admission', 154),
 ('audio guide', 153),
 ('beautiful building', 138),
 ('great collection', 129),
 ('information', 127),
 ('interesting exhibits', 127),
 ('entrance fee', 121),
 ('great for kids', 120),
 ('interactive exhibits', 117),
 ('hands on activities', 115),
 ('great exhibits', 110),
 ('great place to visit', 99),
 ('interactive displays', 94),
 ('guided tour', 92),
 ('well worth the visit', 91),
 ('whole family', 83),
 ('kids and adults', 83),
 ('traveling exhibits', 82)]

In [40]:
# However...I am not sure why precise ratings are so different from
# the ratings displayed on Tripadvisor (maybe the metric is different?)
test_df = pd.DataFrame({'precise_rating':merged_df['PreciseRating'], 
                        'rating': merged_df['Rating'], 
                        'museum': merged_df['MuseumName']})
test_df.head(10)

Unnamed: 0,museum,precise_rating,rating
0,Newseum,4.471866,4.5
4,The Metropolitan Museum of Art,2.747563,5.0
5,The National WWII Museum,4.6031,5.0
6,Denver Museum of Nature & Science,4.450858,4.5
7,Art Institute of Chicago,4.140742,5.0
8,The National 9/11 Memorial & Museum,3.138469,4.5
9,Museum of Fine Arts,0.462956,4.5
14,The Field Museum,3.869455,4.5
19,Smithsonian National Museum of Natural History,3.783377,4.5
24,Smithsonian National Air and Space Museum,3.472054,4.5


In [39]:
target_tags = tag_freq_lst[0:60]
target_tags

[('on display', 792),
 ('gift shop', 415),
 ('rainy day', 300),
 ('couple of hours', 289),
 ('all ages', 287),
 ('special exhibits', 186),
 ('few hours', 180),
 ('two hours', 179),
 ('exhibits', 166),
 ('well worth a visit', 165),
 ('permanent collection', 164),
 ('worth a visit', 161),
 ('free admission', 154),
 ('audio guide', 153),
 ('beautiful building', 138),
 ('great collection', 129),
 ('information', 127),
 ('interesting exhibits', 127),
 ('entrance fee', 121),
 ('great for kids', 120),
 ('interactive exhibits', 117),
 ('hands on activities', 115),
 ('great exhibits', 110),
 ('great place to visit', 99),
 ('interactive displays', 94),
 ('guided tour', 92),
 ('well worth the visit', 91),
 ('whole family', 83),
 ('kids and adults', 83),
 ('traveling exhibits', 82),
 ('his life', 82),
 ('local history', 77),
 ('amazing collection', 77),
 ('free entry', 74),
 ('worth the trip', 74),
 ('great history', 73),
 ('civil war', 72),
 ('take your time', 71),
 ('nice collection', 70),
 ('in

In [40]:
len(tag_freq_lst)

10889

In [58]:
merged_df.head()

Unnamed: 0,Address,Description,FeatureCount,Fee,Langtitude,Latitude,LengthOfVisit,MuseumName,PhoneNum,Rank,...,review_pol_var,review_pol_mean,review_pol_max,review_pol_min,review_sub_var,review_sub_mean,review_sub_max,review_sub_min,descri_pol,descri_sub
0,"555 Pennsylvania Ave NW, Washington DC, DC 200...",Find out for yourself why everyone is calling ...,3.0,Yes,-77.019235,38.893138,2-3 hours,Newseum,+1 888-639-7386,8.0,...,0.02376,0.334677,0.611111,0.0,0.014521,0.509036,0.754861,0.342857,0.49,0.506667
4,"1000 5th Ave, New York City, NY 10028-0198",At New York City's most visited museum and att...,12.0,Yes,-73.962928,40.779166,2-3 hours,The Metropolitan Museum of Art,1 212-535-7710,2.0,...,0.018057,0.318339,0.634375,0.127083,0.01828,0.537971,0.7,0.291667,0.318182,0.477273
5,"945 Magazine Street, New Orleans, LA 70130-3813","Founded by historian and author, Stephen Ambro...",11.0,,-90.070086,29.943004,,The National WWII Museum,+1 504-528-1944,1.0,...,0.010222,0.333493,0.5,0.140783,0.0191,0.508205,0.804861,0.333333,0.0,0.0
6,"2001 N Colorado Blvd, Denver, CO 80205-5798",The Denver Museum of Nature & Science is the R...,0.0,,-104.94102,39.769189,,Denver Museum of Nature & Science,303-370-6000,4.0,...,0.022625,0.429964,0.664286,0.192262,0.014845,0.584561,0.783333,0.438095,0.1,0.4
7,"111 S Michigan Ave, Chicago, IL 60603-6488","This Classical Renaissance structure, guarded ...",5.0,Yes,-87.623724,41.879547,More than 3 hours,Art Institute of Chicago,312 443 3600,1.0,...,0.030203,0.299117,0.681333,0.125,0.02538,0.530728,0.788333,0.266667,0.3,0.3375
