# Appendix B

In [1]:
# import necessary packages
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA

In [None]:
# Load the raw train/test splits; the three date columns are parsed into datetimes on read
_date_columns = ['host_since', 'first_review', 'last_review']
raw_train = pd.read_csv("data/train.csv", parse_dates=_date_columns)
raw_test = pd.read_csv("data/test.csv", parse_dates=_date_columns)

## Model 1 Feature Engineering
### Training Dataset

In [None]:
# Builds the Model 1 training feature frame (model1_train) from raw_train.

# Features used as-is, without any transformation
good_to_go_train = raw_train[['host_total_listings_count', 'calculated_host_listings_count',
                              'accommodates',
                              'availability_30', 'availability_60', 'availability_90', 'availability_365',
                              'minimum_nights', 'maximum_nights',
                              'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d']].copy()

# Boolean flags become 0/1 indicators; a missing superhost/availability flag is treated as False
bools_train = raw_train[['host_is_superhost', "host_has_profile_pic", 'host_identity_verified', 'has_availability', 'instant_bookable']].copy()
bools_train.loc[bools_train["host_is_superhost"].isna(), 'host_is_superhost'] = False
bools_train.loc[bools_train["has_availability"].isna(), 'has_availability'] = False
bools_train = bools_train.astype(int)

# Raw coordinates plus a copy rotated by 32 degrees.
# The rotation is intended to make Manhattan vertical so trees can roughly
# split along streets and avenues.
coords_train = raw_train[['longitude', 'latitude']].copy()
theta = np.radians(32)
rotation_matrix = np.array([
    [np.cos(theta), -np.sin(theta)],
    [np.sin(theta), np.cos(theta)]
])

coords = coords_train[['longitude', 'latitude']].values
rotated_coords = coords @ rotation_matrix.T
coords_train['Rotated Longitude'] = rotated_coords[:, 0]
# NOTE(review): the label 'Rotate Latitude' (missing "d") is kept unchanged so the
# exported feature names stay the same for any downstream consumer.
coords_train['Rotate Latitude'] = rotated_coords[:, 1]

# Host tenure in days, measured back from the latest host_since date in the training set
dates_train = (raw_train["host_since"].max() - raw_train["host_since"]).dt.days

# One-hot encode the categorical variables.
# dummy_na=True always creates a " : nan" column per variable; the ones for neighbourhood
# group and room type are dropped, while "Response Time : nan" is kept as a missingness flag.
categorical_train = raw_train[['neighbourhood_group_cleansed', 'host_response_time', 'room_type']].copy()
categorical_train = pd.get_dummies(categorical_train, prefix=["Neighborhood Group", "Response Time", "Room Type"],
                                   prefix_sep = " : ", dummy_na=True, drop_first=False).drop(columns=["Neighborhood Group : nan", "Room Type : nan"]).astype(int)

# Indicators for amenities that occur at least 500 times in the training set.
# NOTE(review): with this string-based parsing an empty list "[]" becomes [''], so such a
# listing gets "Total Amenities" == 1 and '' is counted like an amenity. Left unchanged so
# the train and test cells keep parsing identically.
amenities_as_lists = raw_train['amenities'].apply(lambda x: x.replace('[', '').replace(']', '').replace('"', '').split(', '))
# amenity -> occurrence count; the dict keeps first-seen order, which fixes the column order
unique_amenities = {}
# the loop variable is deliberately not called `list`: that would shadow the builtin
# for every cell executed afterwards
for amenity_list in amenities_as_lists:
    for item in amenity_list:
        unique_amenities[item] = unique_amenities.get(item, 0) + 1
# amenities occurring at least 500 times
amenitities_to_dummy = [amenity for amenity, count in unique_amenities.items() if count >= 500]

amenity_train = pd.DataFrame({f'Amenity : {amenity}': amenities_as_lists.apply(lambda x: amenity in x) for amenity in amenitities_to_dummy}).astype(int)
amenity_train["Total Amenities"] = amenities_as_lists.apply(len)

# Features that need imputation or other wrangling
wrangle_train = raw_train[["host_response_rate", "host_acceptance_rate", "bathrooms", "bedrooms", "beds"]].copy()
# missing rates get the sentinel value -1 so missingness is its own value
wrangle_train.loc[wrangle_train["host_response_rate"].isna(), "host_response_rate"] = -1
wrangle_train.loc[wrangle_train["host_acceptance_rate"].isna(), "host_acceptance_rate"] = -1
# missing room counts are imputed with 1 (described as the mode by the original author)
wrangle_train.loc[wrangle_train["bathrooms"].isna(), "bathrooms"] = 1
wrangle_train.loc[wrangle_train["bedrooms"].isna(), "bedrooms"] = 1
wrangle_train.loc[wrangle_train["beds"].isna(), "beds"] = 1
# flag listings whose bathrooms_text mentions "shared" (case-insensitive)
wrangle_train["Shared Baths"] = raw_train["bathrooms_text"].apply(lambda x : "shared" in str(x).lower()).astype(int)
# share of each listing type among the host's calculated listing count
wrangle_train["Calculated Host Proportion : Entire Homes/Apts"] = raw_train['calculated_host_listings_count_entire_homes'] / raw_train['calculated_host_listings_count']
wrangle_train["Calculated Host Proportion : Private Rooms"] = raw_train['calculated_host_listings_count_private_rooms'] / raw_train['calculated_host_listings_count']
wrangle_train["Calculated Host Proportion : Shared Rooms"] = raw_train['calculated_host_listings_count_shared_rooms'] / raw_train['calculated_host_listings_count']

# Column-wise join of all feature groups into the final frame
model1_train = pd.concat([good_to_go_train, bools_train, coords_train, dates_train, categorical_train, amenity_train, wrangle_train], axis=1)

### Testing Dataset

In [None]:
# Builds the Model 1 testing feature frame (model1_test) from raw_test.
# Anything learned from data (reference date, dummy columns, amenity list) comes from the
# training cell above, so that cell must have been run first.

# Features used as-is, without any transformation
good_to_go_test = raw_test[['host_total_listings_count', 'calculated_host_listings_count',
                              'accommodates',
                              'availability_30', 'availability_60', 'availability_90', 'availability_365',
                              'minimum_nights', 'maximum_nights',
                              'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d']].copy()

# Boolean flags become 0/1 indicators; a missing superhost/availability flag is treated as False
bools_test = raw_test[['host_is_superhost', "host_has_profile_pic", 'host_identity_verified', 'has_availability', 'instant_bookable']].copy()
bools_test.loc[bools_test["host_is_superhost"].isna(), 'host_is_superhost'] = False
bools_test.loc[bools_test["has_availability"].isna(), 'has_availability'] = False
bools_test = bools_test.astype(int)

# Raw coordinates plus a copy rotated by 32 degrees.
# The rotation is intended to make Manhattan vertical so trees can roughly
# split along streets and avenues.
coords_test = raw_test[['longitude', 'latitude']].copy()
theta = np.radians(32)
rotation_matrix = np.array([
    [np.cos(theta), -np.sin(theta)],
    [np.sin(theta), np.cos(theta)]
])

coords = coords_test[['longitude', 'latitude']].values
rotated_coords = coords @ rotation_matrix.T
coords_test['Rotated Longitude'] = rotated_coords[:, 0]
# NOTE(review): the label 'Rotate Latitude' (missing "d") is kept unchanged so the
# exported feature names stay the same for any downstream consumer.
coords_test['Rotate Latitude'] = rotated_coords[:, 1]

# Host tenure in days, measured from the same reference date as the training set
dates_test = (raw_train["host_since"].max() - raw_test["host_since"]).dt.days

# One-hot encode the categorical variables, then align to the training dummy columns.
# get_dummies only creates columns for categories present in raw_test, so without the
# reindex a category missing from the test split would make the normalization loop fail
# with a KeyError, and a test-only category would add a column the model never saw.
# Reindexing also removes the " : nan" columns that the training cell dropped.
categorical_test = raw_test[['neighbourhood_group_cleansed', 'host_response_time', 'room_type']].copy()
categorical_test = pd.get_dummies(categorical_test, prefix=["Neighborhood Group", "Response Time", "Room Type"],
                                  prefix_sep = " : ", dummy_na=True, drop_first=False)
categorical_test = categorical_test.reindex(columns=categorical_train.columns, fill_value=0).astype(int)

# Indicators for the amenities selected on the training set (amenitities_to_dummy)
amenities_as_lists = raw_test['amenities'].apply(lambda x: x.replace('[', '').replace(']', '').replace('"', '').split(', '))

amenity_test = pd.DataFrame({f'Amenity : {amenity}': amenities_as_lists.apply(lambda x: amenity in x) for amenity in amenitities_to_dummy}).astype(int)
amenity_test["Total Amenities"] = amenities_as_lists.apply(len)

# Features that need imputation or other wrangling
wrangle_test = raw_test[["host_response_rate", "host_acceptance_rate", "bathrooms", "bedrooms", "beds"]].copy()
# missing rates get the sentinel value -1 so missingness is its own value
wrangle_test.loc[wrangle_test["host_response_rate"].isna(), "host_response_rate"] = -1
wrangle_test.loc[wrangle_test["host_acceptance_rate"].isna(), "host_acceptance_rate"] = -1
# missing room counts are imputed with 1, the same value used for the training set
wrangle_test.loc[wrangle_test["bathrooms"].isna(), "bathrooms"] = 1
wrangle_test.loc[wrangle_test["bedrooms"].isna(), "bedrooms"] = 1
wrangle_test.loc[wrangle_test["beds"].isna(), "beds"] = 1
# flag listings whose bathrooms_text mentions "shared" (case-insensitive)
wrangle_test["Shared Baths"] = raw_test["bathrooms_text"].apply(lambda x : "shared" in str(x).lower()).astype(int)
# share of each listing type among the host's calculated listing count
wrangle_test["Calculated Host Proportion : Entire Homes/Apts"] = raw_test['calculated_host_listings_count_entire_homes'] / raw_test['calculated_host_listings_count']
wrangle_test["Calculated Host Proportion : Private Rooms"] = raw_test['calculated_host_listings_count_private_rooms'] / raw_test['calculated_host_listings_count']
wrangle_test["Calculated Host Proportion : Shared Rooms"] = raw_test['calculated_host_listings_count_shared_rooms'] / raw_test['calculated_host_listings_count']

# Column-wise join of all feature groups into the final frame
model1_test = pd.concat([good_to_go_test, bools_test, coords_test, dates_test, categorical_test, amenity_test, wrangle_test], axis=1)

### Normalization and Export

In [None]:
# make variable names more readable
model1_train.columns = [x.replace("_", " ") for x in model1_train.columns]
model1_test.columns = [x.replace("_", " ") for x in model1_test.columns]

model1_train_norm = model1_train.copy()
model1_test_norm = model1_test.copy()
# Min/max scaling. Both splits are scaled with the TRAINING minimum and range, so the
# test set never influences the scaling.
for col in model1_train.columns:
    col_min = model1_train[col].min()
    col_range = model1_train[col].max() - col_min
    # A column that is constant in training has zero range; dividing by it would fill
    # the column with NaN/inf. Use 1 instead so the training column becomes all zeros.
    if col_range == 0:
        col_range = 1

    model1_train_norm[col] = (model1_train[col] - col_min) / col_range
    model1_test_norm[col] = (model1_test[col] - col_min) / col_range
# export
model1_train_norm.to_csv('data/model1_training_features.csv', index=False)
model1_test_norm.to_csv('data/model1_testing_features.csv', index=False)

## Model 2 Feature Engineering
### Training Dataset

In [None]:
# features that are ready to go out of the box
good_to_go_train = raw_train[['host_total_listings_count', 'calculated_host_listings_count',
                              'accommodates',
                              'availability_30', 'availability_60', 'availability_90', 'availability_365',
                              'minimum_nights', 'maximum_nights',
                              'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d']].copy()

# features that require transformation from boolean to indicator
bools_train = raw_train[['host_is_superhost', "host_has_profile_pic", 'host_identity_verified', 'has_availability', 'instant_bookable']].copy()
bools_train.loc[bools_train["host_is_superhost"].isna(), 'host_is_superhost'] = False
bools_train.loc[bools_train["has_availability"].isna(), 'has_availability'] = False
bools_train = bools_train.astype(int)

# coordinate and rotated coordinate features
# Creating the rotated version such that Manhattan is exactly vertical
# so trees can roughly split along streets and avenues
coords_train = raw_train[['longitude', 'latitude']].copy()

theta = np.radians(32)
rotation_matrix = np.array([
    [np.cos(theta), -np.sin(theta)],
    [np.sin(theta), np.cos(theta)]
])

coords = coords_train[['longitude', 'latitude']].values
rotated_coords = coords @ rotation_matrix.T
coords_train['Rotated Longitude'] = rotated_coords[:, 0]-
coords_train['Rotate Latitude'] = rotated_coords[:, 1]

# date variables (transform to "days since")
dates_train = (raw_train["host_since"].max() - raw_train["host_since"]).dt.days

# create dummies for categorical vairables
# Filter for neighborhoods wth over 20 properties in the training dataset to create dummy varaibles
hoods = raw_train["neighbourhood_cleansed"].value_counts()
hoods_to_dummy = (hoods[hoods >= 20].index).tolist()
# Filter for property types with more than 20 properties in the training dataset to create dummy variables
prop_types = raw_train["property_type"].value_counts()
prop_types_to_dummy = (prop_types[prop_types >= 20].index).tolist()

categorical_train = raw_train[['property_type', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'host_response_time', 'room_type']].copy()
categorical_train.loc[categorical_train["property_type"].apply(lambda x : x not in prop_types_to_dummy), "property_type"] = "Other"
categorical_train.loc[categorical_train["neighbourhood_cleansed"].apply(lambda x : x not in hoods_to_dummy), "neighbourhood_cleansed"] = "Other"
categorical_train = pd.get_dummies(categorical_train, prefix=["Property Type", "Neighborhood", "Neighborhood Group", "Response Time", "Room Type"],
                                   prefix_sep = " : ", dummy_na=True, drop_first=False).drop(columns=["Property Type : nan",
                                                                                                      "Neighborhood : nan",
                                                                                                      "Neighborhood Group : nan",
                                                                                                      "Room Type : nan"]).astype(int)

# Create Indicators for Amenities that appear more than 100 times in training set
amenities_as_lists = raw_train['amenities'].apply(lambda x: x.replace('[', '').replace(']', '').replace('"', '').split(', '))

unique_amenities = {}

for list in amenities_as_lists:
    for item in list:
        if item in unique_amenities:
            unique_amenities[item] = unique_amenities[item] + 1
        else:
            unique_amenities[item] = 1

amenitities_to_dummy = [amenity for amenity, count in unique_amenities.items() if count >= 100]

amenity_train = pd.DataFrame({f'Amenity : {amenity}': amenities_as_lists.apply(lambda x: amenity in x) for amenity in amenitities_to_dummy}).astype(int)
amenity_train["Total Amenities"] = amenities_as_lists.apply(lambda x : len(x))

# features that require imputation and other wrangling
wrangle_train = raw_train[["host_response_rate", "host_acceptance_rate", "bathrooms", "bedrooms", "beds"]].copy()
# impute missingness as its own value
wrangle_train.loc[wrangle_train["host_response_rate"].isna(), "host_response_rate"] = -1
wrangle_train.loc[wrangle_train["host_acceptance_rate"].isna(), "host_acceptance_rate"] = -1
# impute the mode
wrangle_train.loc[wrangle_train["bathrooms"].isna(), "bathrooms"] = 1
wrangle_train.loc[wrangle_train["bedrooms"].isna(), "bedrooms"] = 1
wrangle_train.loc[wrangle_train["beds"].isna(), "beds"] = 1
# Create ratios from calculated counts fields
wrangle_train["Shared Baths"] = raw_train["bathrooms_text"].apply(lambda x : "shared" in str(x).lower()).astype(int)
wrangle_train["Calculated Host Proportion : Entire Homes/Apts"] = raw_train['calculated_host_listings_count_entire_homes'] / raw_train['calculated_host_listings_count']
wrangle_train["Calculated Host Proportion : Private Rooms"] = raw_train['calculated_host_listings_count_private_rooms'] / raw_train['calculated_host_listings_count']
wrangle_train["Calculated Host Proportion : Shared Rooms"] = raw_train['calculated_host_listings_count_shared_rooms'] / raw_train['calculated_host_listings_count']

# Curate All review Data
review_train = raw_train[['first_review', 'last_review',
                          'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value']].copy()
# impute min/max value and take ""days since"
review_train.loc[review_train["first_review"].isna(), 'first_review'] = raw_train["first_review"].max()
review_train.loc[review_train["last_review"].isna(), 'last_review'] = raw_train["last_review"].min()
review_train["first_review"] = (raw_train["first_review"].max() - review_train["first_review"]).dt.days
review_train["last_review"] = (raw_train["last_review"].max() - review_train["last_review"]).dt.days
# impute missingness as its own value
review_train.loc[review_train["review_scores_rating"].isna(), 'review_scores_rating'] = -1
review_train.loc[review_train["review_scores_accuracy"].isna(), 'review_scores_accuracy'] = -1
review_train.loc[review_train["review_scores_cleanliness"].isna(), 'review_scores_cleanliness'] = -1
review_train.loc[review_train["review_scores_checkin"].isna(), 'review_scores_checkin'] = -1
review_train.loc[review_train["review_scores_communication"].isna(), 'review_scores_communication'] = -1
review_train.loc[review_train["review_scores_location"].isna(), 'review_scores_location'] = -1
review_train.loc[review_train["review_scores_value"].isna(), 'review_scores_value'] = -1

review_train["Has Reviews"] = (raw_train["number_of_reviews"] == 0).astype(int)

# join for final data frame
model2_train = pd.concat([good_to_go_train, bools_train, coords_train, dates_train, categorical_train, amenity_train, wrangle_train, review_train], axis=1)

### Testing Dataset

In [None]:
# features that are ready to go out of the box
good_to_go_test = raw_test[['host_total_listings_count', 'calculated_host_listings_count',
                              'accommodates',
                              'availability_30', 'availability_60', 'availability_90', 'availability_365',
                              'minimum_nights', 'maximum_nights',
                              'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d']].copy()

# features that require transformation from boolean to indicator
bools_test = raw_test[['host_is_superhost', "host_has_profile_pic", 'host_identity_verified', 'has_availability', 'instant_bookable']].copy()
bools_test.loc[bools_test["host_is_superhost"].isna(), 'host_is_superhost'] = False
bools_test.loc[bools_test["has_availability"].isna(), 'has_availability'] = False
bools_test = bools_test.astype(int)

# coordinate and rotated coordinate features
# Creating the rotated version such that Manhattan is exactly vertical
# so trees can roughly split along streets and avenues
coords_test = raw_test[['longitude', 'latitude']].copy()

theta = np.radians(32)
rotation_matrix = np.array([
    [np.cos(theta), -np.sin(theta)],
    [np.sin(theta), np.cos(theta)]
])

coords = coords_test[['longitude', 'latitude']].values
rotated_coords = coords @ rotation_matrix.T
coords_test['Rotated Longitude'] = rotated_coords[:, 0]
coords_test['Rotate Latitude'] = rotated_coords[:, 1]

# date variables (transform to "days since")
dates_test = (raw_train["host_since"].max() - raw_test["host_since"]).dt.days

# create dummies for categorical vairables
categorical_test = raw_test[['property_type', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'host_response_time', 'room_type']].copy()
categorical_test.loc[categorical_test["property_type"].apply(lambda x : x not in prop_types_to_dummy), "property_type"] = "Other"
categorical_test.loc[categorical_test["neighbourhood_cleansed"].apply(lambda x : x not in hoods_to_dummy), "neighbourhood_cleansed"] = "Other"
categorical_test = pd.get_dummies(categorical_test, prefix=["Property Type", "Neighborhood", "Neighborhood Group", "Response Time", "Room Type"],
                                   prefix_sep = " : ", dummy_na=True, drop_first=False).drop(columns=["Property Type : nan",
                                                                                                      "Neighborhood : nan",
                                                                                                      "Neighborhood Group : nan",
                                                                                                      "Room Type : nan"]).astype(int)

# Create Indicators for Amenities that appear more than 100 times in the training set
amenities_as_lists = raw_test['amenities'].apply(lambda x: x.replace('[', '').replace(']', '').replace('"', '').split(', '))

amenity_test = pd.DataFrame({f'Amenity : {amenity}': amenities_as_lists.apply(lambda x: amenity in x) for amenity in amenitities_to_dummy}).astype(int)
amenity_test["Total Amenities"] = amenities_as_lists.apply(lambda x : len(x))

# features that require imputation and other wrangling
wrangle_test = raw_test[["host_response_rate", "host_acceptance_rate", "bathrooms", "bedrooms", "beds"]].copy()
# impute missingness as its own value
wrangle_test.loc[wrangle_test["host_response_rate"].isna(), "host_response_rate"] = -1
wrangle_test.loc[wrangle_test["host_acceptance_rate"].isna(), "host_acceptance_rate"] = -1
# impute the mode
wrangle_test.loc[wrangle_test["bathrooms"].isna(), "bathrooms"] = 1
wrangle_test.loc[wrangle_test["bedrooms"].isna(), "bedrooms"] = 1
wrangle_test.loc[wrangle_test["beds"].isna(), "beds"] = 1
# Create ratios from calculated counts fields
wrangle_test["Shared Baths"] = raw_test["bathrooms_text"].apply(lambda x : "shared" in str(x).lower()).astype(int)
wrangle_test["Calculated Host Proportion : Entire Homes/Apts"] = raw_test['calculated_host_listings_count_entire_homes'] / raw_test['calculated_host_listings_count']
wrangle_test["Calculated Host Proportion : Private Rooms"] = raw_test['calculated_host_listings_count_private_rooms'] / raw_test['calculated_host_listings_count']
wrangle_test["Calculated Host Proportion : Shared Rooms"] = raw_test['calculated_host_listings_count_shared_rooms'] / raw_test['calculated_host_listings_count']

# Curate all Review Data
review_test = raw_test[['first_review', 'last_review',
                          'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value']].copy()
# impute min/max value and take ""days since"
review_test.loc[review_test["first_review"].isna(), 'first_review'] = raw_train["first_review"].max()
review_test.loc[review_test["last_review"].isna(), 'last_review'] = raw_train["last_review"].min()
review_test["first_review"] = (raw_train["first_review"].max() - review_test["first_review"]).dt.days
review_test["last_review"] = (raw_train["last_review"].max() - review_test["last_review"]).dt.days
# impute missingness as its own value
review_test.loc[review_test["review_scores_rating"].isna(), 'review_scores_rating'] = -1
review_test.loc[review_test["review_scores_accuracy"].isna(), 'review_scores_accuracy'] = -1
review_test.loc[review_test["review_scores_cleanliness"].isna(), 'review_scores_cleanliness'] = -1
review_test.loc[review_test["review_scores_checkin"].isna(), 'review_scores_checkin'] = -1
review_test.loc[review_test["review_scores_communication"].isna(), 'review_scores_communication'] = -1
review_test.loc[review_test["review_scores_location"].isna(), 'review_scores_location'] = -1
review_test.loc[review_test["review_scores_value"].isna(), 'review_scores_value'] = -1

review_test["Has Reviews"] = (raw_test["number_of_reviews"] == 0).astype(int)

# join for final data frame
model2_test = pd.concat([good_to_go_test, bools_test, coords_test, dates_test, categorical_test, amenity_test, wrangle_test, review_test], axis=1)

### Normalization and Export

In [None]:
# Make feature names more readable
model2_train.columns = [x.replace("_", " ") for x in model2_train.columns]
model2_test.columns = [x.replace("_", " ") for x in model2_test.columns]

model2_train_norm = model2_train.copy()
model2_test_norm = model2_test.copy()
# min/max standardization
for col in model2_train.columns:
    model2_train_norm[col] = model2_train_norm[col] - model2_train[col].min()
    model2_train_norm[col] = model2_train_norm[col] / (model2_train[col].max() - model2_train[col].min())

    model2_test_norm[col] = model2_test_norm[col] - model2_train[col].min()
    model2_test_norm[col] = model2_test_norm[col] / (model2_train[col].max() - model2_train[col].min())
# export
model2_train_norm.to_csv('data/model2_training_features.csv', index=False)
model2_test_norm.to_csv('data/model2_testing_features.csv', index=False)

## Model 3 Feature Engineering

### Computing Influential N-Grams

In [None]:
# write a function to replace punctuation with white space
def clean_text(text):
    """Lowercase ``text`` and replace every punctuation character with a space.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None`` and NaN-like missing values are treated as empty text,
        so the function can be applied to a column that still contains gaps.

    Returns
    -------
    str
        The lowercased text in which each character that is neither a word
        character (``\\w``) nor whitespace has been replaced by a single space.
        Punctuation is replaced rather than deleted, so words separated only by
        punctuation stay separate tokens.
    """
    # Missing cells are not strings and have no .lower(); map them to empty text
    if text is None or (not isinstance(text, str) and pd.isna(text)):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', ' ', text)  # Replace punctuation with a space
    return text

In [None]:
# Normalise the three free-text fields of the training set.
# Missing descriptions/reviews become empty strings first; names are cleaned directly.
clean_names = raw_train['name'].map(clean_text)
clean_descriptions = raw_train['description'].fillna("").map(clean_text)
# reviews additionally have their newline characters removed after cleaning
clean_reviews = raw_train['reviews'].fillna("").map(clean_text).map(lambda review: re.sub(r'\n', '', review))

# One vectorizer per text field; min_df keeps only n-grams that occur in at least
# that many training documents.
name_vectorizer = CountVectorizer(ngram_range=(1, 5), min_df=50)
description_vectorizer = CountVectorizer(ngram_range=(2, 6), min_df=150)
review_vectorizer = CountVectorizer(ngram_range=(3, 3), min_df=350)

# Fit each vectorizer and count n-gram occurrences per listing
name_counts = name_vectorizer.fit_transform(clean_names)
description_counts = description_vectorizer.fit_transform(clean_descriptions)
review_counts = review_vectorizer.fit_transform(clean_reviews)

# Dense data frames with one column per n-gram
name_ngrams = pd.DataFrame(name_counts.toarray(),
                           columns=name_vectorizer.get_feature_names_out())
description_ngrams = pd.DataFrame(description_counts.toarray(),
                                  columns=description_vectorizer.get_feature_names_out())
review_ngrams = pd.DataFrame(review_counts.toarray(),
                             columns=review_vectorizer.get_feature_names_out())

In [None]:
# Compute approximate mutual information of these indicators with the response variable
name_ngram_mututal_information = mutual_info_regression(name_ngrams, raw_train['price'])
description_ngram_mututal_information = mutual_info_regression(description_ngrams, raw_train['price'])
review_ngram_mututal_information = mutual_info_regression(review_ngrams, raw_train['price'])

In [None]:
# Take n-gram indicators with the most mutual information with the response variable
name_top50 = [feature for _, feature in sorted(zip(name_ngram_mututal_information, name_ngrams.columns), reverse=True)[:50]]
description_top100 = [feature for _, feature in sorted(zip(description_ngram_mututal_information, description_ngrams.columns), reverse=True)[:100]]
review_top100 = [feature for _, feature in sorted(zip(review_ngram_mututal_information, review_ngrams.columns), reverse=True)[:100]]

### Creating Textual Training/Testing Data

In [None]:
# Apply the same text cleaning to the test split (missing descriptions/reviews -> "")
clean_names_test = raw_test['name'].map(clean_text)
clean_descriptions_test = raw_test['description'].fillna("").map(clean_text)
clean_reviews_test = raw_test['reviews'].fillna("").map(clean_text).map(lambda review: re.sub(r'\n', '', review))

# Vectorizers restricted to the selected n-grams; with a fixed vocabulary they are
# used via transform() directly, without a fit step
name_transformer = CountVectorizer(ngram_range=(1, 5), vocabulary=name_top50)
description_transformer = CountVectorizer(ngram_range=(2, 6), vocabulary=description_top100)
review_transformer = CountVectorizer(ngram_range=(3, 3), vocabulary=review_top100)

# Training-set counts for the selected n-grams
name_counts_train = name_transformer.transform(clean_names).toarray()
description_counts_train = description_transformer.transform(clean_descriptions).toarray()
review_counts_train = review_transformer.transform(clean_reviews).toarray()

# Column labels, shared by the training and testing frames
name_ngram_columns = [f'Name NGram : {gram}' for gram in name_transformer.get_feature_names_out()]
description_ngram_columns = [f'Description NGram : {gram}' for gram in description_transformer.get_feature_names_out()]
review_ngram_columns = [f'Review NGram : {gram}' for gram in review_transformer.get_feature_names_out()]

# Training frames
name_ngrams_train = pd.DataFrame(name_counts_train, columns=name_ngram_columns)
description_ngrams_train = pd.DataFrame(description_counts_train, columns=description_ngram_columns)
review_ngrams_train = pd.DataFrame(review_counts_train, columns=review_ngram_columns)

# Testing frames
name_ngrams_test = pd.DataFrame(name_transformer.transform(clean_names_test).toarray(),
                                columns=name_ngram_columns)
description_ngrams_test = pd.DataFrame(description_transformer.transform(clean_descriptions_test).toarray(),
                                       columns=description_ngram_columns)
review_ngrams_test = pd.DataFrame(review_transformer.transform(clean_reviews_test).toarray(),
                                  columns=review_ngram_columns)

### Joining, Normalization, and Export

In [None]:
# Join with existing model 2 data
model3_train = pd.concat([model2_train, name_ngrams_train, description_ngrams_train, review_ngrams_train], axis=1)
model3_test = pd.concat([model2_test, name_ngrams_test, description_ngrams_test, review_ngrams_test], axis=1)

model3_train_norm = model3_train.copy()
model3_test_norm = model3_test.copy()
# Mean/std normalization
for col in model2_train.columns:
    model3_train_norm[col] = model3_train_norm[col] - model3_train[col].mean()
    model3_train_norm[col] = model3_train_norm[col] / model3_train[col].std()

    model3_test_norm[col] = model3_test_norm[col] - model3_train[col].mean()
    model3_test_norm[col] = model3_test_norm[col] / model3_train[col].std()
# Export
model3_train_norm.to_csv('data/model3_training_features.csv', index=False)
model3_test_norm.to_csv('data/model3_testing_features.csv', index=False)