In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Colab-only: mount Google Drive so the CSV stored there can be read.
from google.colab import drive

drive.mount('/content/drive')
# NOTE(review): hard-coded Drive location; adjust when running from another account/environment.
path = '/content/drive/MyDrive/homestay_price/Homestays_Data.csv'
df = pd.read_csv(path)

In [None]:
# Inspect the raw DataFrame as loaded.
df

In [None]:
# Count missing values per column in the raw data.
df.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the feature columns.
y = df['log_price']
X = df.drop(columns=['log_price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
# Inspect the training features produced by the split.
X_train

In [None]:
from scipy.spatial.distance import cdist

def impute_and_target_encode(X_train, y_train, X_test, k=20):
    """
    Impute missing neighbourhoods from the nearest labelled training listing
    (Manhattan distance on latitude/longitude) and add a smoothed
    (credibility-weighted) target encoding of the neighbourhood.

    For a neighbourhood with ``n`` training rows and mean target ``m`` the
    encoded value is ``w * m + (1 - w) * global_mean`` with ``w = n / (n + k)``.
    All statistics are computed from the training split only.

    Args:
        X_train (pd.DataFrame): Training features with 'latitude', 'longitude', 'neighbourhood'.
        y_train (pd.Series): Training target (e.g., 'log_price'), indexed like X_train.
        X_test (pd.DataFrame): Test features with 'latitude', 'longitude', 'neighbourhood'.
        k (int): The smoothing factor for credibility encoding. Higher values
                 require more data for a category's mean to be trusted.

    Returns:
        tuple: Copies of X_train and X_test with 'neighbourhood' imputed and a
               new 'neighbourhood_encoded' column. The inputs are not modified.
    """
    X_train_processed = X_train.copy()
    X_test_processed = X_test.copy()

    # Donor rows: training listings whose neighbourhood is known. Captured
    # before any imputation so train and test are filled from the same set.
    train_known_hoods = X_train_processed[X_train_processed['neighbourhood'].notna()]
    coords_known = train_known_hoods[['latitude', 'longitude']].values
    known_labels = train_known_hoods['neighbourhood'].to_numpy()
    # Rows of the distance matrix built at once; bounds peak memory without
    # changing which donor is selected.
    chunk_rows = 1024

    def _fill_from_nearest(frame):
        """Fill NaN neighbourhoods of ``frame`` in place from the nearest donor row."""
        missing_mask = frame['neighbourhood'].isna()
        if not missing_mask.any() or len(coords_known) == 0:
            # Nothing to fill, or nothing to borrow from; remaining NaNs are
            # encoded with the global mean further down.
            return
        coords_missing = frame.loc[missing_mask, ['latitude', 'longitude']].values
        nearest_indices = np.empty(len(coords_missing), dtype=np.intp)
        for start in range(0, len(coords_missing), chunk_rows):
            stop = start + chunk_rows
            distances = cdist(coords_missing[start:stop], coords_known, metric='cityblock')
            nearest_indices[start:stop] = np.argmin(distances, axis=1)
        frame.loc[missing_mask, 'neighbourhood'] = known_labels[nearest_indices]

    _fill_from_nearest(X_train_processed)
    _fill_from_nearest(X_test_processed)

    print("Imputation complete.")

    # Per-neighbourhood mean and row count of the target (training data only).
    global_mean = y_train.mean()
    agg = y_train.groupby(X_train_processed['neighbourhood']).agg(['mean', 'count'])

    # Credibility weight: approaches 1 as a neighbourhood accumulates rows.
    weight = agg['count'] / (agg['count'] + k)
    encoding_map = weight * agg['mean'] + (1 - weight) * global_mean

    print(f"Global Mean log_price: {global_mean:.4f}")
    print(f"K (smoothing factor): {k}\n")
    print("--- Encoding Map (Neighborhood -> Smoothed Mean) ---")
    print(encoding_map)
    print("-" * 50)

    # Assign the filled result back to the frame: calling fillna(inplace=True)
    # on a column selection does not update the frame under Copy-on-Write.
    # Neighbourhoods without a training statistic fall back to the global mean.
    X_train_processed['neighbourhood_encoded'] = (
        X_train_processed['neighbourhood'].map(encoding_map).fillna(global_mean)
    )
    X_test_processed['neighbourhood_encoded'] = (
        X_test_processed['neighbourhood'].map(encoding_map).fillna(global_mean)
    )

    print("Smoothed Target Encoding complete.")

    return X_train_processed, X_test_processed

X_train, X_test = impute_and_target_encode(X_train, y_train, X_test)

# X_train / X_test were rebound to the transformed frames on the line above,
# so the pre-transformation versions no longer exist here; printing them under
# an "Original" label would just show the same data twice.
print("\n--- Transformed X_train ---\n", X_train)
print("\n--- Transformed X_test ---\n", X_test)

In [None]:
# Count missing values per training column after the neighbourhood step.
X_train.isnull().sum()

In [None]:
# Correlation between the encoded neighbourhood and the target on the training rows.
# NOTE(review): the encoding was fitted on these same rows, so this figure is optimistic.
correlation = X_train['neighbourhood_encoded'].corr(y_train)

print(f"Correlation between neighbourhood_encoded and y_train: {correlation:.4f}")

In [None]:
def impute_with_median_and_indicator(X_train, X_test, columns):
    """
    Fill missing values in the given columns with the training-set median and
    add a ``<col>_is_missing`` 0/1 indicator for each of them.

    Args:
        X_train (pd.DataFrame): Training features DataFrame.
        X_test (pd.DataFrame): Test features DataFrame.
        columns (list): A list of column names to impute.

    Returns:
        tuple: Copies of X_train and X_test with the columns imputed and the
               indicator columns added. The inputs are not modified.
    """
    X_train_imputed = X_train.copy()
    X_test_imputed = X_test.copy()

    print("Starting Median Imputation")
    for col in columns:
        # The median comes from the training split only, so no test
        # information leaks into the fill value.
        median_val = X_train_imputed[col].median()
        print(f"Median for '{col}': {median_val}")

        # Record missingness before it is overwritten by the fill.
        X_train_imputed[f'{col}_is_missing'] = X_train_imputed[col].isna().astype(int)
        X_test_imputed[f'{col}_is_missing'] = X_test_imputed[col].isna().astype(int)

        # Assign back rather than fillna(inplace=True) on a column selection,
        # which does not update the frame under Copy-on-Write.
        X_train_imputed[col] = X_train_imputed[col].fillna(median_val)
        X_test_imputed[col] = X_test_imputed[col].fillna(median_val)

    print("Median imputation complete.")
    return X_train_imputed, X_test_imputed

In [None]:
def convert_response_rate_to_numeric(df):
    """
    Convert the 'host_response_rate' column from percentage strings such as
    '95%' to floats in [0, 1].

    Values that cannot be parsed become NaN. If the column is already numeric
    it is returned unchanged, so the conversion is safe to run more than once.

    Args:
        df (pd.DataFrame): Frame containing a 'host_response_rate' column.

    Returns:
        pd.DataFrame: A copy of ``df`` with the converted column; the input
        frame is not modified.
    """
    result = df.copy()
    rate = result['host_response_rate']
    if not pd.api.types.is_numeric_dtype(rate):
        stripped = rate.astype(str).str.strip().str.rstrip('%')
        # errors='coerce' turns anything unparsable (including the text form
        # of missing values) into NaN instead of raising.
        result['host_response_rate'] = (
            pd.to_numeric(stripped, errors='coerce').astype('float64') / 100.0
        )
    return result

# Convert the percentage strings in both splits to numeric values.
X_train = convert_response_rate_to_numeric(X_train)
X_test = convert_response_rate_to_numeric(X_test)

In [None]:
# Columns that receive a training-median fill plus a *_is_missing indicator.
impute_cols = ['bathrooms', 'bedrooms', 'beds', 'host_response_rate']
X_train, X_test = impute_with_median_and_indicator(X_train, X_test, columns=impute_cols)

In [None]:
# Inspect the training features after median imputation.
X_train

In [None]:
# Lift the column display limit (a session-wide pandas option) and preview the first rows.
pd.set_option('display.max_columns', None)
display(X_train.head())

In [None]:
# Remove the id, thumbnail_url and zipcode columns from both splits.
# NOTE(review): re-running this cell raises KeyError because the columns are already gone.
cols_to_drop = ['id', 'thumbnail_url', 'zipcode']
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

print("Columns dropped successfully.")
display(X_train.head())

In [None]:
# One entry per histogram:
# (column, headline printed above the summary, bin count, plot title, x-axis label)
review_plots = [
    ('number_of_reviews', "Distribution of 'number_of_reviews':", 50,
     'Distribution of Number of Reviews', 'Number of Reviews'),
    ('review_scores_rating', "\nDistribution of 'review_scores_rating':", 20,
     'Distribution of Review Scores Rating', 'Review Scores Rating'),
]

for review_col, headline, n_bins, plot_title, x_label in review_plots:
    # Summary statistics first, then the histogram of the non-missing values.
    print(headline)
    display(X_train[review_col].describe())

    plt.figure(figsize=(10, 5))
    sns.histplot(X_train[review_col].dropna(), bins=n_bins, kde=True)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
import pandas as pd
import numpy as np

def calculate_bayesian_average_score(X_train, X_test, C=20):
    """
    Add a 'credibility_score' column: a Bayesian average that pulls each
    listing's rating towards the global training average, more strongly the
    fewer reviews the listing has.

        score = (C * global_avg + n * rating) / (C + n)

    Missing 'review_scores_rating' values are first replaced by the global
    training average in both splits.

    Args:
        X_train (pd.DataFrame): Training features with 'review_scores_rating'
                                and 'number_of_reviews'.
        X_test (pd.DataFrame): Test features with 'review_scores_rating'
                               and 'number_of_reviews'.
        C (int): The smoothing factor or "prior count". A higher value
                 means a listing needs more reviews for its own rating
                 to be considered credible.

    Returns:
        tuple: Copies of X_train and X_test with the filled rating column and
               the new 'credibility_score' column. The inputs are not modified.
    """
    X_train_processed = X_train.copy()
    X_test_processed = X_test.copy()

    # The prior is estimated on the training split only.
    global_avg_rating = X_train_processed['review_scores_rating'].mean()
    print(f"Global Average Rating (from train set): {global_avg_rating:.2f}")
    print(f"Smoothing Factor (C): {C}")

    def _add_score(frame):
        """Fill missing ratings and attach the score column to ``frame`` in place."""
        # Assign back rather than fillna(inplace=True) on a column selection,
        # which does not update the frame under Copy-on-Write.
        frame['review_scores_rating'] = frame['review_scores_rating'].fillna(global_avg_rating)

        review_count = frame['number_of_reviews']
        denominator = C + review_count
        numerator = C * global_avg_rating + review_count * frame['review_scores_rating']
        # Column arithmetic replaces the former row-by-row apply. Where the
        # denominator is exactly zero (C == 0 and no reviews) there is no
        # evidence at all, so fall back to the prior.
        frame['credibility_score'] = (numerator / denominator).where(
            denominator != 0, global_avg_rating
        )

    _add_score(X_train_processed)
    _add_score(X_test_processed)

    print("'credibility_score' feature created successfully.")

    return X_train_processed, X_test_processed

# Prior count of 20 reviews, matching the function default.
X_train, X_test = calculate_bayesian_average_score(X_train, X_test, C=20)

In [None]:
# Preview the first three training rows.
X_train.head(3)

In [None]:
# Column dtypes and non-null counts of the training features.
X_train.info()

In [None]:
for col in ['host_has_profile_pic', 'host_identity_verified']:
    # The 1/0 keys make the mapping a no-op on already-converted values, so
    # re-running this cell no longer turns every entry into NaN.
    X_train[col] = X_train[col].map({'t': 1, 'f': 0, 1: 1, 0: 0})
    X_test[col] = X_test[col].map({'t': 1, 'f': 0, 1: 1, 0: 0})

print("Replaced 't' with 1 and 'f' with 0 for 'host_has_profile_pic' and 'host_identity_verified'.")
display(X_train[['host_has_profile_pic', 'host_identity_verified']].head())

In [None]:
for col in ['host_has_profile_pic', 'host_identity_verified']:
    # Assign back rather than fillna(inplace=True) on a column selection,
    # which does not update the frame under Copy-on-Write.
    X_train[col] = X_train[col].fillna(0)
    X_test[col] = X_test[col].fillna(0)

print("Imputed missing values with 0 for 'host_has_profile_pic' and 'host_identity_verified'.")
display(X_train[['host_has_profile_pic', 'host_identity_verified']].isnull().sum())
display(X_test[['host_has_profile_pic', 'host_identity_verified']].isnull().sum())

In [None]:
# Column dtypes and non-null counts after encoding the host flags.
X_train.info()

In [None]:
# Parse the review dates; values that cannot be parsed become NaT (errors='coerce').
X_train['last_review'] = pd.to_datetime(X_train['last_review'], errors='coerce')
X_test['last_review'] = pd.to_datetime(X_test['last_review'], errors='coerce')

# Fixed reference date, so the feature does not depend on when the notebook is run.
fixed_date = pd.to_datetime('2024-01-01')

# 'relevance' = whole days between the reference date and the last review
# (NaN where last_review is missing).
X_train['relevance'] = (fixed_date - X_train['last_review']).dt.days
X_test['relevance'] = (fixed_date - X_test['last_review']).dt.days

print("Created 'relevance' feature.")
display(X_train[['last_review', 'relevance']].head())

In [None]:
# Parse both review dates (last_review is converted again here so this cell
# also works on its own); unparseable values become NaT.
X_train['last_review'] = pd.to_datetime(X_train['last_review'], errors='coerce')
X_train['first_review'] = pd.to_datetime(X_train['first_review'], errors='coerce')
X_test['last_review'] = pd.to_datetime(X_test['last_review'], errors='coerce')
X_test['first_review'] = pd.to_datetime(X_test['first_review'], errors='coerce')

# 'active_period' = whole days between the first and the last review
# (NaN when either date is missing).
X_train['active_period'] = (X_train['last_review'] - X_train['first_review']).dt.days
X_test['active_period'] = (X_test['last_review'] - X_test['first_review']).dt.days

print("Created 'active_period' feature.")
display(X_train[['first_review', 'last_review', 'active_period']].head())

In [None]:
X_train = X_train.drop(columns=['last_review'])
X_test = X_test.drop(columns=['last_review'])

# Sentinel "days since last review" for listings without a usable last_review.
# NOTE(review): 14608 is roughly 40 years in days — confirm this is the intended value.
imputation_value = 14608
# Assign back rather than fillna(inplace=True) on a column selection,
# which does not update the frame under Copy-on-Write.
X_train['relevance'] = X_train['relevance'].fillna(imputation_value)
X_test['relevance'] = X_test['relevance'].fillna(imputation_value)

print("Removed 'last_review' and imputed 'relevance'.")
display(X_train[['relevance']].isnull().sum())
display(X_test[['relevance']].isnull().sum())

In [None]:
# Listings without both review dates get an active period of 0 days.
# Assign back rather than fillna(inplace=True) on a column selection,
# which does not update the frame under Copy-on-Write.
X_train['active_period'] = X_train['active_period'].fillna(0)
X_test['active_period'] = X_test['active_period'].fillna(0)

print("Imputed missing values with 0 for 'active_period'.")
display(X_train['active_period'].isnull().sum())
display(X_test['active_period'].isnull().sum())

In [None]:
# Column dtypes and non-null counts after the review-date features.
X_train.info()

In [None]:
# Parse the host sign-up date; values that cannot be parsed become NaT.
X_train['host_since'] = pd.to_datetime(X_train['host_since'], errors='coerce')
X_test['host_since'] = pd.to_datetime(X_test['host_since'], errors='coerce')

# Fixed reference date, so the feature does not depend on when the notebook is run.
fixed_date = pd.to_datetime('2024-01-01')

# Calculate host tenure in days
X_train['host_tenure'] = (fixed_date - X_train['host_since']).dt.days
X_test['host_tenure'] = (fixed_date - X_test['host_since']).dt.days

# Impute missing 'host_tenure' with the median from the training set and add indicator.
# The indicator is taken before the fill, and the result is assigned back:
# fillna(inplace=True) on a column selection does not update the frame under
# Copy-on-Write.
host_tenure_median = X_train['host_tenure'].median()
X_train['host_tenure_is_missing'] = X_train['host_tenure'].isna().astype(int)
X_test['host_tenure_is_missing'] = X_test['host_tenure'].isna().astype(int)
X_train['host_tenure'] = X_train['host_tenure'].fillna(host_tenure_median)
X_test['host_tenure'] = X_test['host_tenure'].fillna(host_tenure_median)


print("Created 'host_tenure' feature and imputed missing values.")
display(X_train[['host_since', 'host_tenure', 'host_tenure_is_missing']].head())
display(X_train['host_tenure'].isnull().sum())
display(X_test['host_tenure'].isnull().sum())

In [None]:
# Column dtypes and non-null counts after adding host_tenure.
X_train.info()

In [None]:
# Remove the raw date columns from both splits; 'active_period' and
# 'host_tenure' were derived from them in earlier cells.
X_train = X_train.drop(columns=['first_review', 'host_since'])
X_test = X_test.drop(columns=['first_review', 'host_since'])

print("Removed 'first_review' and 'host_since' columns.")
display(X_train.head())

In [None]:
# Per-column missing-value counts for both splits.
print("Missing values in X_train:")
display(X_train.isnull().sum())

print("\nMissing values in X_test:")
display(X_test.isnull().sum())

In [None]:
# Preview the first five training rows.
X_train.head(5)

In [None]:
# Categorical columns to visualise with one horizontal count plot each.
categorical_cols_to_plot = ['property_type', 'room_type', 'bed_type', 'city']

for col in categorical_cols_to_plot:
    plt.figure(figsize=(12, 6))
    # Bars are ordered by value_counts(), i.e. most frequent category first.
    # NOTE(review): recent seaborn releases warn about `palette` without `hue` — confirm against the installed version.
    sns.countplot(data=X_train, y=col, order=X_train[col].value_counts().index, palette='viridis')
    plt.title(f'Distribution of {col}')
    plt.xlabel('Count')
    plt.ylabel(col)
    plt.show()

In [None]:
def bayesian_target_encode(X_train, y_train, X_test, column_name, k=20):
    """
    Add a smoothed (Bayesian) target encoding of a categorical column.

    For a category with ``n`` training rows and mean target ``m`` the encoded
    value is ``w * m + (1 - w) * global_mean`` with ``w = n / (n + k)``.
    Statistics come from the training split only. Rows whose category has no
    training statistic (unseen in train, or missing) get the global mean.

    Args:
        X_train (pd.DataFrame): Training features DataFrame.
        y_train (pd.Series): Training target Series (e.g., 'log_price'),
                             indexed like X_train.
        X_test (pd.DataFrame): Test features DataFrame.
        column_name (str): The name of the categorical column to encode
                           (e.g., 'property_type').
        k (int): The smoothing factor. A higher value requires more data for a
                 category's mean to be trusted.

    Returns:
        tuple: Copies of X_train and X_test with the new
               ``<column_name>_encoded`` column. The inputs are not modified.
    """
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

    print(f"--- Starting Bayesian Target Encoding for '{column_name}' ---")

    target_name = y_train.name

    global_mean = y_train.mean()
    print(f"Global Mean ('{target_name}'): {global_mean:.4f}")
    print(f"Smoothing Factor (k): {k}")

    # Per-category mean and row count of the target; rows with a missing
    # category are left out of the groups.
    agg = y_train.groupby(X_train_encoded[column_name]).agg(['mean', 'count'])

    weight = agg['count'] / (agg['count'] + k)
    encoding_map = weight * agg['mean'] + (1 - weight) * global_mean

    new_col_name = f'{column_name}_encoded'
    # Assign the filled result back: fillna(inplace=True) on a column selection
    # does not update the frame under Copy-on-Write. The training split is
    # filled as well, so rows with a missing category do not keep NaN.
    X_train_encoded[new_col_name] = X_train_encoded[column_name].map(encoding_map).fillna(global_mean)
    X_test_encoded[new_col_name] = X_test_encoded[column_name].map(encoding_map).fillna(global_mean)

    print(f"Bayesian encoding complete. New column created: '{new_col_name}'")

    return X_train_encoded, X_test_encoded

In [None]:
# Encode property_type with k=10 instead of the function default of 20.
X_train, X_test = bayesian_target_encode(X_train, y_train, X_test, column_name='property_type', k=10)

In [None]:
# Preview the first five training rows, including property_type_encoded.
X_train.head(5)

In [None]:
# Map the False/True cleaning_fee values to 0/1 in both splits; any other
# value becomes NaN.
X_train['cleaning_fee'] = X_train['cleaning_fee'].map({False: 0, True: 1})
X_test['cleaning_fee'] = X_test['cleaning_fee'].map({False: 0, True: 1})

print("Converted 'cleaning_fee' to numeric.")
display(X_train[['cleaning_fee']].head())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def apply_tfidf_to_name(X_train, X_test, max_features=100):
    """
    Replace the 'name' column with TF-IDF features.

    The vectorizer is fitted on the training names only and then applied to
    the test names, so both splits share one vocabulary and one set of IDF
    weights. Missing names are treated as the literal text 'missing'.

    Args:
        X_train (pd.DataFrame): Training features DataFrame with a 'name' column.
        X_test (pd.DataFrame): Test features DataFrame with a 'name' column.
        max_features (int): The maximum number of top TF-IDF features to create.
                            This helps control the dimensionality of the output.

    Returns:
        tuple: A tuple containing:
               - X_train_processed (pd.DataFrame): The training DataFrame with
                 'name' replaced by ``tfidf_<term>`` columns.
               - X_test_processed (pd.DataFrame): The test DataFrame with the
                 same ``tfidf_<term>`` columns.
    """
    X_train_processed = X_train.copy()
    X_test_processed = X_test.copy()

    print("Applying TF-IDF to 'name' column")

    # Work on filled copies of the text: fillna(inplace=True) on a column
    # selection does not update the frame under Copy-on-Write, and the 'name'
    # column is dropped from the result anyway.
    train_names = X_train_processed['name'].fillna('missing')
    test_names = X_test_processed['name'].fillna('missing')

    # stop_words='english': removes common English words (like 'a', 'the', 'in').
    # max_features: limits the number of output columns to the top N terms.
    # ngram_range=(1, 2): considers single words (unigrams) and two-word phrases (bigrams).
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=max_features,
        ngram_range=(1, 2)
    )

    X_train_tfidf = vectorizer.fit_transform(train_names)
    # Transform only: the test split must not influence vocabulary or weights.
    X_test_tfidf = vectorizer.transform(test_names)

    # Shared column labels, computed once for both frames.
    tfidf_columns = [f'tfidf_{word}' for word in vectorizer.get_feature_names_out()]

    tfidf_train_df = pd.DataFrame(
        X_train_tfidf.toarray(),
        columns=tfidf_columns,
        index=X_train_processed.index
    )
    tfidf_test_df = pd.DataFrame(
        X_test_tfidf.toarray(),
        columns=tfidf_columns,
        index=X_test_processed.index
    )

    X_train_processed = pd.concat([X_train_processed.drop(columns=['name']), tfidf_train_df], axis=1)
    X_test_processed = pd.concat([X_test_processed.drop(columns=['name']), tfidf_test_df], axis=1)

    print(f"TF-IDF complete. Added {len(tfidf_columns)} new features.")

    return X_train_processed, X_test_processed

In [None]:
# Limit the TF-IDF vocabulary for listing names to 10 terms.
X_train, X_test = apply_tfidf_to_name(X_train, X_test, max_features=10)

In [None]:
# Preview the first five training rows, including the tfidf_* columns.
X_train.head(5)

In [None]:
# The 1/0 keys make the mapping a no-op on already-converted values, so
# re-running this cell no longer turns the whole column into NaN.
X_train['instant_bookable'] = X_train['instant_bookable'].map({'f': 0, 't': 1, 0: 0, 1: 1})
X_test['instant_bookable'] = X_test['instant_bookable'].map({'f': 0, 't': 1, 0: 0, 1: 1})

print("Converted 'instant_bookable' to numeric.")
display(X_train[['instant_bookable']].head())

In [None]:
# Column dtypes and non-null counts after the boolean conversions.
X_train.info()

In [None]:
!pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def apply_vader_sentiment(X_train, X_test, column_name):
    """
    Score a text column with VADER and add the four polarity scores
    (neg, neu, pos, compound) as ``vader_<score>_<column_name>`` columns.

    Missing text is replaced by an empty string in the returned frames before
    scoring.

    Args:
        X_train (pd.DataFrame): Training features DataFrame.
        X_test (pd.DataFrame): Test features DataFrame.
        column_name (str): The name of the text column to analyze (e.g., 'description').

    Returns:
        tuple: A tuple containing:
               - X_train_processed (pd.DataFrame): The training DataFrame with new sentiment features.
               - X_test_processed (pd.DataFrame): The test DataFrame with new sentiment features.
    """
    X_train_processed = X_train.copy()
    X_test_processed = X_test.copy()

    print(f"--- Applying VADER Sentiment Analysis to '{column_name}' column ---")

    analyzer = SentimentIntensityAnalyzer()
    # Keys of the dictionary returned by polarity_scores, in output order.
    score_keys = ['neg', 'neu', 'pos', 'compound']
    sentiment_columns = [f'vader_{key}_{column_name}' for key in score_keys]

    def _with_sentiment(frame):
        """Return ``frame`` with filled text and the sentiment columns appended."""
        # Assign back rather than fillna(inplace=True) on a column selection,
        # which does not update the frame under Copy-on-Write.
        frame[column_name] = frame[column_name].fillna('')
        # One DataFrame built from all score dictionaries at once, instead of
        # one pd.Series per row.
        records = [analyzer.polarity_scores(text) for text in frame[column_name]]
        sentiments = pd.DataFrame(records, index=frame.index, columns=score_keys)
        sentiments.columns = sentiment_columns
        return pd.concat([frame, sentiments], axis=1)

    X_train_processed = _with_sentiment(X_train_processed)
    X_test_processed = _with_sentiment(X_test_processed)

    print(f"✅ VADER analysis complete. Added {len(sentiment_columns)} new sentiment features.")

    return X_train_processed, X_test_processed

In [None]:
# Score the listing descriptions.
X_train, X_test = apply_vader_sentiment(X_train, X_test, column_name='description')

In [None]:
# Preview the first five training rows, including the vader_* columns.
X_train.head(5)

In [None]:
# Remove the raw description text from both splits; its sentiment scores
# were added as vader_* columns in an earlier cell.
X_train = X_train.drop(columns=['description'])
X_test = X_test.drop(columns=['description'])

print("Removed 'description' column.")
display(X_train.head())

In [None]:
# Function to clean the amenities list
def clean_amenities(amenities):
    """
    Strip the curly braces and double quotes from a raw amenities string,
    e.g. '{TV,"Wireless Internet"}' -> 'TV,Wireless Internet'.

    Args:
        amenities: Raw amenities value; normally a string.

    Returns:
        str: The cleaned, comma-separated string. Non-string input (None, NaN)
        yields '' instead of raising.
    """
    if not isinstance(amenities, str):
        return ''
    # Delete all three characters in a single pass.
    return amenities.translate(str.maketrans('', '', '{}"'))

# Apply the cleaning function to the amenities column.
# The result is stored under a new name; later cells drop the raw column and
# rename this one back to 'amenities'.
X_train['cleaned_amenities'] = X_train['amenities'].apply(clean_amenities)
X_test['cleaned_amenities'] = X_test['amenities'].apply(clean_amenities)

In [None]:
# Preview the first five training rows, including cleaned_amenities.
X_train.head(5)

In [None]:
# Remove the raw amenities column from both splits.
# NOTE(review): re-running this cell after the rename in the next cell would drop the cleaned column instead.
X_train = X_train.drop(columns=['amenities'])
X_test = X_test.drop(columns=['amenities'])

print("Removed 'amenities' column.")
display(X_train.head())

In [None]:
# Give the cleaned column the original name, so later cells read 'amenities'.
X_train = X_train.rename(columns={'cleaned_amenities': 'amenities'})
X_test = X_test.rename(columns={'cleaned_amenities': 'amenities'})

print("Renamed 'cleaned_amenities' to 'amenities'.")
display(X_train.head())

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def cluster_amenities_with_bert(X_train, X_test, max_k=15, silhouette_sample_size=None):
    """
    Embed the 'amenities' text with a sentence-transformer model, choose the
    number of K-Means clusters by silhouette score on the training embeddings,
    and add the cluster id as 'amenity_cluster'.

    Args:
        X_train (pd.DataFrame): Training features with an 'amenities' column.
        X_test (pd.DataFrame): Test features with an 'amenities' column.
        max_k (int): The maximum number of clusters to test for optimality;
                     candidates are 2..max_k.
        silhouette_sample_size (int | None): If given, the silhouette score is
            computed on a random sample of this many rows (seeded), which
            avoids the quadratic cost on large training sets. None (default)
            scores all rows.

    Returns:
        tuple: A tuple containing the transformed X_train and X_test DataFrames
               with the new 'amenity_cluster' feature.
    """
    X_train_processed = X_train.copy()
    X_test_processed = X_test.copy()

    model = SentenceTransformer('all-MiniLM-L6-v2')

    train_amenities = X_train_processed['amenities'].fillna('').tolist()
    test_amenities = X_test_processed['amenities'].fillna('').tolist()

    train_embeddings = model.encode(train_amenities, show_progress_bar=True)

    print(f"\nFinding optimal k (up to max_k={max_k})")
    best_k, best_score, best_model = None, float('-inf'), None

    for k in range(2, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(train_embeddings)
        score = silhouette_score(
            train_embeddings, kmeans.labels_,
            sample_size=silhouette_sample_size, random_state=42,
        )
        print(f"  - k={k}, Silhouette Score: {score:.4f}")
        # Strict comparison keeps the smallest k when scores tie.
        if score > best_score:
            best_k, best_score, best_model = k, score, kmeans

    if best_model is None:
        # No candidate was evaluated (max_k < 2): fall back to a fixed k.
        print("Could not determine optimal k. Defaulting to 3.")
        optimal_k = 3
        print(f"\nOptimal number of clusters found: k = {optimal_k}")
        print(f"Training final K-Means model with k={optimal_k}")
        final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        final_kmeans.fit(train_embeddings)
    else:
        optimal_k = best_k
        print(f"\nOptimal number of clusters found: k = {optimal_k}")
        # The winning model was fitted with the same data, seed and settings a
        # refit would use, so it is reused instead of being trained again.
        print(f"Reusing the K-Means model already fitted with k={optimal_k}")
        final_kmeans = best_model

    X_train_processed['amenity_cluster'] = final_kmeans.labels_

    print("Generating embeddings for test data and predicting clusters")
    test_embeddings = model.encode(test_amenities, show_progress_bar=True)
    X_test_processed['amenity_cluster'] = final_kmeans.predict(test_embeddings)

    print("\n Amenity clustering complete.")
    return X_train_processed, X_test_processed

# Try k = 2..5 and attach the resulting cluster id to both splits.
X_train, X_test = cluster_amenities_with_bert(X_train, X_test, max_k=5)

In [None]:
from scipy.stats import f_oneway
from collections import defaultdict

def get_top_amenities_by_anova(X_train, y_train, top_n=10):
    """
    Rank amenities by how strongly their presence separates the target,
    using a one-way ANOVA F-test (listings with vs. without the amenity).

    Amenity names are lower-cased and whitespace around the separating commas
    is ignored, so 'TV, Wifi' and 'TV,Wifi' yield the same amenities. An
    amenity is tested only when both groups contain more than one listing.

    Args:
        X_train (pd.DataFrame): Training features with an 'amenities' column.
        y_train (pd.Series): Training target (e.g., 'log_price').
        top_n (int): The number of top amenities to return.

    Returns:
        pd.DataFrame: A DataFrame with columns for 'f_statistic' and 'p_value',
                      indexed by amenity name, sorted by F-statistic
                      (descending) and limited to ``top_n`` rows.
    """
    print("--- Finding Top Amenities using ANOVA ---")

    # Only the amenities column is needed next to the target.
    train_full = pd.concat([X_train[['amenities']], y_train], axis=1)
    target_name = y_train.name

    amenities_series = (
        train_full['amenities']
        .fillna('')
        .str.lower()
        .str.strip()
        # Normalise ", " / " ," to "," so the same amenity is not split into
        # separate indicator columns that differ only by a space.
        .str.replace(r'\s*,\s*', ',', regex=True)
    )
    amenities_df = amenities_series.str.get_dummies(sep=',')

    anova_results = {}

    for amenity in amenities_df.columns:
        has_amenity = amenities_df[amenity] == 1
        group_with_amenity = train_full.loc[has_amenity, target_name]
        group_without_amenity = train_full.loc[~has_amenity, target_name]

        if len(group_with_amenity) > 1 and len(group_without_amenity) > 1:
            f_stat, p_value = f_oneway(group_with_amenity, group_without_amenity)
            anova_results[amenity] = {'f_statistic': f_stat, 'p_value': p_value}

    if not anova_results:
        return pd.DataFrame(columns=['f_statistic', 'p_value'])

    results_df = pd.DataFrame.from_dict(anova_results, orient='index')
    results_df = results_df.sort_values(by='f_statistic', ascending=False)

    print(f"ANOVA complete. Found {len(results_df)} testable amenities.")

    return results_df.head(top_n)

In [None]:
# Despite the name, this holds only the 50 highest-ranked amenities (top_n=50).
all_ranked_amenities = get_top_amenities_by_anova(X_train, y_train, top_n=50)

print("\n Top 50 Most Influential Amenities (by F-statistic)")
print(all_ranked_amenities)

# Keep the rows whose p-value is below the chosen significance level.
significance_level = 0.05
significant_amenities = all_ranked_amenities[all_ranked_amenities['p_value'] < significance_level]

print(f"\n Statistically Significant Amenities (p < {significance_level})")
if significant_amenities.empty:
    print("No statistically significant amenities found at this level.")
else:
    print(significant_amenities)

In [None]:
# Create a mapping dictionary: unified amenity name -> spellings/variants that
# should be folded into it. A variant may appear under several unified names
# (e.g. "Washer / Dryer" counts as both a washer and a dryer).
amenities_mapping = {
    "TV": ["Cable TV", "TV"],
    "Smart lock": ["Smart lock", "Smartlock"],
    "Doorman": ["Doorman", "Doorman Entry"],
    "Firm mattress": ["Firm matress", "Firm mattress"],
    "Elevator": ["Elevator", "Elevator in building"],
    "Grab-rails for shower and toilet": ["Grab-rails for shower and toilet", "Fixed grab bars for shower & toilet"],
    "Wide clearance to shower and toilet": ["Wide clearance to shower & toilet", "Wide clearance to shower and toilet"],
    "Washer": ["Washer", "Washer / Dryer"],
    "Dryer": ["Dryer", "Washer / Dryer"],
    "Wide clearance": ["Wide clearance to bed", "Wide doorway", "Wide entryway", "Wide hallway clearance"],
    "Internet": ["Wireless Internet", "Internet", "Ethernet connection", "Pocket wifi"],
    "Flat smooth pathway to front door": ["Flat smooth pathway to front door", "smooth pathway to front door"]
}

# Function to map amenities to their unified categories
def map_amenities(amenities_list):
    """
    Replace amenity variants in a comma-separated string by their unified names.

    Args:
        amenities_list (str): Comma-separated amenities (despite the parameter
            name this is a string, not a list).

    Returns:
        str: The unique unified amenities, sorted and joined with ', '.
        Amenities without a mapping are kept as they are. Non-string input
        (None, NaN) and empty entries yield ''.
    """
    if not isinstance(amenities_list, str):
        return ''

    mapped_amenities = set()
    for raw_amenity in amenities_list.split(','):
        amenity = raw_amenity.strip()
        if not amenity:
            continue
        # Collect every unified name listing this variant. Stopping at the
        # first match would turn "Washer / Dryer" into "Washer" only.
        unified_names = [key for key, variants in amenities_mapping.items() if amenity in variants]
        mapped_amenities.update(unified_names or [amenity])
    return ', '.join(sorted(mapped_amenities))

X_train['amenities'] = X_train['amenities'].apply(map_amenities)
X_test['amenities'] = X_test['amenities'].apply(map_amenities)

# map_amenities joins with ', ', so every entry is stripped after splitting;
# otherwise all but the first would carry a leading space. Empty entries
# (from listings without amenities) are dropped.
X_train['amenities_list'] = X_train['amenities'].apply(
    lambda x: [item.strip() for item in x.split(',') if item.strip()]
)
X_test['amenities_list'] = X_test['amenities'].apply(
    lambda x: [item.strip() for item in x.split(',') if item.strip()]
)

all_amenities = [item for sublist in X_train['amenities_list'] for item in sublist]

unique_amenities = sorted(set(all_amenities))

for amenity in unique_amenities:
    print(amenity)

In [None]:
# Preview the first five training rows, including amenities_list.
X_train.head(5)

In [None]:
def encode_amenities_by_category(X_train, X_test, categories):
    """
    Add one count column per user-defined category, holding the number of a
    listing's amenities that belong to that category.

    Matching is case-insensitive and ignores surrounding whitespace. If an
    amenity is listed under several categories, the category that comes last
    in ``categories`` wins.

    Args:
        X_train (pd.DataFrame): Training features with an 'amenities' column
                                (comma-separated string).
        X_test (pd.DataFrame): Test features with an 'amenities' column.
        categories (dict): A dictionary where keys are category names and
                           values are lists of amenities in that category.

    Returns:
        tuple: Copies of X_train and X_test with one integer column per
               category appended. The inputs are not modified.
    """
    X_train_processed = X_train.copy()
    X_test_processed = X_test.copy()

    print("--- Encoding Amenities into Categories ---")

    # Reverse lookup: normalised amenity name -> category.
    amenity_to_category_map = {
        amenity.lower().strip(): category
        for category, amenities_list in categories.items()
        for amenity in amenities_list
    }
    category_names = list(categories.keys())

    def _count_frame(amenities_col):
        """Build the per-category count frame for one amenities column."""
        rows = []
        for amenities_str in amenities_col:
            category_counts = dict.fromkeys(category_names, 0)
            # Missing or empty amenities leave every count at zero.
            if isinstance(amenities_str, str) and amenities_str != '':
                for amenity in amenities_str.split(','):
                    cleaned_amenity = amenity.lower().strip()
                    if cleaned_amenity in amenity_to_category_map:
                        category_counts[amenity_to_category_map[cleaned_amenity]] += 1
            rows.append(category_counts)
        # One DataFrame built from all rows at once (instead of a pd.Series
        # per row); this also yields the right columns for an empty input.
        counts = pd.DataFrame(rows, index=amenities_col.index, columns=category_names)
        return counts.astype('int64')

    train_counts_df = _count_frame(X_train_processed['amenities'])
    test_counts_df = _count_frame(X_test_processed['amenities'])

    X_train_processed = pd.concat([X_train_processed, train_counts_df], axis=1)
    X_test_processed = pd.concat([X_test_processed, test_counts_df], axis=1)

    print(f"Amenity encoding complete. Added {len(categories)} new feature columns.")

    return X_train_processed, X_test_processed

In [None]:
# Category name -> amenities counted under it by encode_amenities_by_category.
# NOTE(review): "Accessible-height toilet" appears under both "Safety & Accessibility"
# and "Bathroom Amenities"; the encoder keeps one category per amenity, so the
# later entry wins — confirm which category is intended.
categories = {
    "Safety & Accessibility": [
        "24-hour check-in", "Carbon monoxide detector", "Disabled parking spot", "Doorman",
        "Fire extinguisher", "First aid kit", "Safety card", "Smoke detector", "Stair gates",
        "Step-free access", "Well-lit path to entrance", "Wheelchair accessible",
        "Flat smooth pathway to front door", "Accessible-height bed", "Accessible-height toilet",
        "Wide clearance", "Wide clearance to shower and toilet", "Path to entrance lit at night"
    ],
    "Entertainment & Electronics": [
        "TV", "Game console", "Internet", "Laptop friendly workspace"
    ],
    "Basic Amenities": [
        "Air conditioning", "Heating", "Essentials", "Bed linens",
        "Extra pillows and blankets", "Iron", "Hangers", "Firm mattress", "Hot water kettle"
    ],
    "Family & Kid-Friendly": [
        "Babysitter recommendations", "Children’s books and toys", "Children’s dinnerware",
        "Crib", "High chair", "Pack ’n Play/travel crib", "Outlet covers",
        "Baby bath", "Baby monitor", "Table corner guards", "Family/kid friendly",
        "Changing table"
    ],
    "Kitchen & Dining": [
        "Coffee maker", "Cooking basics", "Dishes and silverware", "Dishwasher",
        "Microwave", "Oven", "Refrigerator", "Stove", "Water kettle", "BBQ grill", "Kitchen",
        "Dryer", "Washer", "Breakfast"
    ],
    "Health & Fitness": [
        "Gym", "Air purifier"
    ],
    "Pets": [
        "Cat(s)", "Dog(s)", "Other pet(s)", "Pets allowed", "Pets live on this property"
    ],
    "Parking": [
        "Free parking on premises", "Free parking on street", "EV charger", "Paid parking off premises"
    ],
    "Outdoor & Leisure": [
        "Beach essentials", "Beachfront", "Garden or backyard", "Lake access",
        "Patio or balcony", "Pool", "Ski in/Ski out", "Hot tub", "Waterfront"
    ],
    "Security & Entry": [
        "Smart lock", "Keypad", "Lock on bedroom door", "Lockbox",
        "Self Check-In", "Host greets you"
    ],
    "Bathroom Amenities": [
        "Hand soap", "Shampoo", "Toilet paper", "Bath towel", "Bathtub",
        "Bathtub with shower chair", "Handheld shower head", "Roll-in shower with chair",
        "Shower chair", "Fixed grab bars for shower & toilet", "Hair dryer", "Hot water",
        "Accessible-height toilet", "Body soap", "Grab-rails for shower and toilet",
        "Hand or paper towel"
    ],
    "Special Features": [
        "Fireplace guards", "Indoor fireplace", "Private bathroom", "Private entrance",
        "Private living room", "Single level home", "Suitable for events",
        "Long term stays allowed", "Luggage dropoff allowed", "Smoking allowed",
        "Other", "Elevator"
    ],
    "Miscellaneous": [
        "Room-darkening shades", "Window guards",
        "Smooth pathway to front door", "Cleaning before checkout",
        "Buzzer/wireless intercom", "translation missing: en.hosting_amenity_50",
        "translation missing: en.hosting_amenity_49", "Ground floor access"
    ],
    # Kept as an (always zero) column so the feature set has a fixed shape.
    "Uncategorized": []
}

In [None]:
# Append one count column per amenity category to both splits.
X_train, X_test = encode_amenities_by_category(X_train, X_test, categories=categories)

In [None]:
# Preview the first five training rows, including the category count columns.
X_train.head(5)

In [None]:
def create_individual_amenity_flags(X_train, X_test, individual_amenities):
    """
    Creates new binary (1/0) columns for a specified list of individual amenities.

    Each amenity is matched case-insensitively as a *literal* phrase anywhere in
    the raw 'amenities' string. The phrase must not be directly preceded or
    followed by a word character, so "tv" does not match inside "tvs". A short
    phrase still matches inside a longer amenity that contains it as whole
    words: "tv" also flags rows listing "cable tv", and "dryer" also flags rows
    listing "hair dryer".

    Args:
        X_train (pd.DataFrame): Training features with an 'amenities' column.
        X_test (pd.DataFrame): Test features with an 'amenities' column.
        individual_amenities (list): A list of specific amenities to create
                                     binary flag columns for.

    Returns:
        tuple: A tuple containing the transformed X_train and X_test DataFrames.
               The inputs are not modified; one 'has_<amenity>' column is added
               per requested amenity (spaces, '/' and '-' become '_').
    """
    import re  # local import so this cell does not depend on another cell's imports

    X_train_processed = X_train.copy()
    X_test_processed = X_test.copy()

    print("\n Creating Binary Features for Individual Amenities")

    # Missing amenity strings become '' so they simply never match.
    train_amenities_lower = X_train_processed['amenities'].str.lower().fillna('')
    test_amenities_lower = X_test_processed['amenities'].str.lower().fillna('')

    for amenity in individual_amenities:
        clean_amenity_name = amenity.replace(" ", "_").replace("/", "_").replace("-", "_")
        col_name = f'has_{clean_amenity_name}'

        # re.escape makes the amenity a literal: without it, names such as
        # "cat(s)" are parsed as regex syntax and match the wrong text.
        # (?:^|\W) / (?:$|\W) behave like \b for phrases that start and end with
        # a word character, but also work when the phrase starts or ends with
        # punctuation, where \b could never match.
        search_term = r'(?:^|\W)' + re.escape(amenity.lower()) + r'(?:$|\W)'
        X_train_processed[col_name] = train_amenities_lower.str.contains(search_term, regex=True).astype(int)
        X_test_processed[col_name] = test_amenities_lower.str.contains(search_term, regex=True).astype(int)

    print(f"✅ Added {len(individual_amenities)} new binary feature columns.")
    return X_train_processed, X_test_processed

# Amenities that each get their own has_<name> flag column. Matching is done
# on word boundaries inside the whole lower-cased 'amenities' string, so a short
# phrase also fires on longer amenities that contain it: "tv" is set for rows
# listing "cable tv", and "dryer" is set for rows listing "hair dryer".
individual_amenities_list = [
    "family/kid friendly", "tv", "cable tv", "dryer", "indoor fireplace",
    "washer", "lock on bedroom door", "doorman", "hair dryer",
    "suitable for events", "gym", "private entrance", "24-hour check-in",
    "heating", "elevator"
]
# Rebinds X_train / X_test; re-running the cell recomputes the same flag columns.
X_train, X_test = create_individual_amenity_flags(X_train, X_test, individual_amenities_list)

In [None]:
# Inspect dtypes and non-null counts now that the has_* flag columns exist.
X_train.info()

In [None]:
# Drop the raw text columns from both splits; presumably they are superseded by
# the engineered features built in earlier cells (TODO confirm for 'amenities_list').
# Not idempotent: a second run raises KeyError because the columns are already gone.
cols_to_remove = ['neighbourhood', 'amenities', 'amenities_list']
X_train = X_train.drop(columns=cols_to_remove)
X_test = X_test.drop(columns=cols_to_remove)

display(X_train.head())

In [None]:
categorical_cols = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city']

# Pin each column's category set to what the TRAINING split contains before
# encoding. Calling get_dummies on raw strings derives the categories from each
# frame separately, so a value seen in only one split yields different dummy
# columns, and drop_first can drop a different baseline in train and test.
# With a shared CategoricalDtype both frames get the same dummy columns in the
# same order; test values never seen in training become NaN and therefore
# encode as all zeros (the dropped baseline).
category_dtypes = {
    col: X_train[col].astype('category').dtype for col in categorical_cols
}

X_train = pd.get_dummies(X_train.astype(category_dtypes), columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test.astype(category_dtypes), columns=categorical_cols, drop_first=True)

# Every later step selects/drops test columns by the training column names.
assert X_train.columns.equals(X_test.columns), "train/test columns diverged after one-hot encoding"

print("One-hot encoding complete.")
display(X_train.head())

In [None]:
# Inspect dtypes and non-null counts after one-hot encoding.
X_train.info()

In [None]:
def remove_near_zero_variance(X_train, X_test, threshold=0.01):
    """
    Drop numeric features whose training-set variance is below `threshold`.

    Variance is measured on the numeric columns of X_train only; the columns
    found there are then removed from both frames. Neither input is modified.

    Args:
        X_train (pd.DataFrame): The training features DataFrame.
        X_test (pd.DataFrame): The test features DataFrame.
        threshold (float): Columns with variance strictly below this value
                           are removed.

    Returns:
        tuple: (X_train, X_test) as new DataFrames without the low-variance
               columns (plain copies when nothing qualifies).
    """
    print("Removing Near-Zero Variance Features")

    # Only numeric dtypes take part; everything else is passed through untouched.
    train_variances = X_train.select_dtypes(include=np.number).var()
    doomed = [name for name, value in train_variances.items() if value < threshold]

    if len(doomed) == 0:
        print("No near-zero variance features found.")
        return X_train.copy(), X_test.copy()

    print(f"Found {len(doomed)} near-zero variance features to remove:")
    for name in doomed:
        print(f"  - {name} (Variance: {train_variances[name]:.4f})")

    # drop() without inplace returns fresh frames, leaving the caller's data intact.
    train_out = X_train.drop(columns=doomed)
    test_out = X_test.drop(columns=doomed)

    print("\n Removal complete.")

    return train_out, test_out

In [None]:
# Drop numeric columns whose training-set variance is below 0.01 from both splits.
X_train, X_test = remove_near_zero_variance(X_train, X_test, threshold=0.01)

In [None]:
# Inspect the remaining columns after the variance filter.
X_train.info()

In [None]:
def remove_highly_correlated_features(X_train, X_test, threshold=0.9):
    """
    Drop one feature out of every highly correlated pair.

    Absolute pairwise correlations are computed on the numeric columns of
    X_train. Looking only above the diagonal of that matrix, a column is
    removed when its correlation with any earlier column exceeds `threshold`;
    the same columns are then removed from X_test. Neither input is modified.

    Args:
        X_train (pd.DataFrame): The training features DataFrame.
        X_test (pd.DataFrame): The test features DataFrame.
        threshold (float): Absolute-correlation cut-off; a pair strictly above
                           it loses its later column.

    Returns:
        tuple: (X_train, X_test) as new DataFrames without the removed columns
               (plain copies when nothing qualifies).
    """
    print("Removing Highly Correlated Features")

    abs_corr = X_train.select_dtypes(include=np.number).corr().abs()

    # Keep only entries strictly above the diagonal so each pair is seen once
    # and a column is never compared with itself.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(above_diagonal)

    # Map each column to be removed -> the earlier columns it is too close to.
    partners = {}
    for name in upper.columns:
        column = upper[name]
        hits = column[column > threshold].index.tolist()
        if hits:
            partners[name] = hits

    if not partners:
        print("No highly correlated features found to remove.")
        return X_train.copy(), X_test.copy()

    print(f"Found {len(partners)} features to remove to break highly correlated pairs:")
    for name, hits in partners.items():
        print(f"  - Removing '{name}' (highly correlated with {hits})")

    to_drop = list(partners)
    train_out = X_train.drop(columns=to_drop)
    test_out = X_test.drop(columns=to_drop)

    print("\n Removal complete.")

    return train_out, test_out

In [None]:
# Drop the later column of every numeric pair whose absolute training correlation exceeds 0.9.
X_train, X_test = remove_highly_correlated_features(X_train, X_test, threshold=0.9)

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, KFold

def _rfecv_feature_counts(n_total, min_features_to_select, step, n_scores):
    """Return the feature count evaluated at each RFECV step, in ascending order."""
    # A fractional step means "this share of all features per round".
    if 0.0 < step < 1.0:
        step = max(1, step * n_total)
    step = int(step)

    # Elimination goes n_total, n_total - step, ... and always finishes exactly
    # at min_features_to_select; scores are reported from fewest to most features.
    counts = list(range(n_total, min_features_to_select, -step))
    counts.append(min_features_to_select)
    counts.reverse()

    if len(counts) != n_scores:
        # Unexpected layout: fall back to one score per feature count.
        return list(range(min_features_to_select, n_scores + min_features_to_select))
    return counts


def perform_rfecv(X_train, y_train, X_test, estimator, min_features_to_select=1, step=1, cv=5):
    """
    Performs Recursive Feature Elimination with Cross-Validation (RFE-CV) to
    select the optimal number of features.

    The CV curve is plotted against the number of features actually evaluated
    at each elimination round, so the x-axis stays correct when `step` > 1.

    Args:
        X_train (pd.DataFrame): The training features DataFrame.
        y_train (pd.Series): The training target variable.
        X_test (pd.DataFrame): The test features DataFrame.
        estimator: The supervised learning estimator with a 'fit' method and a
                   'feature_importances_' or 'coef_' attribute.
        min_features_to_select (int): The minimum number of features to select.
        step (int): The number of features to remove at each iteration.
        cv (int): The number of folds for cross-validation.

    Returns:
        tuple: A tuple containing:
               - X_train_selected (pd.DataFrame): Training data with only selected features.
               - X_test_selected (pd.DataFrame): Test data with only selected features.
               - selected_features (list): The list of names of the selected features.
    """
    print("Performing RFE with Cross-Validation")

    # Shuffled, seeded folds so repeated runs evaluate the same splits.
    kfold = KFold(n_splits=cv, shuffle=True, random_state=42)

    rfecv = RFECV(
        estimator=estimator,
        step=step,
        cv=kfold,
        scoring='neg_mean_squared_error',
        min_features_to_select=min_features_to_select,
        n_jobs=-1
    )

    print("Fitting RFECV")
    rfecv.fit(X_train, y_train)
    print("Fitting complete.")

    optimal_num_features = rfecv.n_features_
    print(f"\nOptimal number of features found: {optimal_num_features}")

    selected_features = X_train.columns[rfecv.support_].tolist()
    print("\nSelected features:")
    for f in selected_features:
        print(f"  - {f}")

    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    mean_scores = rfecv.cv_results_['mean_test_score']
    # Newer scikit-learn reports the evaluated feature counts directly; older
    # releases do not, so rebuild them from the elimination schedule.
    if 'n_features' in rfecv.cv_results_:
        feature_counts = rfecv.cv_results_['n_features']
    else:
        feature_counts = _rfecv_feature_counts(
            X_train.shape[1], min_features_to_select, step, len(mean_scores)
        )

    plt.figure(figsize=(10, 6))
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (Negative MSE)")
    plt.plot(feature_counts, mean_scores)
    plt.axvline(x=optimal_num_features, color='r', linestyle='--', label=f'Optimal features = {optimal_num_features}')
    plt.title("RFE-CV Performance")
    plt.legend()
    plt.show()

    return X_train_selected, X_test_selected, selected_features

In [None]:
# Ranking estimator for RFE: a 30-tree seeded random forest
# (presumably kept small to bound RFECV runtime — TODO confirm).
estimator = RandomForestRegressor(n_estimators=30, random_state=42, n_jobs=-1)

# step=5 removes five features per elimination round; cv=3 uses three folds.
X_train_final, X_test_final, final_features = perform_rfecv(
    X_train, y_train, X_test, estimator=estimator, step=5, cv=3
)

print("\n--- Shape of X_train before RFE-CV:", X_train.shape)
print("--- Shape of X_train after RFE-CV:", X_train_final.shape)

In [None]:
# Persist the selected feature matrices to the mounted Drive folder.
# save_directory must keep its trailing '/' because paths are built by string concatenation.
save_directory = '/content/drive/MyDrive/homestay_price/'

# index=False: the row index is not written, so the saved files carry no key
# for re-aligning rows with y_train / y_test after reloading.
X_train_final.to_csv(save_directory + 'X_train_final.csv', index=False)

X_test_final.to_csv(save_directory + 'X_test_final.csv', index=False)

print(f"X_train_final and X_test_final saved successfully to {save_directory}")

In [None]:
# Inspect the columns that survived RFE-CV.
X_train_final.info()

In [None]:
!pip install optuna

In [None]:
import optuna
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Ridge


def objective_rf(trial, X, y):
    """
    Optuna objective for the Random Forest regressor.

    Samples one hyperparameter set from `trial`, evaluates it with seeded
    3-fold cross-validation on (X, y), and returns the mean negative MSE
    (higher is better, so the study should maximize it).
    """
    forest = RandomForestRegressor(
        random_state=42,
        n_jobs=-1,
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 5, 50),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        # Fraction of features considered at each split.
        max_features=trial.suggest_float('max_features', 0.1, 1.0),
    )

    folds = KFold(n_splits=3, shuffle=True, random_state=42)
    fold_scores = cross_val_score(forest, X, y, cv=folds, scoring='neg_mean_squared_error')

    return fold_scores.mean()

def objective_xgb(trial, X, y):
    """
    Optuna objective for the XGBoost regressor.

    Samples one hyperparameter set from `trial`, evaluates it with seeded
    3-fold cross-validation on (X, y), and returns the mean negative MSE
    (higher is better, so the study should maximize it).
    """
    booster = xgb.XGBRegressor(
        random_state=42,
        n_jobs=-1,
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
    )

    folds = KFold(n_splits=3, shuffle=True, random_state=42)
    fold_scores = cross_val_score(booster, X, y, cv=folds, scoring='neg_mean_squared_error')

    return fold_scores.mean()


def tune_models(X_train, y_train, n_trials=50, seed=42):
    """
    Runs Optuna studies to find the best hyperparameters for RF and XGB.

    Args:
        X_train (pd.DataFrame): Training features passed to both objectives.
        y_train (pd.Series): Training target passed to both objectives.
        n_trials (int): Number of Optuna trials per study.
        seed (int | None): Seed for each study's TPE sampler so the sequence of
                           suggested hyperparameters is repeatable. Pass None
                           for an unseeded sampler.

    Returns:
        tuple: (best_params_rf, best_params_xgb), the best-scoring parameter
               dict of each study (both maximize mean negative MSE).
    """

    print("Tuning RandomForest")
    # Each study gets its own seeded sampler; the models and CV folds are
    # already seeded, so without this the sampler is the one remaining source
    # of run-to-run variation in what gets tried.
    study_rf = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study_rf.optimize(lambda trial: objective_rf(trial, X_train, y_train), n_trials=n_trials)
    best_params_rf = study_rf.best_params
    print(f"Best RF Params: {best_params_rf}")

    print("\n Tuning XGBoost")
    study_xgb = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study_xgb.optimize(lambda trial: objective_xgb(trial, X_train, y_train), n_trials=n_trials)
    best_params_xgb = study_xgb.best_params
    print(f"Best XGB Params: {best_params_xgb}")

    return best_params_rf, best_params_xgb

In [None]:
# Short 10-trial search per model. NOTE(review): best_rf / best_xgb are not
# referenced by the later cells shown in this notebook; the final model is built
# from the hard-coded best_rf_params / best_xgb_params dicts instead.
best_rf, best_xgb = tune_models(X_train_final, y_train, n_trials=10)

In [None]:
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge

def train_final_stacked_model(X_train, y_train, rf_params, xgb_params):
    """
    Fit a stacked ensemble of a Random Forest and an XGBoost regressor.

    Both base learners are seeded (random_state=42) and built from the given
    parameter dicts. A Ridge meta-model combines their 5-fold out-of-fold
    predictions and, because passthrough=True, also sees the original features.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target.
        rf_params (dict): Keyword arguments for RandomForestRegressor.
        xgb_params (dict): Keyword arguments for xgb.XGBRegressor.

    Returns:
        StackingRegressor: The fitted ensemble.
    """
    print("Building and Training Final Stacked Ensemble")

    base_learners = [
        ('rf', RandomForestRegressor(random_state=42, n_jobs=-1, **rf_params)),
        ('xgb', xgb.XGBRegressor(random_state=42, n_jobs=-1, **xgb_params)),
    ]

    ensemble = StackingRegressor(
        estimators=base_learners,
        final_estimator=Ridge(random_state=42),
        cv=5,
        passthrough=True,
        n_jobs=-1,
        verbose=0,
    )
    ensemble.fit(X_train, y_train)

    print("Stacked model training complete.")
    return ensemble

# Hyperparameters are pinned by hand here (presumably copied from an earlier,
# longer tuning run — TODO confirm). The best_rf / best_xgb values returned by
# tune_models in the cell above are NOT used.
best_rf_params = {
    'n_estimators': 678, 'max_depth': 37,
    'min_samples_split': 10, 'min_samples_leaf': 6,
    'max_features': 0.4336999565791009
}
best_xgb_params = {
    'n_estimators': 685, 'max_depth': 9,
    'learning_rate': 0.08324843697516912,
    'subsample': 0.7924504745698256,
    'colsample_bytree': 0.9323549613130819,
    'gamma': 0.3283587776863417
}

# Fit on the RFE-selected training features, then predict the held-out test split.
final_model = train_final_stacked_model(X_train_final, y_train, best_rf_params, best_xgb_params)
test_predictions = final_model.predict(X_test_final)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R² on the held-out split. The target is df['log_price'], so this score is on
# the log-price scale, not on raw prices.
r2 = r2_score(y_test, test_predictions)

print(f"\nR2 Score on the test set: {r2:.4f}")