In [29]:


# Set random seed for reproducibility
import zipfile
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFECV
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
import hdbscan
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
from datetime import datetime
import time
import folium
from folium.plugins import MarkerCluster
import random
import swifter
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Download NLTK resources
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)


##################################################################################################

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
df = pd.read_csv('test_data_reg.csv')    
target_col = 'review_scores_rating'
price_columns = ['nightly_price', 'price_per_stay', 'security_deposit', 'cleaning_fee', 'extra_people']
for col in price_columns:
    if df[col].dtype == 'object':
        df[col] = df[col].replace('[\$,]', '', regex=True).astype(float)

# 3. Convert percentage string to float
if 'host_response_rate' in df.columns and df['host_response_rate'].dtype == 'object':
    df['host_response_rate'] = df['host_response_rate'].str.replace('%', '').astype(float) / 100


drop_cols = ['host_acceptance_rate', 'square_feet']
df = df.drop(drop_cols, axis=1)
df.drop(['host_listings_count'], axis=1, inplace=True)
df.drop("thumbnail_url", axis = 1, inplace = True)

duplicate_count = df.duplicated().sum()
if duplicate_count > 0:
    df = df.drop_duplicates()
    print(f"Duplicates dropped. New shape: {df.shape}")

#save scaler and imputer -1
with open(r'D:\fcis\machine\project\pkls2\modesToReplaceNulls.pkl', 'rb') as file:
  modesForPickle =pickle.load(file)



# Check the type and keys
print(type(modesForPickle))
if isinstance(modesForPickle, dict):
    print(modesForPickle.keys())
else:
    print("The loaded object is not a dictionary.")

cat_cols = df.select_dtypes(include=['object']).columns

for col in cat_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(modesForPickle[col])


df.drop("zipcode", axis = 1, inplace = True)
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
x = pd.DataFrame(num_cols, columns=['Numeric Columns'])

# Create a scaler for all numeric columns
with open(r'D:\fcis\machine\project\pickleFiles\reg_scalerModel.pkl', 'rb') as file:
  scaler= pickle.load(file)

scaled_data = scaler.transform(df[num_cols])



with open(r'D:\fcis\machine\project\pickleFiles\re_imputerModel.pkl', 'rb') as file:
  imputer=pickle.load(file)

imputed_data = imputer.transform(scaled_data)

df[num_cols] = pd.DataFrame(scaler.inverse_transform(imputed_data), columns=num_cols, index=df.index)





skewed_cols = ['nightly_price', 'price_per_stay', 'number_of_reviews', 'host_total_listings_count', 'number_of_stays']

for col in skewed_cols:
    df[f'{col}'] = np.log1p(df[col])



with open(r'D:\fcis\machine\project\pkls2\lowerUpperOutliersForPickle.pkl', 'rb') as file:
    lowerUpperOutliers= pickle.load(file)



numeric_cols = df.select_dtypes(include="number").columns

for col in numeric_cols:
    ##get the outlier bounds from pickle then filter
    df[col] = np.where(df[col] < lowerUpperOutliers[col][0], lowerUpperOutliers[col][0], df[col])
    df[col] = np.where(df[col] > lowerUpperOutliers[col][1], lowerUpperOutliers[col][1], df[col])

host_total_listings_count = df.groupby('host_id')['host_id'].count().to_dict()
df['host_experience'] = np.log1p(df['host_id'].map(lambda x: host_total_listings_count.get(x, 0)))  # Default to 0 if missing

host_avg_price = df.groupby('host_id')['price_per_stay'].mean().to_dict()
df['host_avg_price'] = df['host_id'].map(host_avg_price)

host_total_reviews = df.groupby('host_id')['number_of_reviews'].sum().to_dict()
df['host_total_reviews'] = df['host_id'].map(host_total_reviews)


host_avg_reviews = df.groupby('host_id')['number_of_reviews'].mean()
df['host_avg_reviews'] = df['host_id'].map(host_avg_reviews)

host_avg_price = df.groupby('host_id')['nightly_price'].mean()
df['host_avg_price'] = df['host_id'].map(host_avg_price)
df['host_total_listings_count_log'] = np.log1p(df['host_total_listings_count'])

df['host_experience'] = df['host_total_listings_count_log']
df['price_per_stay'] = np.log1p(df['price_per_stay'])

df['super_experience'] = df['host_experience'] * (df['host_is_superhost'] == 't').astype(int)
df['reviews_per_stay'] = df['number_of_reviews'] / (df['number_of_stays'] + 1)
df['price_per_person'] = df['price_per_stay'] / df['accommodates']
df['price_diff'] = df['price_per_stay'] - df['nightly_price']

host_avg_response = df.groupby('host_id')['host_response_rate'].mean()

df['host_avg_response_rate'] = df['host_id'].map(host_avg_response)

df['price_per_guest'] = df['price_per_stay'] / df['accommodates']


with open(r'D:\fcis\machine\project\pkls2\LabelEncoder_for_host_is_superhost.pkl', 'rb') as file:
    le= pickle.load(file)

df['host_is_superhost'] = le.transform(df['host_is_superhost'])

df['superhost_reviews_interaction'] = df['host_is_superhost'] * df['number_of_reviews']
df['superhost_price_interaction'] = df['host_is_superhost'] * df['price_per_stay']

df['superhost_experience_interaction'] = df['host_is_superhost'] * df['host_experience']
df['superhost_reviews_interaction2'] = df['host_is_superhost'] * df['host_total_reviews']
df['reviews_per_listing'] = df['host_total_reviews'] / (df['host_total_listings_count'] + 1)

from sklearn.cluster import KMeans
import numpy as np


cluster_features = df[['host_total_listings_count', 'host_experience', 'host_total_reviews']].fillna(0)

with open(r'D:\fcis\machine\project\pickleFiles\clusterScaled.pkl', 'rb') as file:
    scaler=pickle.load(file)

cluster_scaled = scaler.fit_transform(cluster_features)

with open(r'D:\fcis\machine\project\pickleFiles\kmeanForHostCluster.pkl', 'rb') as file:
    kmeans=pickle.load(file)

df['host_cluster'] = kmeans.fit_predict(cluster_scaled)


url_cols = ['listing_url', 'host_url']
id_cols = ['id', 'host_id']
cols_to_drop = id_cols + url_cols
df.drop(cols_to_drop, axis = 1 , inplace = True)

date_cols = ['host_since', 'first_review', 'last_review']

for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Create features from dates
reference_date = pd.Timestamp('2023-01-01')  # Use a fixed reference date for consistency
for col in date_cols:
    if col in df.columns:
        df[f'{col}_days'] = (reference_date - df[col]).dt.days

        # Extract month and year as cyclical features
        if not df[col].isna().all():
            df[f'{col}_month_sin'] = np.sin(2 * np.pi * df[col].dt.month / 12)
            df[f'{col}_month_cos'] = np.cos(2 * np.pi * df[col].dt.month / 12)
            df[f'{col}_year'] = df[col].dt.year

        # Drop original date columns
        df = df.drop(col, axis=1)


text_cols = ['name','access' , 'description', 'neighborhood_overview',  'interaction', 'house_rules', 'host_about']

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

def process_text(text):
    if pd.isna(text):
        return {
            'word_count': 0,
            'sentiment_compound': 0,
            'sentiment_pos': 0,
            'sentiment_neg': 0,
            'sentiment_neu': 0
        }

    # Clean text
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Count words
    word_count = len(text.split())

    # Get sentiment
    sentiment = sia.polarity_scores(text)

    return {
        'word_count': word_count,
        'sentiment_compound': sentiment['compound'],
        'sentiment_pos': sentiment['pos'],
        'sentiment_neg': sentiment['neg'],
        'sentiment_neu': sentiment['neu']
    }

def process_column(df, col):
    print(f"  Processing {col}...")
    text_features = df[col].apply(process_text)  # Apply process_text to each element

    # Convert list of dicts to dataframe
    text_features_df = pd.DataFrame(text_features.tolist())

    # Add column prefix
    text_features_df.columns = [f"{col}_{feat}" for feat in text_features_df.columns]

    return text_features_df

def apply_tfidf(df, col, max_features=20):
    MODEL_DIR = "mo/mo"
    print(f"Applying TF-IDF on: {col}")

    # Load the saved TF-IDF vectorizer
    tfidf = joblib.load(os.path.join(MODEL_DIR, f"{col}_tfidf_vectorizer.joblib"))

    # Fill NaNs with empty strings and transform using the loaded vectorizer
    tfidf_matrix = tfidf.transform(df[col].fillna(""))

    # Load the column names
    tfidf_columns = joblib.load(os.path.join(MODEL_DIR, f"{col}_tfidf_columns.joblib"))

    # Create a DataFrame from TF-IDF matrix with the loaded column names
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_columns)

    return tfidf_df

# Process each text column for sentiment analysis and word count
for col in text_cols:
    if col in df.columns:
        text_features_df = process_column(df, col)

        df = pd.concat([df, text_features_df], axis=1)

        #tfidf_df = apply_tfidf(df, col)

        #df = pd.concat([df, tfidf_df], axis=1)

        df = df.drop(col, axis=1)


df['host_sentiment_label'] = df['host_about_sentiment_compound'].apply(
    lambda x: 'positive' if x > 0.5 else 'neutral' if x > 0 else 'negative'
)

df['overall_sentiment_score'] = (
    df[['host_about_sentiment_compound']].mean(axis=1)
)

cols_to_drop = [
    col for col in df.columns
    if col.endswith('_sentiment_neg') or
       col.endswith('_sentiment_neu') or
       col.startswith('name_sentiment')
]
df.drop(columns=cols_to_drop, inplace=True)

# Interaction between reviews and host features
if 'number_of_reviews' in df.columns and 'host_is_superhost' in df.columns:
    # Convert binary text values to numeric if needed
    if df['host_is_superhost'].dtype == 'object':
        df['host_is_superhost'] = df['host_is_superhost'].map({'t': 1, 'f': 0})

    df['superhost_review_interaction'] = df['host_is_superhost'] * df['number_of_reviews']

df['price_per_person'] = np.log1p(df['price_per_person'])
df = df.drop(['amenities'], axis=1)
df.drop(['property_type'], axis=1, inplace=True)
df.drop(['host_has_profile_pic', 'require_guest_profile_picture','require_guest_phone_verification','requires_license','is_business_travel_ready' ], axis=1, inplace=True)

binary_cols = ['host_is_superhost',  'host_identity_verified',
               'is_location_exact', 'instant_bookable']
for col in binary_cols:
    if col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].map({'t': 1, 'f': 0})


df = df.drop('city', axis=1)

# # Price-related ratios
if all(col in df.columns for col in ['price_per_stay', 'nightly_price', 'cleaning_fee']):
    df['price_to_nightly_ratio'] = df['price_per_stay'] / (df['nightly_price'] + 0.001)  # Avoid division by zero

# Reviews-related ratios
if all(col in df.columns for col in ['number_of_reviews', 'number_of_stays']):
    df['review_rate'] = df['number_of_reviews'] / (df['number_of_stays'] + 0.001)

# 9. Remove high cardinality categorical columns
high_cardinality_cols = ['host_location', 'host_neighbourhood', 'country', 'city', 'street',
                         'neighbourhood', 'neighbourhood_cleansed', 'zipcode', 'market', 'smart_location']

for col in high_cardinality_cols:
    if col in df.columns:
        df = df.drop(col, axis=1)

with open(r'D:\fcis\machine\project\pickleFiles\host_name_target_means.pkl', 'rb') as file:
            target_means= pickle.load(file)

df['host_name_target_encoded'] = df['host_name'].map(target_means)
with open(r'D:\fcis\machine\project\pkls2\host_name_value_counts.pkl', 'rb') as file:
            value_counts= pickle.load(file)

df['host_name_freq_encoded'] = df['host_name'].map(value_counts)

with open(r'D:\fcis\machine\project\pickleFiles\selected_features.pkl', 'rb') as file:
    selected_features= pickle.load(file)

df = df[selected_features.tolist() + [target_col]]

bool_cols = df.select_dtypes(include=['bool']).columns

# Convert boolean columns to integers (0 for False, 1 for True)
df[bool_cols] = df[bool_cols].astype(int)

X = df.drop(columns=[target_col])
y = df[target_col]
with open(r'D:\fcis\machine\project\pickleFiles\scaleFeatures.pkl', 'rb') as file:
    scaler= pickle.load(file)

X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=selected_features)

#models

with open(r'D:\fcis\machine\project\pickleFiles\linearRegression.pkl', 'rb') as file:
    lin_reg= pickle.load(file)


y_pred_test = lin_reg.predict(X_scaled)

r2_LR = r2_score(y, y_pred_test)
mse = mean_squared_error(y, y_pred_test)
mae = mean_absolute_error(y, y_pred_test)
rmse = np.sqrt(mse)

with open(r'D:\fcis\machine\project\pickleFiles\huberModel.pkl', 'rb') as file:
    huber= pickle.load(file)

y_pred_huber = huber.predict(X_scaled)

r2_hyper = r2_score(y, y_pred_huber)
print("Huber Regression R²:",r2_hyper )
print("Huber Regression MAE:", mean_absolute_error(y, y_pred_huber))
print("Huber Regression RMSE:", mean_squared_error(y, y_pred_huber))#, squared=False

with open(r'D:\fcis\machine\project\pickleFiles\linearLogTransformation.pkl', 'rb') as file:
    lin_reg=pickle.load(file)

y_pred_log = lin_reg.predict(X)
y_pred_exp = np.exp(y_pred_log)
print("Log-Transformed Regression R²:", r2_score(y, y_pred_exp))



with open(r'D:\fcis\machine\project\pickleFiles\LinearOriginal.pkl', 'rb') as file:
    model_original=pickle.load(file)

with open(r'D:\fcis\machine\project\pickleFiles\Log Transform.pkl', 'rb') as file:
    model_log= pickle.load(file)

with open(r'D:\fcis\machine\project\pickleFiles\Sqrt Transform.pkl', 'rb') as file:
    model_sqrt=pickle.load(file)

with open(r'D:\fcis\machine\project\pickleFiles\Polynomial.pkl', 'rb') as file:
    model_poly= pickle.load(file)

with open(r'D:\fcis\machine\project\pickleFiles\HuberModel2.pkl', 'rb') as file:
    model_robust =pickle.load(file)

with open(r'D:\fcis\machine\project\pickleFiles\RANSAC.pkl', 'rb') as file:
    model_ransac= pickle.load(file)


models = {
    'Linear': model_original,
    # 'Log Transform': model_log,
    # 'Sqrt Transform': model_sqrt,
    'Polynomial': model_poly,
    # 'Huber': model_robust,
    # 'RANSAC': model_ransac
}

with open(r'D:\fcis\machine\project\pickleFiles\X_polyTransform.pkl', 'rb') as file:
    poly= pickle.load(file)

X_poly = poly.transform(X)


for name, model in models.items():
    if name in ['Log Transform', 'Sqrt Transform']:
        if name == 'Log Transform':
            preds = np.expm1(model.predict(X))
        else:
            preds = model.predict(X) ** 2
    else:
        if name == 'Polynomial':
            preds = model.predict(X_poly)
        elif name =='Log-Transformed Regression':
          continue
        else:
            preds = model.predict(X)

    mse = mean_squared_error(y, preds)
    r2 = r2_score(y, preds)

    print(f"{name} = MSE: {mse:.4f}, R²: {r2:.4f}")



with open(r'D:\fcis\machine\project\pickleFiles\elastic_net_model.pkl', 'rb') as file:
    elastic_net_model=pickle.load(file)

predict_en = elastic_net_model.predict(X_scaled)

r2_en = r2_score(y, predict_en)
mse_en = mean_squared_error(y, predict_en)
mae_en = mean_absolute_error(y, predict_en)

print("ElasticNet model R²: {:.4f}".format(r2_en))
print("ElasticNet model MAE: {:.4f}".format(mae_en))


with open(r'D:\fcis\machine\project\pickleFiles\best_DT_model.pkl', 'rb') as file:
    best_DT_model= pickle.load(file)

# Make predictions on the test set
y_pred = best_DT_model.predict(X)

# Evaluate the model
mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")

with open(r'D:\fcis\machine\project\pickleFiles\DecisionTreeRegressor.pkl', 'rb') as file:
    reg= pickle.load(file)
y_pred_test = reg.predict(X)
# R² Scores

print('Decision_Tree R2_Test: {0:0.4f}'.format(r2_score(y, y_pred_test)))


<class 'dict'>
dict_keys(['summary', 'space', 'description', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_is_superhost', 'host_neighbourhood', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'state', 'zipcode', 'market'])
  Processing name...
  Processing access...
  Processing description...
  Processing neighborhood_overview...
  Processing interaction...
  Processing house_rules...
  Processing host_about...
Huber Regression R²: 0.37394736490797753
Huber Regression MAE: 2.362343820716811
Huber Regression RMSE: 12.506947579458668
Log-Transformed Regression R²: -460.0656048805365
Linear = MSE: 11.7074, R²: 0.4140
Polynomial = MSE: 11.2697, R²: 0.4359
ElasticNet model R²: 0.4131
ElasticNet model MAE: 2.4051
Mean Squared Error (MSE): 11.9227
Mean Absolute Error (MAE): 2.3371
R² Score: 0.4032
Decision_Tree R2_Test: 0.3993


