<a href="https://colab.research.google.com/github/chiragkhachane/real-estate-anaytics/blob/main/Third_Estate_Risk_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

# Read the raw property records.
# NOTE(review): no `encoding` argument is passed, so pandas decodes the file with
# its default (UTF-8), not latin-1. Add encoding='latin-1' here if MasterData.csv
# is actually Latin-1 encoded - confirm against the source file.
data = pd.read_csv("MasterData.csv")

# Columns carried through the rest of the script (identifier, text, categorical
# and numeric attributes). The order here becomes the column order of `data`.
selected_columns = ['Print Key', 'Code_Description', 'Comments', 'Prop Class Description', 'Code', 'Owner1', 'Owner2',
                   'Neighborhood', 'Dist Name', 'Year Built', 'Distinct count of Case Reference', '#ofBaths', '#ofBeds',
                   'GEOID20 blockgroup', 'Numberof Units', 'Total Value', 'Mortgage', 'CostPerSQFT']

# Keep only those columns and discard every row that has a missing value in any of them.
data = data[selected_columns].dropna()

# The print key is an identifier, so it is held as text rather than as a number.
data['Print Key'] = data['Print Key'].astype(str)

# Numeric attributes may arrive as text with thousands separators (e.g. "1,250");
# strip the commas and convert each of these columns to float.
numerical_columns = ['Year Built', 'Distinct count of Case Reference', '#ofBaths', '#ofBeds', 'Numberof Units', 'Total Value', 'Mortgage', 'CostPerSQFT']
data = data.assign(**{
    name: data[name].replace(',', '', regex=True).astype(float)
    for name in numerical_columns
})

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode textual data. Tokenization is cheap, so all rows are encoded
# together and padded to one common sequence length.
encoded_texts = tokenizer(data['Code_Description'].tolist(), padding=True, truncation=True, return_tensors='tf')

# Extract contextualized embeddings from BERT model.
# The model is run over fixed-size slices of rows instead of the whole dataset in
# a single call, so the memory needed for activations is bounded by the batch
# size rather than growing with the number of rows in the CSV.
bert_batch_size = 32
row_count = int(encoded_texts['input_ids'].shape[0])
cls_embedding_batches = []
for batch_start in range(0, row_count, bert_batch_size):
    batch_end = batch_start + bert_batch_size
    # Slice every tokenizer output (ids, masks, ...) to the same rows.
    batch_inputs = {name: tensor[batch_start:batch_end] for name, tensor in encoded_texts.items()}
    batch_outputs = bert_model(batch_inputs)
    # Position 0 of each sequence is the [CLS] token; its hidden state is used
    # as the fixed-length representation of the whole description.
    cls_embedding_batches.append(batch_outputs.last_hidden_state[:, 0, :])

# One row per record, one column per hidden dimension.
bert_embeddings = tf.concat(cls_embedding_batches, axis=0)

# Hand-chosen importance factor for each attribute, keyed by dataset column name.
# Later in the script these factors are multiplied into the feature matrix.
# Insertion order is significant because the script iterates over this mapping.
custom_weights = dict([
    # Free-text and categorical attributes
    ('Code_Description', 0.7),
    ('Comments', 0.4),
    ('Prop Class Description', 0.1),
    ('Code', 0.05),
    ('Owner1', 0.05),
    ('Owner2', 0.05),
    ('Neighborhood', 0.1),
    ('Dist Name', 0.05),
    # Numeric attributes
    ('Year Built', 0.05),
    ('Distinct count of Case Reference', 0.4),
    ('#ofBaths', 0.05),
    ('#ofBeds', 0.05),
    ('GEOID20 blockgroup', 0.1),
    ('Numberof Units', 0.1),
    ('Total Value', 0.2),
    ('Mortgage', 0.8),
    ('CostPerSQFT', 0.2),
])

# Combine BERT embeddings with other features.
# Resulting column layout: [embedding dims ..., tabular columns in the order listed here].
# NOTE(review): 'GEOID20 blockgroup' is a geographic identifier, not a quantity -
# confirm it is meant to enter the model as a plain number.
tabular_feature_columns = ['Year Built', 'Distinct count of Case Reference', '#ofBaths', '#ofBeds', 'GEOID20 blockgroup',
                           'Numberof Units', 'Total Value', 'Mortgage', 'CostPerSQFT']
features = tf.concat([bert_embeddings, data[tabular_feature_columns].values], axis=1)

# Feature scaling.
# Standardisation has to happen BEFORE the weights are applied: it rescales each
# column to unit variance, which would cancel any constant per-column multiplier
# applied earlier.
scaler = StandardScaler()
custom_weighted_features = scaler.fit_transform(features.numpy())

# Apply custom weights to the standardised features.
# Columns are addressed by their real position in `features`, i.e. the embedding
# block first and the tabular columns after it.
embedding_width = int(bert_embeddings.shape[-1])
for feature, weight in custom_weights.items():
    if feature == 'Code_Description':
        # The description is represented by the whole embedding block, so the
        # weight applies to every embedding dimension, not just the first one.
        custom_weighted_features[:, :embedding_width] *= weight
    elif feature in tabular_feature_columns:
        feature_index = embedding_width + tabular_feature_columns.index(feature)
        custom_weighted_features[:, feature_index] *= weight
    # Any other weighted attribute (e.g. 'Comments', 'Owner1') has no column in
    # the feature matrix, so there is nothing to scale for it.

# NOTE(review): tree ensembles split on one feature at a time, so a positive
# per-column rescaling should not change what a random forest can learn; these
# weights only matter for scale-sensitive models - confirm they are still wanted.

# Define target variable
# NOTE(review): the target is the 'Print Key' identifier itself. LabelEncoder maps
# each distinct key to its position in sorted order, so the regressor learns that
# ordinal position rather than an independent risk label - confirm this is the
# intended target.
y = data['Print Key']

# Initialize label encoder for target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Initialize Random Forest Regression model.
# A fixed random_state makes the bootstrap samples and split choices repeatable,
# so re-running the script produces the same exported scores.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the Random Forest Regression model on the entire dataset
rf_model.fit(custom_weighted_features, y_encoded)

# Predict risk scores for all print keys.
# NOTE(review): these are predictions on the same rows the model was fitted on;
# they say nothing about performance on unseen data.
all_predictions = rf_model.predict(custom_weighted_features)

# Create a DataFrame with print keys and predicted risk scores (one row per record)
predictions_df = pd.DataFrame({'Print Key': data['Print Key'], 'Predicted_Risk_Score': all_predictions})

# Export predictions to a CSV file (row index omitted)
predictions_df.to_csv('predicted_risk_scores2.csv', index=False)