In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
data = pd.read_csv('Real_Estate.csv')

In [5]:
data.head()

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012-09-02 16:42:30.519336,13.3,4082.015,8,25.007059,121.561694,6.488673
1,2012-09-04 22:52:29.919544,35.5,274.0144,2,25.012148,121.54699,24.970725
2,2012-09-05 01:10:52.349449,1.1,1978.671,10,25.00385,121.528336,26.694267
3,2012-09-05 13:26:01.189083,22.2,1055.067,5,24.962887,121.482178,38.091638
4,2012-09-06 08:29:47.910523,8.5,967.4,6,25.011037,121.479946,21.65471


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Transaction date                     414 non-null    object 
 1   House age                            414 non-null    float64
 2   Distance to the nearest MRT station  414 non-null    float64
 3   Number of convenience stores         414 non-null    int64  
 4   Latitude                             414 non-null    float64
 5   Longitude                            414 non-null    float64
 6   House price of unit area             414 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 22.8+ KB


In [12]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
import torch.optim as optim

# Load the dataset
try:
    df = pd.read_csv('Real_Estate.csv')
except FileNotFoundError:
    print("The file 'Real_Estate.csv' was not found. Please ensure it is in the correct directory.")
    exit()

# Check the dataframe
print("Dataset head:")
print(df.head())
print("\nDataset info:")
print(df.info())

# Convert 'Transaction date' to datetime
df['Transaction date'] = pd.to_datetime(df['Transaction date'])

# Extract textual features
df['Transaction date text'] = df['Transaction date'].astype(str)

# Prepare the features and labels
textual_features = df['Transaction date text'].tolist()
numerical_features = df[['House age', 'Distance to the nearest MRT station', 'Number of convenience stores', 'Latitude', 'Longitude']].values
labels = df['House price of unit area'].values

# Normalize numerical features
scaler = StandardScaler()
numerical_features = scaler.fit_transform(numerical_features)

# Tokenize textual data with BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_texts = tokenizer(textual_features, padding=True, truncation=True, return_tensors='pt')

# Extract BERT embeddings
model = BertModel.from_pretrained('bert-base-uncased')
with torch.no_grad():
    outputs = model(**encoded_texts)
    text_embeddings = outputs.last_hidden_state[:, 0, :]  # Use the [CLS] token embedding

# Combine text embeddings with numerical features
combined_features = torch.cat((text_embeddings, torch.tensor(numerical_features, dtype=torch.float32)), dim=1)

# Define a regression model
class RegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize the model
input_dim = combined_features.shape[1]
regression_model = RegressionModel(input_dim)

# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(regression_model.parameters(), lr=0.001)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(combined_features.numpy(), labels, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    regression_model.train()
    optimizer.zero_grad()
    outputs = regression_model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
regression_model.eval()
with torch.no_grad():
    test_outputs = regression_model(X_test)
    test_loss = criterion(test_outputs, y_test)
    print(f'Test Loss: {test_loss.item():.4f}')


Dataset head:
             Transaction date  House age  Distance to the nearest MRT station  \
0  2012-09-02 16:42:30.519336       13.3                            4082.0150   
1  2012-09-04 22:52:29.919544       35.5                             274.0144   
2  2012-09-05 01:10:52.349449        1.1                            1978.6710   
3  2012-09-05 13:26:01.189083       22.2                            1055.0670   
4  2012-09-06 08:29:47.910523        8.5                             967.4000   

   Number of convenience stores   Latitude   Longitude  \
0                             8  25.007059  121.561694   
1                             2  25.012148  121.546990   
2                            10  25.003850  121.528336   
3                             5  24.962887  121.482178   
4                             6  25.011037  121.479946   

   House price of unit area  
0                  6.488673  
1                 24.970725  
2                 26.694267  
3                 38.091638  


In [6]:

import pandas as pd
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.stem import WordNetLemmatizer
import nltk
import re

# Download NLTK data
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('Real_Estate.csv')

# Check the dataframe
print("Dataset head:")
print(df.head())
print("\nDataset info:")
print(df.info())

# Preprocess the 'Transaction date' column to extract meaningful text
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(WordNetLemmatizer().lemmatize(token, pos='v'))
    return result

# Apply preprocessing to each row in the 'Transaction date' column
processed_docs = df['Transaction date'].astype(str).map(preprocess)

# Create a dictionary representation of the documents
dictionary = gensim.corpora.Dictionary(processed_docs)

# Filter out words that occur less than 2 documents, or more than 50% of the documents
dictionary.filter_extremes(no_below=2, no_above=0.5)

# Create a bag-of-words representation of the documents
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Train the LDA model
lda_model = gensim.models.LdaModel(bow_corpus, num_topics=5, id2word=dictionary, passes=15)

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

# Optional: Save the LDA model
# lda_model.save("lda_model_real_estate")

# Optional: Load the LDA model
# lda_model = LdaModel.load("lda_model_real_estate")



[nltk_data] Downloading package wordnet to C:\Users\Md Saiful
[nltk_data]     Islam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Dataset head:
             Transaction date  House age  Distance to the nearest MRT station  \
0  2012-09-02 16:42:30.519336       13.3                            4082.0150   
1  2012-09-04 22:52:29.919544       35.5                             274.0144   
2  2012-09-05 01:10:52.349449        1.1                            1978.6710   
3  2012-09-05 13:26:01.189083       22.2                            1055.0670   
4  2012-09-06 08:29:47.910523        8.5                             967.4000   

   Number of convenience stores   Latitude   Longitude  \
0                             8  25.007059  121.561694   
1                             2  25.012148  121.546990   
2                            10  25.003850  121.528336   
3                             5  24.962887  121.482178   
4                             6  25.011037  121.479946   

   House price of unit area  
0                  6.488673  
1                 24.970725  
2                 26.694267  
3                 38.091638  


ValueError: cannot compute LDA over an empty collection (no terms)