In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the transactions CSV (path is relative to the working directory) into df.
df = pd.read_csv('Transactions.csv')

In [5]:
# Normalise '/' separators to '-' and parse the result as day-month-year.
# errors='coerce' turns any value that does not match the format into NaT.
df['instance_date'] = pd.to_datetime(
    df['instance_date'].str.replace('/', '-', regex=False),
    format='%d-%m-%Y',
    errors='coerce',
)

In [7]:
# Keep transactions dated 2013-01-01 or later; NaT dates compare False and are dropped too.
df = df.loc[df['instance_date'] >= '2013-01-01']

In [9]:
# Discard rows that have no 'rooms_en' value.
df = df.dropna(subset=['rooms_en'])

In [11]:
# Relabel the 'Unit' property type as 'Apartment'; every other value is left as is.
df['property_type_en'] = df['property_type_en'].replace({'Unit': 'Apartment'})

In [None]:
# Exclude rows whose usage is exactly 'Hospitality' or 'Storage' (rows with a missing usage are kept).
df = df[~df['property_usage_en'].isin(['Hospitality', 'Storage'])]

In [None]:
# Remove 'Hospitality' again, this time after stripping surrounding whitespace.
# NOTE(review): the preceding cell already removes exact 'Hospitality' matches, so this only
# catches whitespace-padded variants.
df = df[df['property_usage_en'].str.strip() != 'Hospitality']

In [13]:
# Room categories that are excluded from the dataset.
drop_values = ['PENTHOUSE', 'Single Room', 'Store', 'GYM', '8 B/R', '9 B/R']

# Keep only the rows whose 'rooms_en' is not one of the excluded categories.
df = df.loc[~df['rooms_en'].isin(drop_values)]

In [15]:
# Room categories kept under their own label; a later cell maps anything else to 'Others'.
keep_values = ['1 B/R', '2 B/R', 'Studio', '3 B/R', 'Office', '4 B/R', '5 B/R', '6 B/R', '7 B/R']

In [19]:
# Remove 'Shop' rows; whitespace is stripped before comparing so padded values match as well.
df = df[df['rooms_en'].str.strip() != 'Shop']

In [None]:
# Collapse every room category that is not in keep_values into a single 'Others' bucket.
# Series.where keeps values where the mask is True and substitutes 'Others' elsewhere;
# this is the vectorised equivalent of a per-row lambda membership test.
df['rooms_en'] = df['rooms_en'].where(df['rooms_en'].isin(keep_values), 'Others')

In [21]:
# Row count per room category, most frequent first.
rooms_frequency = df['rooms_en'].value_counts()
rooms_frequency

rooms_en
1 B/R     217595
2 B/R     156843
Studio    112080
3 B/R      93846
4 B/R      31474
Office     28358
5 B/R       3119
6 B/R        192
7 B/R         29
Name: count, dtype: int64

In [29]:
# Mean 'procedure_area' within each 'rooms_en' category.
average_procedure_by_room = df['procedure_area'].groupby(df['rooms_en']).mean()

# Header line followed by the per-category means.
print("Average procedure_area for each room type:", average_procedure_by_room, sep="\n")

Average procedure_area for each room type:
rooms_en
1 B/R       76.046924
2 B/R      128.950931
3 B/R      209.961288
4 B/R      323.585627
5 B/R      512.383094
6 B/R     1055.768542
7 B/R     1644.627586
Office     132.826357
Studio      42.342994
Name: procedure_area, dtype: float64


In [31]:
# Mean procedure_area per room category, as a two-column frame (rooms_en, procedure_area).
average_procedure_by_room = df.groupby('rooms_en')['procedure_area'].mean().reset_index()

# Ordinal encoding of the room category: dense rank of the mean area, smallest mean gets 1.
average_procedure_by_room['room_value'] = (
    average_procedure_by_room['procedure_area'].rank(method='dense', ascending=True).astype(int)
)

# Attach the rank to every transaction. Any 'room_value' column left over from a previous
# run of this cell is dropped first; otherwise the merge would produce room_value_x /
# room_value_y and the lookups below would raise KeyError.
df = pd.merge(
    df.drop(columns=['room_value'], errors='ignore'),
    average_procedure_by_room[['rooms_en', 'room_value']],
    on='rooms_en',
    how='left',
)

# Distinct values of the three columns involved, for a quick sanity check of the mapping.
unique_rooms_en = df['rooms_en'].unique()
unique_procedure_area = df['procedure_area'].unique()
unique_room_value = df['room_value'].unique()

# Print each unique set of values
print("Unique rooms_en values:", unique_rooms_en)
print("Unique procedure_area values:", unique_procedure_area)
print("Unique room_value values:", unique_room_value)

Unique rooms_en values: ['Office' '1 B/R' '2 B/R' 'Studio' '3 B/R' '4 B/R' '5 B/R' '6 B/R' '7 B/R']
Unique procedure_area values: [130.13 268.94  63.67 ... 275.28  12.75 362.74]
Unique room_value values: [4 2 3 1 5 6 7 8 9]


In [None]:
# Preview the first rows after the room_value merge.
df.head()

In [None]:
# Remove the columns that are not passed on to the encoder below. Reassigning instead of
# using inplace=True, together with errors='ignore', lets this cell be re-run without a
# KeyError once the columns are already gone.
# NOTE(review): later cells in this notebook still read 'rooms_en' and
# 'property_sub_type_en' from df; they cannot run after this drop.
df = df.drop(
    columns=['transaction_id', 'instance_date', 'property_sub_type_en', 'rooms_en', 'nearest_metro_en', 'has_parking'],
    errors='ignore',
)

In [None]:
# Shorten the two registration-type labels in one pass; other values are left untouched.
df['reg_type_en'] = df['reg_type_en'].replace({
    'Existing Properties': 'Ready Property',
    'Off-Plan Properties': 'Off-Plan',
})

In [None]:
# Preview the frame after the column drop and the reg_type_en relabelling.
df.head()

In [None]:
import category_encoders as ce
import pickle

# Categorical feature columns that get binary-encoded.
categorical_columns = [
    'trans_group_en',
    'property_type_en',
    'property_usage_en',
    'reg_type_en',
    'area_name_en',
]

# Fit the encoder on every column except the target and build the encoded feature frame.
encoder = ce.BinaryEncoder(cols=categorical_columns)
data = encoder.fit_transform(df.drop(columns=['actual_worth']))

# Persist the fitted encoder so the same encoding can be re-applied later.
with open('3encoder.pkl', 'wb') as file:
    pickle.dump(encoder, file)

# Preview the encoded feature frame.
data.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed. Features are the encoded frame `data`;
# the target is the natural log of 'actual_worth'.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    np.log(df['actual_worth']),
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

# Run the hyperparameter search on a 10% sample of the training split
# (presumably to keep the search time manageable).
X_subset, _, y_subset, _ = train_test_split(X_train, y_train, train_size=0.1, random_state=42)

# Integer ranges sampled by the search (scipy randint: upper bound exclusive).
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(10, 30),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 11)
}

# random_state on the search itself fixes which 10 candidates are drawn from param_dist;
# without it every run samples different candidates and the result is not reproducible.
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_subset, y_subset)

# Best candidate, refit by the search on the whole subset.
best_model = random_search.best_estimator_

print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)

In [None]:
# Refit the selected estimator on the full training split (the search above only saw X_subset).
best_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the refit model on the held-out split. y_test is log(actual_worth),
# so both metrics are expressed in log units.
y_pred = best_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R²:", r2)
print("Mean Squared Error:", mse)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# Features are the encoded frame; the target is log(actual_worth). Same 80/20 seeded split
# as used for the random-forest model.
X = data
y = np.log(df['actual_worth'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator for the search.
gbm = GradientBoostingRegressor(random_state=42)

# Exhaustive grid: 2 x 3 x 3 = 18 candidates.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 4, 5]
}

# 5-fold CV, ranking candidates by (negated) mean squared error.
grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Best candidate, refit on the whole training split.
best_gbm = grid_search.best_estimator_

# Held-out RMSE. The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent scikit-learn releases.
y_pred = best_gbm.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Best Parameters:", grid_search.best_params_)
print("Test Set RMSE:", rmse)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Final gradient-boosting model. The hyperparameters are hard-coded here rather than read
# from grid_search.best_params_.
final_model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)

# Train on every row (no hold-out); the target is log(actual_worth).
X, y = data, np.log(df['actual_worth'])
final_model.fit(X, y)

In [None]:
# Persist the model; the context manager closes the file even if pickling raises.
# NOTE(review): this saves best_model (the random-forest search result), not the
# final_model fitted in the preceding cell; confirm that is the intended artefact.
with open('2predictor.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [None]:
# Number of missing values in each column of df.
df.isnull().sum()

In [None]:
def fill_property_sub_type_na(data):
    """Fill missing 'property_sub_type_en' values from 'property_type_en'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both 'property_sub_type_en' and 'property_type_en'.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned so the call can be used in an assignment.

    Raises
    ------
    KeyError
        If either column is missing from ``data``.
    """
    # Index-aligned, vectorised fill: rows whose sub type is missing take the value of
    # 'property_type_en' from the same row; all other rows are left unchanged. Unlike a
    # row-wise apply, this also works on an empty frame.
    data['property_sub_type_en'] = data['property_sub_type_en'].fillna(data['property_type_en'])
    return data


# Apply the function to the dataset
# NOTE(review): an earlier cell drops 'property_sub_type_en' from df, so on a fresh
# top-to-bottom run this call fails with KeyError.
df = fill_property_sub_type_na(df)

In [None]:
# Row count per property sub type, most frequent first.
sub_type_frequency = df['property_sub_type_en'].value_counts()
sub_type_frequency

In [None]:
# Row count per property type, most frequent first.
type_frequency = df['property_type_en'].value_counts()
type_frequency

In [None]:
# Remove rows whose whitespace-stripped usage is 'Hospitality'.
# NOTE(review): the same filter already appears in an earlier cell of this notebook.
df = df[df['property_usage_en'].str.strip() != 'Hospitality']

In [None]:
# Row count per property usage, most frequent first.
usage_frequency = df['property_usage_en'].value_counts()
usage_frequency

In [None]:
# Row count per room category.
# NOTE(review): an earlier cell drops 'rooms_en' from df, so this lookup fails on a fresh
# top-to-bottom run.
rooms_frequency = df['rooms_en'].value_counts()
rooms_frequency

In [None]:
# Row count per room category (identical to the preceding cell).
rooms_frequency = df['rooms_en'].value_counts()
rooms_frequency

In [None]:
# Row count per room category (identical to the preceding cell).
rooms_frequency = df['rooms_en'].value_counts()
rooms_frequency

In [None]:
# Row count per room category (identical to the preceding cell).
rooms_frequency = df['rooms_en'].value_counts()
rooms_frequency

In [None]:
# Preview the first ten rows of df.
df.head(10)

In [None]:
# Row count per property type, most frequent first.
property_type_en_frequency = df['property_type_en'].value_counts()
property_type_en_frequency

In [None]:
# Mean 'procedure_area' within each 'rooms_en' category.
average_procedure_by_room = df['procedure_area'].groupby(df['rooms_en']).mean()

# Header line followed by the per-category means.
print("Average procedure_area for each room type:", average_procedure_by_room, sep="\n")

In [None]:
# Mean procedure_area per room category, as a two-column frame (rooms_en, procedure_area).
average_procedure_by_room = df.groupby('rooms_en')['procedure_area'].mean().reset_index()

# Ordinal encoding of the room category: dense rank of the mean area, smallest mean gets 1.
average_procedure_by_room['room_value'] = (
    average_procedure_by_room['procedure_area'].rank(method='dense', ascending=True).astype(int)
)

# Build df2 = df plus the rank column. If df already carries a 'room_value' column it is
# dropped first; otherwise the merge would emit room_value_x / room_value_y and the preview
# below would raise KeyError.
df2 = pd.merge(
    df.drop(columns=['room_value'], errors='ignore'),
    average_procedure_by_room[['rooms_en', 'room_value']],
    on='rooms_en',
    how='left',
)

# Preview the mapping on the first two rows.
df2[['rooms_en', 'procedure_area', 'room_value']].head(2)

In [None]:
# Preview df (df2, not df, received the room_value merge in the preceding cell).
df.head()

In [None]:
# Remove the columns that are not passed on to the encoder below. Reassigning instead of
# using inplace=True, together with errors='ignore', keeps this cell runnable when some or
# all of these columns have already been dropped by an earlier cell.
df = df.drop(
    columns=['transaction_id', 'instance_date', 'property_sub_type_en', 'rooms_en', 'nearest_metro_en', 'has_parking'],
    errors='ignore',
)

In [None]:
# Shorten the two registration-type labels in one pass; other values are left untouched.
df['reg_type_en'] = df['reg_type_en'].replace({
    'Existing Properties': 'Ready Property',
    'Off-Plan Properties': 'Off-Plan',
})

In [None]:
# Preview the frame after the column drop and the reg_type_en relabelling.
df.head()

In [None]:
# Write df2 to CSV. The row index is written too (to_csv default), so reading the file back
# yields an extra unnamed leading column.
df2.to_csv('Transaction2.csv')

In [None]:
# Row count per registration type, most frequent first.
reg_type_frequency = df['reg_type_en'].value_counts()
reg_type_frequency

In [None]:
from joblib import dump

In [None]:
import category_encoders as ce
import pickle

# Categorical feature columns that get binary-encoded.
categorical_columns = [
    'trans_group_en',
    'property_type_en',
    'property_usage_en',
    'reg_type_en',
    'area_name_en',
]

# Fit the encoder on every column except the target and build the encoded feature frame.
encoder = ce.BinaryEncoder(cols=categorical_columns)
data = encoder.fit_transform(df.drop(columns=['actual_worth']))

# Persist the fitted encoder so the same encoding can be re-applied later.
with open('r_encoder.pkl', 'wb') as file:
    pickle.dump(encoder, file)

# Preview the encoded feature frame.
data.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed. Features are the encoded frame `data`;
# the target is the natural log of 'actual_worth'.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    np.log(df['actual_worth']),
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

# Run the hyperparameter search on a 10% sample of the training split
# (presumably to keep the search time manageable).
X_subset, _, y_subset, _ = train_test_split(X_train, y_train, train_size=0.1, random_state=42)

# Integer ranges sampled by the search (scipy randint: upper bound exclusive).
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(10, 30),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 11)
}

# random_state on the search itself fixes which 10 candidates are drawn from param_dist;
# without it every run samples different candidates and the result is not reproducible.
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_subset, y_subset)

# Best candidate, refit by the search on the whole subset.
best_model = random_search.best_estimator_

print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)


In [None]:
# Refit the selected estimator on the full training split (the search above only saw X_subset).
best_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the refit model on the held-out split. y_test is log(actual_worth),
# so both metrics are expressed in log units.
y_pred = best_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R²:", r2)
print("Mean Squared Error:", mse)

In [None]:
# Persist the refit random-forest model; the context manager closes the file even if
# pickling raises.
with open('r_predictor.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [None]:
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.metrics import mean_squared_error

# # Prepare data
# X = data
# y = np.log(df2['actual_worth'])
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize the model
# gbm = GradientBoostingRegressor(random_state=42)

# # Set up the parameter grid
# param_grid = {
#     'n_estimators': [100, 200],
#     'learning_rate': [0.1, 0.05, 0.01],
#     'max_depth': [3, 4, 5]
# }

# # Set up GridSearchCV
# grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Best model
# best_gbm = grid_search.best_estimator_

# # Predict and evaluate
# y_pred = best_gbm.predict(X_test)
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# print("Best Parameters:", grid_search.best_params_)
# print("Test Set RMSE:", rmse)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Base estimator for the search.
gbm = GradientBoostingRegressor(random_state=42)

# Discrete candidate values; the search samples 10 of the 27 combinations.
param_distributions = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 4, 5]
}

# Seeded randomized search, 5-fold CV, ranked by (negated) mean squared error.
# NOTE: this rebinds `random_search`, replacing the random-forest search object.
random_search = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=param_distributions,
    n_iter=10,  # Number of parameter settings sampled
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
    verbose=1,
    n_jobs=-1  # Use all CPU cores
)

# Run the search on the training split.
random_search.fit(X_train, y_train)

# Best candidate, refit on the whole training split.
best_gbm = random_search.best_estimator_

# Held-out RMSE. The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent scikit-learn releases.
y_pred = best_gbm.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Best Parameters:", random_search.best_params_)
print("Test Set RMSE:", rmse)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Final gradient-boosting model. The hyperparameters are hard-coded here rather than read
# from random_search.best_params_.
final_model = GradientBoostingRegressor(learning_rate=0.1, max_depth=5, n_estimators=200, random_state=42)

# Train on every row (no hold-out). The target is taken from df, the same frame `data` was
# encoded from and the train/test split used, so features and target are guaranteed to
# describe the same rows in the same order (previously the target came from df2).
X = data
y = np.log(df['actual_worth'])
final_model.fit(X, y)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# final_model was fit on all of `data`, which includes the X_test rows, so scoring it
# directly on X_test reports in-sample (optimistic) numbers. To get an honest estimate for
# this configuration, fit an identically parameterised model on the training split only
# and score that one on the held-out split.
holdout_model = GradientBoostingRegressor(**final_model.get_params())
holdout_model.fit(X_train, y_train)
y_pred = holdout_model.predict(X_test)

# Calculate R² (target is log(actual_worth), so this is on the log scale)
r2 = r2_score(y_test, y_pred)
print("R²:", r2)

# Calculate Mean Squared Error (log units)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

In [None]:
# NOTE(review): this cell repeats the preceding evaluation cell. final_model was fit on all
# of `data`, which includes the X_test rows, so the scores below are in-sample rather than
# held-out estimates.
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
y_pred = final_model.predict(X_test)

# Calculate R²
r2 = r2_score(y_test, y_pred)
print("R²:", r2)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

In [None]:
# Persist the model; the context manager closes the file even if pickling raises.
# NOTE(review): this saves best_model (the random-forest model that an earlier cell already
# writes to 'r_predictor.pkl'), not the final_model / best_gbm built in the cells just
# above; confirm which estimator this file is meant to hold.
with open('q_predictor.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [None]:
# Final preview of df.
df.head()