In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import json
def drop_ar_column(df):
    columns_to_drop = [col for col in df.columns if col.endswith('_ar')]
    df.drop(columns=columns_to_drop, inplace=True)
    return df

def get_column_types(df):
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    
    return categorical_columns, numerical_columns


def encode_categorical_variables(df):
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    encoder = OrdinalEncoder()

    encoded_df = df.copy()
    encoded_df[categorical_columns] = encoder.fit_transform(df[categorical_columns])

    return encoded_df, encoder
def convert_date_column(df, columns_to_convert):
    for col in columns_to_convert:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
        else:
            print(f"Warning: Column '{col}' not found in DataFrame.")
    return df

from sklearn.impute import SimpleImputer

def handle_missing_data(df):
    # Identify numeric columns that have missing values
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    numeric_columns = [col for col in numeric_columns if df[col].isnull().any() and not col.endswith('_id')]

    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    categorical_columns = [col for col in categorical_columns if df[col].isnull().any() and not col.endswith('_id')]

    numeric_imputer = SimpleImputer(strategy='median')
    if numeric_columns:
        df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    if categorical_columns:
        df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns]).astype('object')
    return df


def feature_engineering(df):
    df['room_density'] = df['property_size'] / df['rooms']
    if 'contract_start_date' in df.columns and 'contract_end_date' in df.columns:
        df['total_agreement_period'] = (df['contract_end_date'] - df['contract_start_date']).dt.days
    df.drop(columns=['contract_start_date', 'contract_end_date','property_size', 'rooms'], inplace=True)   
    return df

def scale_numerical_features(df):
    numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    scaler = StandardScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    return df

def preprocess_data(df, columns_to_convert):
    print("Dropping arabic column...")
    df = drop_ar_column(df)
    print("converting date column to date formate...")
    df = convert_date_column(df, columns_to_convert)
    print("Handling missing values...")
    df = handle_missing_data(df)
    print("Encoding catagorical values...")
    df = encode_categorical_variables(df)
    print("Scaling....")
    df = scale_numerical_features(df)
    print("Performing feature engineering...")
    df = feature_engineering(df)
    print("data processing is done moving to next stage...")
    return df

In [4]:
import json
config = json.load(open("./../config/config.json"))
path = config['path_for_rent_data']
columns_to_convert = ['registration_date', 'contract_start_date', 'contract_end_date','req_from','req_to','meta_ts']
df = pd.read_csv(path)
df = df[:1000]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
df.head()

Unnamed: 0,ejari_contract_number,registration_date,contract_start_date,contract_end_date,version_number,version_text,contract_amount,annual_amount,is_freehold,is_freehold_text,...,nearest_mall_en,nearest_mall_ar,master_project_en,master_project_ar,ejari_property_type_id,ejari_property_sub_type_id,req_from,req_to,entry_id,meta_ts
0,120130625001365,2024-01-24 11:14:33,2024-01-01,2024-12-31,12,Renewed,58000.0,58000.0,f,Non Free Hold,...,City Centre Mirdif,سيتي سنتر مردف,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
1,120130626005726,2024-01-09 16:48:47,2024-03-18,2025-03-17,12,Renewed,88000.0,88000.0,t,Free Hold,...,City Centre Mirdif,سيتي سنتر مردف,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
2,120130626007829,2024-01-16 11:39:26,2024-01-01,2024-12-31,12,Renewed,94000.0,94000.0,f,Non Free Hold,...,Dubai Mall,مول دبي,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
3,120130627008661,2024-01-15 18:28:05,2024-01-01,2024-12-31,12,Renewed,66000.0,66000.0,f,Non Free Hold,...,Mall of the Emirates,مول الإمارات,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
4,120130627009439,2024-01-22 13:34:29,2024-01-01,2024-12-31,12,Renewed,31200.0,31200.0,f,Non Free Hold,...,City Centre Mirdif,سيتي سنتر مردف,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859


In [6]:
print("Dropping arabic column...")
df = drop_ar_column(df)
print("converting date column to date formate...")
df = convert_date_column(df, columns_to_convert)

Dropping arabic column...
converting date column to date formate...


In [7]:
print("Handling missing values...")
df = handle_missing_data(df)
print("Encoding catagorical values...")
df = encode_categorical_variables(df)
print("Scaling....")
df = scale_numerical_features(df)
print("Performing feature engineering...")
df = feature_engineering(df)

Handling missing values...


ValueError: Columns must be same length as key

In [8]:
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_columns = [col for col in numeric_columns if df[col].isnull().any() and not col.endswith('_id')]

categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_columns = [col for col in categorical_columns if df[col].isnull().any() and not col.endswith('_id')]

numeric_imputer = SimpleImputer(strategy='median')
if numeric_columns:
    df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])

In [13]:
handle_missing_data(df)

KeyError: 0

In [11]:
df[categorical_columns]

Unnamed: 0,property_subtype_en,property_usage_en,project_name_en,nearest_landmark_en,nearest_metro_en,nearest_mall_en,master_project_en
0,Flat,Residential,,Dubai International Airport,Al Nahda Metro Station,City Centre Mirdif,
1,Flat,Residential,,Dubai International Airport,Rashidiya Metro Station,City Centre Mirdif,
2,Shop,Commercial,,Dubai International Airport,Salah Al Din Metro Station,Dubai Mall,
3,Flat,Residential,,Burj Al Arab,Sharaf Dg Metro Station,Mall of the Emirates,
4,Flat,Residential,,Dubai International Airport,Airport Free Zone,City Centre Mirdif,
...,...,...,...,...,...,...,...
995,Warehouse,Commercial,,Burj Al Arab,First Abu Dhabi Bank Metro Station,Mall of the Emirates,
996,Complex Villas,Residential,,Burj Al Arab,First Abu Dhabi Bank Metro Station,Mall of the Emirates,
997,Shop,Commercial,,Burj Al Arab,Jumeirah Beach Residency,Marina Mall,
998,Office,Commercial,,Dubai International Airport,STADIUM Metro Station,City Centre Mirdif,


In [9]:
categorical_imputer = SimpleImputer(strategy='most_frequent')
if categorical_columns:
    df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns]).astype('object')


ValueError: Columns must be same length as key