In [15]:
import pandas as pd
import os

file_paths = [
    "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Consumer Price Index\\Consumer_Price_Index_Annually.csv",
    "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Consumer Price Index\\Consumer_Price_Index_Monthly.csv",
    "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Consumer Price Index\\Consumer_Price_Index_Quarterly.csv"
]

output_dir = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Cleaned Files\\Consumer Price Index\\"
os.makedirs(output_dir, exist_ok=True)

def clean_and_check_cpi_file(file_path):
    df = pd.read_csv(file_path)
    
    indx_rows = df[df['UNIT_MEASURE'] == 'INDX']
    
    if not indx_rows.empty:
        print(f"Found {len(indx_rows)} rows with 'INDX' in {file_path}")
        
        df = df.drop(columns=['OBS_STATUS', 'UNIT_MULT'])
        
        df = df.rename(columns={
            'TIME_PERIOD': 'Date',
            'OBS_VALUE': 'CPI Value',
            'MEASURE': 'CPI Measure'
        })
        
        df.loc[indx_rows.index, 'CPI Value'] = df.loc[indx_rows.index, 'CPI Value'].pct_change() * 100
        
        df.loc[indx_rows.index, 'UNIT_MEASURE'] = 'PERCENT'
        
        cleaned_file_path = output_dir + file_path.split("\\")[-1]
        df.to_csv(cleaned_file_path, index=False)
        print(f"Successfully cleaned and saved: {cleaned_file_path}")
        
        original_df = pd.read_csv(file_path)
        
        comparison = pd.concat([original_df, df], axis=1, keys=['Original', 'Cleaned'])
        comparison_path = output_dir + "Comparison_" + file_path.split("\\")[-1]
        comparison.to_csv(comparison_path, index=False)
        print(f"Comparison file saved: {comparison_path}")
    else:
        print(f"No 'INDX' rows found in {file_path}")

for file_path in file_paths:
    clean_and_check_cpi_file(file_path)


Found 185 rows with 'INDX' in D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Datasets Dubai DC 2025\Datasets\Consumer Price Index\Consumer_Price_Index_Annually.csv
Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Consumer Price Index\Consumer_Price_Index_Annually.csv
Comparison file saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Consumer Price Index\Comparison_Consumer_Price_Index_Annually.csv
Found 2630 rows with 'INDX' in D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Datasets Dubai DC 2025\Datasets\Consumer Price Index\Consumer_Price_Index_Monthly.csv
Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Consumer Price Index\Consumer_Price_Index_Monthly.csv
Comparison file saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Consumer Price Index\Comparison_Consumer_Price_Index_Monthly.csv


In [19]:
import pandas as pd

file_path = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Currency Strength\\AED-USD.csv"
aed_usd_df = pd.read_csv(file_path)

aed_usd_df = aed_usd_df.drop(columns=['Unnamed: 0', 'Volume'])

aed_usd_df['Date'] = pd.to_datetime(aed_usd_df['Date'])

output_path = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Cleaned Files\\Currency Strength\\AED-USD_Cleaned.csv"
aed_usd_df.to_csv(output_path, index=False)

print(f"Successfully cleaned and saved: {output_path}")


Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Currency Strength\AED-USD_Cleaned.csv


In [53]:
import pandas as pd
import os

file_paths = [
    "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Gross Domestic Product\\GDP_Quarterly_Constant_Prices.csv",
    "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Gross Domestic Product\\GDP_Quarterly_Current_Prices.csv"
]

output_dir = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Cleaned Files\\Gross Domestic Product\\"
os.makedirs(output_dir, exist_ok=True)

def clean_and_check_gdp_file(file_path):
    df = pd.read_csv(file_path)
    
    milaed_rows = df[df['UNIT_MEASURE'] == 'MILAED']
    
    if not milaed_rows.empty:
        print(f"Found {len(milaed_rows)} rows with 'MILAED' in {file_path}")
        
        df = df.drop(columns=['OBS_STATUS', 'UNIT_MULT', 'OBS_COMMENT'])
        
        df = df.rename(columns={
            'TIME_PERIOD': 'Year',
            'OBS_VALUE': 'GDP Value',
            'MEASURE': 'GDP Category'
        })
        
        df.loc[milaed_rows.index, 'GDP Value'] = df.loc[milaed_rows.index, 'GDP Value'].pct_change() * 100
        
        df.loc[milaed_rows.index, 'UNIT_MEASURE'] = 'PERCENT'
        
        cleaned_file_name = file_path.split("\\")[-1].replace(".csv", "_cleaned.csv")
        cleaned_file_path = os.path.join(output_dir, cleaned_file_name)
        
        df.to_csv(cleaned_file_path, index=False)
        print(f"Successfully cleaned and saved: {cleaned_file_path}")
        
        original_df = pd.read_csv(file_path)
        
        comparison = pd.concat([original_df, df], axis=1, keys=['Original', 'Cleaned'])
        comparison_path = os.path.join(output_dir, "Comparison_" + cleaned_file_name)
        comparison.to_csv(comparison_path, index=False)
        print(f"Comparison file saved: {comparison_path}")
    else:
        print(f"No 'MILAED' rows found in {file_path}")

for file_path in file_paths:
    clean_and_check_gdp_file(file_path)


Found 1000 rows with 'MILAED' in D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Datasets Dubai DC 2025\Datasets\Gross Domestic Product\GDP_Quarterly_Constant_Prices.csv
Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Gross Domestic Product\GDP_Quarterly_Constant_Prices_cleaned.csv
Comparison file saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Gross Domestic Product\Comparison_GDP_Quarterly_Constant_Prices_cleaned.csv
Found 1000 rows with 'MILAED' in D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Datasets Dubai DC 2025\Datasets\Gross Domestic Product\GDP_Quarterly_Current_Prices.csv
Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Gross Domestic Product\GDP_Quarterly_Current_Prices_cleaned.csv
Comparison file saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Gross Domestic Product\Com

In [55]:
import pandas as pd
import os

file_paths = [
    "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Population\\Population_Estimates_and_Growth_by_Gender.csv",
    "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Population\\Population_Indicators.csv"
]

output_dir = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Cleaned Files\\Population\\"
os.makedirs(output_dir, exist_ok=True)

def clean_population_file(file_path):
    df = pd.read_csv(file_path)
    
    if 'DECIMALS' in df.columns:
        df = df.drop(columns=['DECIMALS'])
    
    df = df.rename(columns={
        'TIME_PERIOD': 'Year',
        'OBS_VALUE': 'Value',
        'POP_IND': 'Population Indicator',
        'UNIT_MEASURE': 'Unit'
    })
    
    cleaned_file_name = file_path.split("\\")[-1].replace(".csv", "_cleaned.csv")
    cleaned_file_path = os.path.join(output_dir, cleaned_file_name)
    
    df.to_csv(cleaned_file_path, index=False)
    print(f"Successfully cleaned and saved: {cleaned_file_path}")

for file_path in file_paths:
    clean_population_file(file_path)


Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Population\Population_Estimates_and_Growth_by_Gender_cleaned.csv
Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Population\Population_Indicators_cleaned.csv


In [61]:
import pandas as pd
import os

rents_path = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Rents & Transactions\\rents.csv"
transactions_path = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Rents & Transactions\\transactions.csv"

output_dir = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Cleaned Files\\Rents & Transactions\\"
os.makedirs(output_dir, exist_ok=True)

def clean_file(file_path, file_name, output_dir, delimiter=';'):
    df = pd.read_csv(file_path, delimiter=delimiter, low_memory=False)
    
    cleaned_file_path = os.path.join(output_dir, f"{file_name}_cleaned.csv")
    df.to_csv(cleaned_file_path, index=False)
    print(f"Successfully cleaned and saved: {cleaned_file_path}")

clean_file(rents_path, "rents", output_dir)

clean_file(transactions_path, "transactions", output_dir)


Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Rents & Transactions\rents_cleaned.csv
Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Rents & Transactions\transactions_cleaned.csv


In [75]:
import pandas as pd

rents_path = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Cleaned Files\\Rents & Transactions\\rents_cleaned.csv"
transactions_path = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Cleaned Files\\Rents & Transactions\\transactions_cleaned.csv"

def remove_master_project_rents(file_path):
    df = pd.read_csv(file_path, low_memory=False)
    
    if 'Master Project' in df.columns:
        df = df.drop(columns=['Master Project'])
    
    df.to_csv(file_path, index=False)
    print(f"Successfully cleaned and overwritten: {file_path}")

def remove_master_project_transactions(file_path):
    df = pd.read_csv(file_path, low_memory=False)
    
    if 'Master Project' in df.columns:
        df = df.drop(columns=['Master Project'])
    
    df.to_csv(file_path, index=False)
    print(f"Successfully cleaned and overwritten: {file_path}")

remove_master_project_rents(rents_path)
remove_master_project_transactions(transactions_path)


Successfully cleaned and overwritten: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Rents & Transactions\rents_cleaned.csv
Successfully cleaned and overwritten: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Rents & Transactions\transactions_cleaned.csv


In [83]:
import pandas as pd
import os

guests_path = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Tourism\\Guests_by_Hotel_Type_by_Region.csv"
hotel_rooms_path = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Tourism\\Hotel_Establishments_and_Rooms_by_Rating_Type.csv"
hotel_indicators_path = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Datasets Dubai DC 2025\\Datasets\\Tourism\\Hotel_Establishments_Main_Indicators.csv"

output_dir = "D:\\Desights.ai\\Dubai Real Estate Price Prediction Challenge\\Cleaned Files\\Tourism\\"
os.makedirs(output_dir, exist_ok=True)

def remove_empty_columns(file_path, output_file_name):
    df = pd.read_csv(file_path, low_memory=False)
    
    df = df.dropna(axis=1, how='all')
    
    cleaned_file_path = os.path.join(output_dir, output_file_name)
    df.to_csv(cleaned_file_path, index=False)
    print(f"Successfully cleaned and saved: {cleaned_file_path}")

remove_empty_columns(guests_path, "Guests_by_Hotel_Type_by_Region_cleaned.csv")
remove_empty_columns(hotel_rooms_path, "Hotel_Establishments_and_Rooms_by_Rating_Type_cleaned.csv")
remove_empty_columns(hotel_indicators_path, "Hotel_Establishments_Main_Indicators_cleaned.csv")


Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Tourism\Guests_by_Hotel_Type_by_Region_cleaned.csv
Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Tourism\Hotel_Establishments_and_Rooms_by_Rating_Type_cleaned.csv
Successfully cleaned and saved: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Tourism\Hotel_Establishments_Main_Indicators_cleaned.csv


In [89]:
import pandas as pd
import os

metadata_path = r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Datasets Dubai DC 2025\Datasets\World Development Indicators\Metadata_World_Development_Indicator.csv"
wdi_path = r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Datasets Dubai DC 2025\Datasets\World Development Indicators\World_Development_Indicator.csv"

output_dir = r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\World Development Indicators"
os.makedirs(output_dir, exist_ok=True)

def clean_metadata(file_path, output_dir):
    df = pd.read_csv(file_path)
    
    if 'Unnamed: 4' in df.columns:
        df = df.drop(columns=['Unnamed: 4'])
    
    cleaned_file_path = os.path.join(output_dir, "Metadata_World_Development_Indicator_cleaned.csv")
    df.to_csv(cleaned_file_path, index=False)
    print(f"Metadata cleaned and saved at: {cleaned_file_path}")

def clean_wdi(file_path, output_dir):
    df = pd.read_csv(file_path, delimiter=',', skiprows=4)
    
    df_long = df.melt(id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
                      var_name="Year",
                      value_name="Value")
    
    df_long = df_long.dropna(subset=["Value"])
    
    df_long['Year'] = df_long['Year'].astype(int)
    
    cleaned_file_path = os.path.join(output_dir, "World_Development_Indicator_cleaned.csv")
    df_long.to_csv(cleaned_file_path, index=False)
    print(f"WDI cleaned and saved at: {cleaned_file_path}")

clean_metadata(metadata_path, output_dir)
clean_wdi(wdi_path, output_dir)


Metadata cleaned and saved at: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\World Development Indicators\Metadata_World_Development_Indicator_cleaned.csv
WDI cleaned and saved at: D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\World Development Indicators\World_Development_Indicator_cleaned.csv


# Data Cleaning and Preprocessing

During this stage of the challenge, we performed a comprehensive data cleaning and preprocessing workflow across multiple datasets covering various domains such as consumer price index, currency strength, gross domestic product, population, rents and transactions, tourism, and world development indicators. Below is a detailed explanation of the key actions taken:

## 1. Consumer Price Index (CPI)
- Columns with redundant or irrelevant information were removed, such as `OBS_STATUS` and `UNIT_MULT`, which did not add value to the analysis.
- Columns were renamed to more intuitive names (e.g., `TIME_PERIOD` → `Date`, `OBS_VALUE` → `CPI Value`) for better clarity.
- For datasets with `INDX` as a unit measure, index values were converted to percentages to ensure consistency with other datasets where values were already in percentages.

## 2. Currency Strength
- The column `Volume`, which was deemed irrelevant for the analysis, was removed.
- The `Date` column was converted to datetime format to enable proper time-series analysis.
- Missing values were checked, and it was confirmed that the remaining data was complete and ready for further analysis.

## 3. Gross Domestic Product (GDP)
- The datasets contained values in both millions of AED (`MILAED`) and percentages.
  - Values expressed in millions of AED were converted to percentages, ensuring consistency across all rows.
- Redundant columns such as `OBS_STATUS` and `OBS_COMMENT` were removed.
- The `TIME_PERIOD` column was retained as is, representing quarterly data without further transformation.

## 4. Population Data
- Two datasets were processed: `Population Estimates and Growth by Gender` and `Population Indicators`.
- Columns with 100% missing values, such as `DECIMALS`, were removed.
- All columns were renamed for clarity, and unnecessary metadata columns were dropped.
- The datasets were verified to ensure consistent row counts before and after cleaning.

## 5. Rents and Transactions

The following steps were performed to clean and process the `rents.csv` and `transactions.csv` datasets:

1. **Delimiter Separation:**
   - The original files had all the data stored in a single column with `;` as the delimiter.
   - We used a delimiter-based approach to split the data into multiple columns correctly and saved the cleaned files as `rents_cleaned.csv` and `transactions_cleaned.csv`.

2. **Removing Redundant Columns:**
   - In both datasets, the column **`Master Project`** was identified as redundant, with a high proportion of missing values.
   - We removed the `Master Project` column from both files to improve data quality and reduce noise.

3. **Overwriting the Cleaned Files:**
   - The cleaned files were saved by overwriting the previously cleaned versions (`rents_cleaned.csv` and `transactions_cleaned.csv`) to maintain a consistent directory structure and naming convention.

These steps ensured that the datasets were properly structured, with only relevant information retained, ready for further analysis.


## 6. Tourism Data
- Three datasets were cleaned: `Guests by Hotel Type by Region`, `Hotel Establishments and Rooms by Rating Type`, and `Hotel Establishments Main Indicators`.
- Columns with 100% missing values (`DECIMALS`, `OBS_STATUS`, `OBS_COMMENT`, `UNIT_MULT`) were removed from all datasets.
- The datasets were checked for duplicate rows, and no duplicates were found.
- Columns with a single unique value (e.g., `DATAFLOW`, `REF_AREA`, `FREQ`) were retained for metadata purposes but noted as potential candidates for exclusion in further analysis.

## 7. World Development Indicators
- **Metadata:**
  - The `Unnamed: 4` column, which contained 100% missing values, was removed.
  - The cleaned metadata file retained all relevant information about each indicator, such as `INDICATOR_CODE`, `INDICATOR_NAME`, `SOURCE_NOTE`, and `SOURCE_ORGANIZATION`.
- **World Development Indicator Data:**
  - The dataset was originally in a wide format with multiple years as columns. It was transformed into a long format using the `melt` function, resulting in two new columns:
    - `Year` (representing each year as a row).
    - `Value` (representing the corresponding indicator value for that year).
  - Missing values in the `Value` column were removed.
  - The final cleaned file was saved, ensuring that all relevant data was retained in a structured, analyzable format.

## General Verification and Integrity Checks
- After cleaning, each dataset was compared with its original version to ensure that:
  - **Only intended columns were removed** (i.e., columns with 100% missing values or irrelevant information).
  - **Row counts remained consistent**, confirming that no data was lost during the cleaning process.

In [5]:
import pandas as pd

files = {
    "Annual": r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Consumer Price Index\Consumer_Price_Index_Annually.csv",
    "Monthly": r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Consumer Price Index\Consumer_Price_Index_Monthly.csv",
    "Quarterly": r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Consumer Price Index\Consumer_Price_Index_Quarterly.csv"
}

annual_df = pd.read_csv(files["Annual"])
monthly_df = pd.read_csv(files["Monthly"])
quarterly_df = pd.read_csv(files["Quarterly"])

annual_df['Date'] = pd.to_datetime(annual_df['Date'].astype(str) + '-01-01')

monthly_df['Date'] = pd.to_datetime(monthly_df['Date'], format='%Y-%m')

quarter_mapping = {'Q1': '-01-01', 'Q2': '-04-01', 'Q3': '-07-01', 'Q4': '-10-01'}
quarterly_df['Date'] = pd.to_datetime(quarterly_df['Date'].str.replace(r'(Q[1-4])', lambda x: quarter_mapping[x.group()], regex=True))

annual_df['CPI Value'].fillna(annual_df['CPI Value'].mean(), inplace=True)
monthly_df['CPI Value'].fillna(monthly_df['CPI Value'].mean(), inplace=True)

annual_df.to_csv(files["Annual"], index=False)
monthly_df.to_csv(files["Monthly"], index=False)
quarterly_df.to_csv(files["Quarterly"], index=False)

print("Modifications completed successfully.")


Modifications completed successfully.


In [13]:
import pandas as pd

file_path = r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Currency Strength\AED-USD_Cleaned.csv"
df = pd.read_csv(file_path)

df['Date'] = pd.to_datetime(df['Date'])

df['Return'].fillna(df['Return'].mean(), inplace=True)

print("Basic Info After Modifications:")
print(df.info())

print("\nRemaining Missing Values:")
print(df.isnull().sum())

df.to_csv(file_path, index=False)

print("\nModifications completed successfully.")


Basic Info After Modifications:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5468 entries, 0 to 5467
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Open       5468 non-null   float64       
 1   High       5468 non-null   float64       
 2   Low        5468 non-null   float64       
 3   Close      5468 non-null   float64       
 4   Adj Close  5468 non-null   float64       
 5   Date       5468 non-null   datetime64[ns]
 6   Return     5468 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 299.2 KB
None

Remaining Missing Values:
Open         0
High         0
Low          0
Close        0
Adj Close    0
Date         0
Return       0
dtype: int64

Modifications completed successfully.


In [17]:
import pandas as pd

files = {
    "GDP Quarterly Current Prices": r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Gross Domestic Product\GDP_Quarterly_Current_Prices_cleaned.csv",
    "GDP Quarterly Constant Prices": r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Gross Domestic Product\GDP_Quarterly_Constant_Prices_cleaned.csv"
}

def modify_gdp_file(file_path):
    df = pd.read_csv(file_path)
    
    df['Date'] = pd.to_datetime(df['Year'].astype(str) + '-' + df['QUARTER'])
    
    df['GDP Value'].fillna(df['GDP Value'].mean(), inplace=True)
    
    print(f"Basic Info After Modifications for {file_path}:")
    print(df.info())
    
    df.to_csv(file_path, index=False)
    print(f"Modifications completed and saved for {file_path}.\n")

for name, path in files.items():
    modify_gdp_file(path)


Basic Info After Modifications for D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Gross Domestic Product\GDP_Quarterly_Current_Prices_cleaned.csv:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1920 entries, 0 to 1919
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATAFLOW       1920 non-null   object        
 1   REF_AREA       1920 non-null   object        
 2   FREQ           1920 non-null   object        
 3   UNIT_MEASURE   1920 non-null   object        
 4   SOURCE_DETAIL  1920 non-null   object        
 5   GDP Category   1920 non-null   object        
 6   QUARTER        1920 non-null   object        
 7   QGDP_SYS       1920 non-null   object        
 8   QGDP_UNIT      1920 non-null   object        
 9   Year           1920 non-null   int64         
 10  GDP Value      1920 non-null   float64       
 11  DECIMALS       1920 non-null   int64         
 12  

In [21]:
import pandas as pd

files = {
    "Population Estimates and Growth": r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Population\Population_Estimates_and_Growth_by_Gender_cleaned.csv",
    "Population Indicators": r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Population\Population_Indicators_cleaned.csv"
}

def convert_year_to_date(file_path):
    df = pd.read_csv(file_path)
    
    df['Date'] = pd.to_datetime(df['Year'].astype(str) + '-01-01')
    
    df.to_csv(file_path, index=False)
    print(f"Date conversion completed and saved for {file_path}.\n")
for name, path in files.items():
    convert_year_to_date(path)


Date conversion completed and saved for D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Population\Population_Estimates_and_Growth_by_Gender_cleaned.csv.

Date conversion completed and saved for D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Population\Population_Indicators_cleaned.csv.



In [33]:
import pandas as pd

rents_path = r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Rents & Transactions\rents_cleaned.csv"
transactions_path = r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Rents & Transactions\transactions_cleaned.csv"

rents_df = pd.read_csv(rents_path)

rents_df['Registration Date'] = pd.to_datetime(rents_df['Registration Date'], errors='coerce').dt.date
rents_df['Start Date'] = pd.to_datetime(rents_df['Start Date'], errors='coerce').dt.date
rents_df['End Date'] = pd.to_datetime(rents_df['End Date'], errors='coerce').dt.date

rents_df.to_csv(rents_path, index=False)

transactions_df = pd.read_csv(transactions_path)

transactions_df['Transaction Date'] = pd.to_datetime(transactions_df['Transaction Date'], errors='coerce').dt.date

transactions_df.to_csv(transactions_path, index=False)

print("Date columns converted to date only successfully.")


  transactions_df = pd.read_csv(transactions_path)


Date columns converted to date only successfully.


In [37]:
import pandas as pd

file_paths = [
    r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Tourism\Guests_by_Hotel_Type_by_Region_cleaned.csv",
    r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Tourism\Hotel_Establishments_and_Rooms_by_Rating_Type_cleaned.csv",
    r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\Tourism\Hotel_Establishments_Main_Indicators_cleaned.csv"
]

for file_path in file_paths:
    df = pd.read_csv(file_path)
    
    df['TIME_PERIOD'] = pd.to_datetime(df['TIME_PERIOD'], format='%Y')
    
    df.to_csv(file_path, index=False)
    
print("TIME_PERIOD conversion completed for all files.")


TIME_PERIOD conversion completed for all files.


In [45]:
import pandas as pd

file_paths = [
    r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\World Development Indicators\World_Development_Indicator_cleaned.csv",
    r"D:\Desights.ai\Dubai Real Estate Price Prediction Challenge\Cleaned Files\World Development Indicators\Metadata_World_Development_Indicator_cleaned.csv"
]

for file_path in file_paths:
    df = pd.read_csv(file_path)
    
    if 'Year' in df.columns:
        df['Year'] = pd.to_datetime(df['Year'], format='%Y')  # Convert to datetime (YYYY-01-01)
    
    df.to_csv(file_path, index=False)

print("Year column formatted to full datetime and saved in the same files.")


Year column formatted to full datetime and saved in the same files.


# Data Cleaning and Preprocessing – Continued

---

Building upon the initial data cleaning and preprocessing phase, we focused on standardizing and preparing additional datasets across key domains, including **Tourism**, **World Development Indicators**, **Population**, **Rents and Transactions**, and **GDP**. The main goal was to ensure consistent time formatting, appropriate data types, and structured datasets ready for analysis and modeling.

---

## Key Actions Taken

### 1. Date Standardization
- The `Year` columns across all relevant datasets were converted to a consistent **datetime format** (`YYYY-MM-DD`), ensuring uniformity in time-based analysis.
- Files where this change was applied:
  - **Tourism Data**:
    - `Guests by Hotel Type by Region`
    - `Hotel Establishments and Rooms by Rating Type`
    - `Hotel Establishments Main Indicators`
  - **World Development Indicators**
  - **Population Data**
  - **GDP Data`

### 2. Data Type Consistency
- In the **Rents** and **Transactions** datasets, key columns such as `Contract Amount`, `Annual Amount`, `Property Size`, `Room(s)`, and `Parking` were correctly converted to numerical types for proper aggregation and statistical operations.
- The **World Development Indicator Data** was also reviewed to ensure that numerical columns were appropriately typed.

### 3. Handling Missing Values
- While no further missing values were imputed during this phase, columns with a significant number of missing values were flagged for future imputation or exclusion, depending on their relevance to the analysis.
- For example, the `DECIMALS` column in **Tourism Data** was retained but noted for possible exclusion during analysis due to its low information value.

### 4. File Overwriting
- All cleaned datasets were saved by overwriting their previous versions to maintain a clear and consistent file structure.
- This approach ensured that the directory contains only up-to-date, cleaned files ready for analysis.

---

## Outcome
Through these additional cleaning steps, we achieved:
- **Time consistency across datasets**, enabling seamless merging and time-series analysis.
- **Improved data integrity** by ensuring correct data types and handling missing values where necessary.
- **Readiness for EDA and modeling**, with datasets now in a structured and analyzable format.
