## Script for cleaning the dataset

Good practice in data analytics is to store a dataset like this in long (tidy) format, so this notebook unpivots (melts) the wide table: the per-year columns are turned into rows.

The final dataset will contain 3 columns: `region`, `year`, and `total_investment`.

In [None]:
## importing libraries

import pandas as pd
import numpy as np
from openpyxl import load_workbook

## Load the raw workbook

# Read the spreadsheet from the working directory into a DataFrame, then
# confirm the load and preview the first rows in a single print call.
data = pd.read_excel(io='data.xlsx')
print("Data imported successfully", data.head(), sep="\n")


In [None]:
## Reshape the dataset from wide to long format so it is easier to work with.

# Build the cleaned frame in one chain, always starting from the raw `data`,
# so re-running this cell gives the same result.
data_cleaned = (
    data
    # Unpivot: every column other than 'Region' is melted into rows; the old
    # column header goes to 'year' and the cell value to 'total_investment'.
    .melt(id_vars=['Region'], var_name='year', value_name='total_investment')
    # Lower-case 'Region' so all column names follow one convention.
    .rename(columns={'Region': 'region'})
    # The former column headers may have been read as strings; cast to int.
    .astype({'year': int})
    # Sort by region and year for readability, then renumber the rows.
    .sort_values(['region', 'year'])
    .reset_index(drop=True)
)

# Display the transformed dataset
print("Transformed dataset shape:", data_cleaned.shape)
print("\nFirst few rows:")
print(data_cleaned.head(10))
print("\nDataset info:")
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() -- that would emit a stray "None" line.
data_cleaned.info()


In [None]:
## Saving the cleaned data

# Save as CSV (most common format, easy to share and use in other tools).
# index=False leaves the row index out of the file.
data_cleaned.to_csv('data_cleaned.csv', index=False)
# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252);
# restored to the intended character.
print("✓ Data saved as 'data_cleaned.csv'")

# Display confirmation: row/column counts and the column names that were written
print(f"\nSaved {len(data_cleaned)} rows and {len(data_cleaned.columns)} columns")
print(f"Columns: {', '.join(data_cleaned.columns.tolist())}")