# Data Cleaning Notebook
This notebook cleans the raw global temperature anomaly datasets (global monthly/seasonal, Northern Hemisphere, Southern Hemisphere, and zonal annual) and saves the cleaned copies as new CSV files.

In [1]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np

### Step 2: Load the Datasets
Load all the raw data files that need cleaning.

In [2]:
# Read the four raw CSV files into DataFrames, one per dataset
global_monthly, northern_hemisphere, southern_hemisphere, zone_annual = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/Globalmonthlyandseasonal.csv",
        "../data/raw/NorthernHemisphere.csv",
        "../data/raw/SouthernHemisphere.csv",
        "../data/raw/ZoneAnnual.csv",
    )
)

### Step 3: Handle Missing Values
Replace the `***` placeholder used for missing values with `NaN`.

In [3]:
# Swap the '***' placeholder for NaN in every loaded table.
# `datasets` holds references to the same DataFrame objects, so the in-place
# replacement is visible through the individual variable names as well.
datasets = [
    global_monthly,
    northern_hemisphere,
    southern_hemisphere,
    zone_annual,
]
for frame in datasets:
    frame.replace(to_replace="***", value=np.nan, inplace=True)

### Step 4: Ensure Correct Data Types
Ensure the `Year` column is of integer type and check the data types of other columns.

In [4]:
# Cast each table's 'Year' column to an integer dtype
for frame in datasets:
    year_as_int = frame["Year"].astype(int)
    frame["Year"] = year_as_int

# Summarise the column dtypes of the global monthly table
global_monthly.info()

### Step 5: Handle Duplicates
Remove any duplicate rows in the datasets.

In [5]:
# Remove repeated rows from each table in place, keeping the first occurrence
for frame in datasets:
    frame.drop_duplicates(keep="first", inplace=True)

### Step 6: Validate Year Ranges
Remove rows where the `Year` value is outside the expected range (1880-2023).

In [6]:
# Keep only rows where 'Year' is between 1880 and 2023 (both ends inclusive).
# Assigning the filtered frame back to the loop variable would only rebind
# that name and leave the DataFrames referenced by `datasets` (and by
# global_monthly, northern_hemisphere, ...) unfiltered, so the offending rows
# are dropped in place instead.
for dataset in datasets:
    out_of_range = ~dataset['Year'].between(1880, 2023)
    # Drop by index label; rows whose 'Year' is missing are removed as well,
    # exactly as a boolean-mask filter would do.
    dataset.drop(index=dataset.index[out_of_range], inplace=True)

### Step 7: Fill Missing Values
For missing temperature data, we can fill in the gaps using the column means.

In [7]:
# Fill missing values with column means.
# Columns that contained the '***' placeholder were read as text, and replacing
# the placeholder with NaN does not change their dtype. A plain
# `dataset.mean()` therefore either raises (recent pandas) or skips exactly the
# columns that need filling, so those columns are made numeric first.
for dataset in datasets:
    for column in dataset.columns:
        if pd.api.types.is_numeric_dtype(dataset[column]):
            continue
        as_numeric = pd.to_numeric(dataset[column], errors="coerce")
        # Convert only when every non-missing entry parsed as a number, so a
        # genuinely textual column is left untouched rather than blanked out.
        if as_numeric.notna().sum() == dataset[column].notna().sum():
            dataset[column] = as_numeric
    # numeric_only=True keeps any remaining non-numeric column out of the mean.
    dataset.fillna(dataset.mean(numeric_only=True), inplace=True)

### Step 8: Save Cleaned Datasets
Save the cleaned datasets to new CSV files.

In [8]:
# Write each cleaned table to its output CSV, leaving the row index out
cleaned_outputs = [
    ("../data/cleaned/cleaned_globalmonthlyandseasonal.csv", global_monthly),
    ("../data/cleaned/cleaned_northernhemisphere.csv", northern_hemisphere),
    ("../data/cleaned/cleaned_southernhemisphere.csv", southern_hemisphere),
    ("../data/cleaned/cleaned_zoneannual.csv", zone_annual),
]
for csv_path, frame in cleaned_outputs:
    frame.to_csv(csv_path, index=False)