In [1]:
import pandas as pd
import numpy as np


In [2]:

# Read the raw dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer='data.csv')  # Update path to your file

# --------------------------
# 1. Initial Data Inspection
# --------------------------
# Show the raw (rows, columns) dimensions, then preview the first five records.
print("Initial Data Shape:", df.shape)
print("\nFirst 5 rows:", df.head(5), sep="\n")

# --------------------------
# 2. Null Value Handling
# --------------------------
# Per-column count of missing entries in the raw data.
missing_before = df.isna().sum()
print("\nMissing Values Before Cleaning:", missing_before, sep="\n")

# Discard every row that has at least one missing value (alternative: impute).
# `df` itself is left untouched so the raw frame stays available for the report.
df_cleaned = df.dropna(axis=0, how='any')

# --------------------------
# 3. Duplicate Handling
# --------------------------
# Flag rows that repeat an earlier row exactly (the first occurrence is not flagged).
is_repeat = df_cleaned.duplicated()
duplicates = is_repeat.sum()
print(f"\nFound {duplicates} duplicate rows")

# Keep only the unflagged rows, i.e. the first occurrence of each record.
df_cleaned = df_cleaned[~is_repeat]

# --------------------------
# 4. Range Validation
# --------------------------
# Inclusive (min, max) bounds for each numerical column. A row is dropped if
# any of these columns holds a value outside its bounds.
range_checks = {
    'Temperature': (-50, 60),
    'Humidity': (0, 100),
    'Wind Speed': (0, 200),
    'Precipitation (%)': (0, 100),
    'Atmospheric Pressure': (870, 1085),
    'UV Index': (0, 11),
    'Visibility (km)': (0, 20)
}

# Fail up front with a readable message if an expected column is absent,
# rather than with a bare KeyError halfway through the printed report.
missing_cols = [col for col in range_checks if col not in df_cleaned.columns]
if missing_cols:
    raise KeyError(f"Range-checked columns missing from dataset: {missing_cols}")

# Single pass: report out-of-range values per column and accumulate one
# combined keep-mask. Every column is measured against the same pre-filter
# frame, so the per-column counts do not depend on dictionary order.
outliers_report = {}
in_range = pd.Series(True, index=df_cleaned.index, dtype=bool)
for col, (min_val, max_val) in range_checks.items():
    # `between` is inclusive on both ends. Rows with nulls were dropped in the
    # null-handling step, so `~within` is exactly the out-of-range rows.
    within = df_cleaned[col].between(min_val, max_val)
    outliers = df_cleaned[~within]
    outlier_count = outliers.shape[0]
    outliers_report[col] = outlier_count
    print(f"\n{col} outliers ({min_val}-{max_val}): {outlier_count} rows")
    if outlier_count > 0:
        # Fixed seed so the sampled rows are identical on every re-run.
        sample_rows = outliers.sample(min(3, outlier_count), random_state=42)
        print(f"Sample outliers:\n{sample_rows[col]}")
    in_range = in_range & within

# Filter out rows with out-of-range values (all columns must be within bounds)
df_cleaned = df_cleaned[in_range]

# --------------------------
# 5. Categorical Value Checks
# --------------------------
# Cloud Cover: list the distinct labels for a manual sanity check.
cloud_cover_labels = df_cleaned['Cloud Cover'].unique()
print("\nCloud Cover unique values:", cloud_cover_labels)

# Weather Type: keep only rows whose label is one of the accepted categories.
valid_weather_types = ['Rainy', 'Cloudy', 'Sunny', 'Snowy']  # Update as per actual categories
has_valid_weather = df_cleaned['Weather Type'].isin(valid_weather_types)
invalid_weather = df_cleaned[~has_valid_weather]
print(f"\nInvalid Weather Types: {len(invalid_weather)} rows")
df_cleaned = df_cleaned[has_valid_weather]

# --------------------------
# 6. Final Data Summary
# --------------------------
# Compare the raw row count with what survived all cleaning steps.
rows_before, rows_after = len(df), len(df_cleaned)
print(
    "\nCleaning Report:",
    f"- Original rows: {rows_before}",
    f"- Remaining rows: {rows_after}",
    f"- Rows removed: {rows_before - rows_after}",
    "\nMissing Values After Cleaning:",
    sep="\n",
)

# Per-column null counts of the cleaned frame.
print(df_cleaned.isna().sum())

# --------------------------
# 7. Save Cleaned Data
# --------------------------
# Write the cleaned frame without the index column, then confirm the destination.
output_path = 'cleaned_weather_data.csv'
df_cleaned.to_csv(output_path, index=False)
print(f"\nCleaned data saved to {output_path}")

Initial Data Shape: (13200, 11)

First 5 rows:
   Temperature  Humidity  Wind Speed  Precipitation (%)    Cloud Cover  \
0         14.0        73         9.5               82.0  partly cloudy   
1         39.0        96         8.5               71.0  partly cloudy   
2         30.0        64         7.0               16.0          clear   
3         38.0        83         1.5               82.0          clear   
4         27.0        74        17.0               66.0       overcast   

   Atmospheric Pressure  UV Index  Season  Visibility (km)  Location  \
0               1010.82         2  Winter              3.5    inland   
1               1011.43         7  Spring             10.0    inland   
2               1018.72         5  Spring              5.5  mountain   
3               1026.25         7  Spring              1.0   coastal   
4                990.67         1  Winter              2.5  mountain   

  Weather Type  
0        Rainy  
1       Cloudy  
2        Sunny  
3      