In [1]:
# Synthetic Data Generation for Chennai Groundwater Project
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set random seed for reproducible results
np.random.seed(42)

# Create date range (2013-2023, monthly data)
dates = pd.date_range(start='2013-01-01', end='2023-12-31', freq='M')
n_periods = len(dates)

# Generate synthetic GROUNDWATER LEVEL (target variable)
# Creating a declining trend with seasonal patterns
base_level = 7.0  # Starting groundwater level (meters below surface)
decline_rate = 0.03  # Long-term decline per month (serious depletion)
seasonal_effect = 1.2 * np.sin(2 * np.pi * np.arange(n_periods) / 12)  # Seasonal variation
random_noise = np.random.normal(0, 0.3, n_periods)  # Random fluctuations

# Combine all components
groundwater_level = base_level + (decline_rate * np.arange(n_periods)) + seasonal_effect + random_noise

# Generate synthetic RAINFALL data (primary feature)
# Chennai has distinct wet and dry seasons
monsoon_months = [6, 7, 8, 9, 10, 11]  # June to November
rainfall = np.zeros(n_periods)

for i, date in enumerate(dates):
    if date.month in monsoon_months:
        # Higher rainfall during monsoon months
        rainfall[i] = np.random.normal(120, 30)
    else:
        # Lower rainfall during dry months
        rainfall[i] = np.random.normal(40, 15)

# Ensure no negative rainfall values
rainfall = np.clip(rainfall, 0, None)

# Generate synthetic CROP AREA data (secondary feature)
# Seasonal pattern with some long-term changes
base_crop_area = 18000  # Base hectare value
crop_seasonality = 2000 * np.sin(2 * np.pi * np.arange(n_periods) / 12 + np.pi/4)
crop_trend = 50 * np.sin(2 * np.pi * np.arange(n_periods) / 60)  # Multi-year cycle
crop_noise = np.random.normal(0, 300, n_periods)

crop_area = base_crop_area + crop_seasonality + crop_trend + crop_noise
crop_area = np.clip(crop_area, 15000, 21000)  # Reasonable bounds

# Create DataFrame
chennai_water_data = pd.DataFrame({
    'Date': dates,
    'Groundwater_Level_m': np.round(groundwater_level, 2),
    'Rainfall_mm': np.round(rainfall, 1),
    'Crop_Area_hectares': np.round(crop_area, 0)
})

# Add derived time-based features
chennai_water_data['Year'] = chennai_water_data['Date'].dt.year
chennai_water_data['Month'] = chennai_water_data['Date'].dt.month
chennai_water_data['Season'] = chennai_water_data['Month'].apply(
    lambda x: 'Monsoon' if x in [6,7,8,9,10,11] else 'Dry'
)

# Save to CSV
chennai_water_data.to_csv('chennai_groundwater_data.csv', index=False)

print("✅ Synthetic Chennai groundwater data created successfully!")
print(f"📁 File saved as 'chennai_groundwater_data.csv'")
print(f"📊 Dataset shape: {chennai_water_data.shape}")
print("\nFirst 5 rows:")
print(chennai_water_data.head())
print("\nDataset info:")
print(chennai_water_data.info())

Matplotlib is building the font cache; this may take a moment.


✅ Synthetic Chennai groundwater data created successfully!
📁 File saved as 'chennai_groundwater_data.csv'
📊 Dataset shape: (132, 7)

First 5 rows:
        Date  Groundwater_Level_m  Rainfall_mm  Crop_Area_hectares  Year  \
0 2013-01-31                 7.15         24.1             19338.0  2013   
1 2013-02-28                 7.59         47.1             19563.0  2013   
2 2013-03-31                 8.29         26.2             20432.0  2013   
3 2013-04-30                 8.75         63.2             19001.0  2013   
4 2013-05-31                 8.09         28.3             18406.0  2013   

   Month Season  
0      1    Dry  
1      2    Dry  
2      3    Dry  
3      4    Dry  
4      5    Dry  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 132 non-null    datetime64[ns

In [1]:
import os
print(os.getcwd())

/home/9f100862-c38b-491b-89c3-4b7d7870d765


In [2]:
print(os.listdir())

['.jupyter', '.bashrc', 'chennai_groundwater_data.csv', '.config', 'anaconda_projects', '.npm', '.ipython', 'README.ipynb', '.pythonstartup.py', '.local', '.profile', '.virtualenvs', '.anaconda', '.ipynb_checkpoints', '.gitconfig', '.cache', 'Groundwater_Chennai_Project.ipynb', '.vimrc']


In [3]:
import pandas as pd

df = pd.read_csv('chennai_groundwater_data.csv')
print(df.head())
print(df.info())


         Date  Groundwater_Level_m  Rainfall_mm  Crop_Area_hectares  Year  \
0  2013-01-31                 7.15         24.1             19338.0  2013   
1  2013-02-28                 7.59         47.1             19563.0  2013   
2  2013-03-31                 8.29         26.2             20432.0  2013   
3  2013-04-30                 8.75         63.2             19001.0  2013   
4  2013-05-31                 8.09         28.3             18406.0  2013   

   Month Season  
0      1    Dry  
1      2    Dry  
2      3    Dry  
3      4    Dry  
4      5    Dry  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 132 non-null    object 
 1   Groundwater_Level_m  132 non-null    float64
 2   Rainfall_mm          132 non-null    float64
 3   Crop_Area_hectares   132 non-null    float64
 4   Year                