<a href="https://colab.research.google.com/github/nithinrk11/FlowCast/blob/main/Test_dataset_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Test dataset generator

## The code cell below generates three datasets:
* `noisy_crowd_data2.csv`
* `test_data.csv`
* `data_no_lag.csv`
### The `noisy_crowd_data2.csv` file contains the full dataset with all major details, such as crowd type, lag features, and noisy crowd count.

### The `test_data.csv` file is a modified version of `noisy_crowd_data2.csv` from which the crowd type columns (`Crowd_Type` and `Crowd_Type_Label`) have been dropped.

### The `data_no_lag.csv` file is another modified version that includes neither the lag features nor the crowd type columns.

In [None]:
import pandas as pd
import numpy as np
import random

# Seed the stdlib RNG so the generated crowd counts are reproducible
random.seed(42)

# Hourly timestamps from 2023-01-01 09:00 through 2023-12-31 23:00 (both ends inclusive)
start_date = pd.to_datetime('2023-01-01 09:00:00')
end_date = pd.to_datetime('2023-12-31 23:00:00')
# Use an explicit Hour offset rather than the 'H' string alias: the uppercase
# alias is deprecated in recent pandas releases (it emits a FutureWarning),
# while the offset object is accepted by old and new versions alike.
date_range = pd.date_range(start=start_date, end=end_date, freq=pd.offsets.Hour())

# One row per timestamp; this frame is the base the synthetic data is stored in
data = pd.DataFrame({'Timestamp': date_range})

# Generate synthetic crowd count and categorize crowd type
def generate_crowd_data(hour):
    """Draw a random crowd count for the given hour and label its size.

    The hour selects the inclusive range passed to ``random.randint``:
    9-11 -> 10..50, 12-14 -> 50..100, 15-17 -> 100..150, and every
    other hour -> 20..80.

    Args:
        hour: Hour of the day the count is generated for.

    Returns:
        A ``(crowd_count, crowd_type)`` tuple, where ``crowd_type`` is
        "Low Crowd" (count <= 30), "Moderate Crowd" (count <= 70) or
        "High Crowd" (anything larger).
    """
    # (first hour, end hour exclusive, lowest count, highest count)
    hour_bands = (
        (9, 12, 10, 50),     # morning - low crowd
        (12, 15, 50, 100),   # afternoon - moderate crowd
        (15, 18, 100, 150),  # evening - high crowd
    )

    # Fallback range for every hour outside the bands above
    lowest, highest = 20, 80
    for first_hour, end_hour, band_low, band_high in hour_bands:
        if first_hour <= hour < end_hour:
            lowest, highest = band_low, band_high
            break

    crowd_count = random.randint(lowest, highest)

    # Label the count; the thresholds are independent of the hour
    if crowd_count <= 30:
        return crowd_count, "Low Crowd"
    if crowd_count <= 70:
        return crowd_count, "Moderate Crowd"
    return crowd_count, "High Crowd"

# Draw one (count, category) pair per row from its hour of day, then split
# the pairs into the two target columns
data['Crowd_Count'], data['Crowd_Type'] = zip(*map(generate_crowd_data, data['Timestamp'].dt.hour))

# Calendar features derived from the timestamp
data['Day'] = data['Timestamp'].dt.day  # day of the month
data['Date'] = data['Timestamp'].dt.date  # calendar date without the time part
data['Day_Name'] = data['Timestamp'].dt.strftime('%A')  # weekday name
data['Month_Name'] = data['Timestamp'].dt.strftime('%B')  # month name
#--------------------------------------------------------------------------------#

from sklearn.preprocessing import LabelEncoder


# Turn the textual crowd category into an integer label column
label_encoder = LabelEncoder()
data['Crowd_Type_Label'] = label_encoder.fit_transform(data['Crowd_Type'])

# Add lagged copies of Crowd_Count and Timestamp: one pair of columns per lag step
num_lags = 3  # how many previous rows (hours) to look back; adjust as needed
for lag in range(1, num_lags + 1):
    data[f'Prev_Crowd_Count_{lag}'] = data['Crowd_Count'].shift(periods=lag)
    data[f'Prev_Timestamp_{lag}'] = data['Timestamp'].shift(periods=lag)

# Shifting leaves the leading rows without values in the lag columns;
# discard every row that contains a missing value
data.dropna(axis=0, how='any', inplace=True)

# At this point the frame is a supervised dataset:
#   - input features: the lagged Crowd_Count and Timestamp columns
#   - target label:   Crowd_Type_Label
#--------------------------------------------------------------------------------#


# Perturb 'Crowd_Count' with Gaussian noise (mean 0, standard deviation 2)
np.random.seed(42)  # fixed seed keeps the noise reproducible
noise = np.random.normal(0, 2, len(data))  # one draw per row; change the scale (2) as needed
data['Noisy_Crowd_Count'] = data['Crowd_Count'] + noise

# Write the full dataset, including the noisy count, to its own CSV file
data.to_csv('noisy_crowd_data2.csv', index=False)
#--------------------------------------------------------------------------------#


# Read the noisy crowd dataset back from its CSV file
data = pd.read_csv('noisy_crowd_data2.csv')

# Remove both crowd-type columns: the text category and its encoded label
data_test = data.drop(['Crowd_Type', 'Crowd_Type_Label'], axis=1)

# Write the test dataset to its own CSV file
data_test.to_csv('test_data.csv', index=False)
#--------------------------------------------------------------------------------#

# Build the no-lag variant: remove every lag feature (columns containing
# 'Prev_') together with both crowd-type columns. Dropping from `data` alone
# would leave 'Crowd_Type' and 'Crowd_Type_Label' in the file, although
# data_no_lag.csv is described as containing neither lag features nor crowd type.
lag_columns = [col for col in data.columns if 'Prev_' in col]
data_no_lag = data.drop(columns=lag_columns + ['Crowd_Type', 'Crowd_Type_Label'])

# Save the dataset without lag features and crowd type to a new CSV file
data_no_lag.to_csv('data_no_lag.csv', index=False)