In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the raw survey export, relative to this notebook.
path = "../../data/dataset_raw.csv"
# Load the whole file into a single DataFrame; later cells modify `df`.
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
def average_range(range_str: str) -> float:
    """
    Return the midpoint of a range string such as '10-20' or '10–20'.

    Accepted inputs:
      * a two-sided range whose bounds are separated by an en-dash (–),
        em-dash (—), minus sign (−) or plain hyphen (-);
      * a single number given as text (e.g. '15', '-5', '1e-3');
      * an already-numeric value (int, float or NumPy number), which is
        returned as a float so that applying the function a second time to
        an already converted column does not raise.

    Parameters
    ----------
    range_str : str
        The value to convert. Missing values (anything ``pd.isna`` reports
        as missing) are allowed.

    Returns
    -------
    float
        The average of the two bounds, the number itself for single values,
        or ``np.nan`` when the input is missing or cannot be parsed.
    """
    if pd.isna(range_str):
        return np.nan

    # Numbers pass straight through. bool is excluded on purpose: it is an
    # int subclass, but True/False in a range column is not a measurement.
    if isinstance(range_str, (int, float, np.number)) and not isinstance(range_str, bool):
        return float(range_str)

    text = str(range_str).strip()

    # Try a plain number first; otherwise signed or exponent values such as
    # '-5' or '1e-3' would be mistaken for a range and split on the hyphen.
    try:
        return float(text)
    except ValueError:
        pass

    # The Unicode dashes are checked before the ASCII hyphen so that a range
    # like '10–-20' keeps the hyphen as the sign of the upper bound.
    for delimiter in ('–', '—', '−', '-'):
        if delimiter in text:
            bounds = text.split(delimiter)
            break
    else:
        # No separator and not a number: nothing sensible to return.
        return np.nan

    # Anything other than exactly two bounds (e.g. '1-2-3') is malformed.
    if len(bounds) != 2:
        return np.nan

    try:
        lower, upper = (float(bound.strip()) for bound in bounds)
    except ValueError:
        # One of the bounds is not numeric.
        return np.nan
    return (lower + upper) / 2

In [4]:
# Preview the first five rows before any cleaning is applied.
df.head(n=5)

Unnamed: 0,Name,Age,Gender,Zone_of_Residence,Reason_of_low_attention_span,Screentime_mins,Attention_span_mins
0,Abrar,17,M,Motijheel,Lack of Sleep & Poor Health,120–240,3–5
1,Kanchon,16,M,Mohammadpur,Lack of Sleep & Poor Health,120–240,15–30
2,Shafi,17,M,Tejgaon,Lack of Sleep & Poor Health,120–240,30–60
3,Zunyaed,18,M,Farmgate,Stress & Anxiety,30–120,1–2
4,Jabeer,17,M,Shahbagh,Family Problems,120–240,30–60


In [5]:
# Replace each textual range column with the midpoint computed by average_range.
for range_col in ('Screentime_mins', 'Attention_span_mins'):
    df[range_col] = df[range_col].apply(average_range)

df.head()

Unnamed: 0,Name,Age,Gender,Zone_of_Residence,Reason_of_low_attention_span,Screentime_mins,Attention_span_mins
0,Abrar,17,M,Motijheel,Lack of Sleep & Poor Health,180.0,4.0
1,Kanchon,16,M,Mohammadpur,Lack of Sleep & Poor Health,180.0,22.5
2,Shafi,17,M,Tejgaon,Lack of Sleep & Poor Health,180.0,45.0
3,Zunyaed,18,M,Farmgate,Stress & Anxiety,75.0,1.5
4,Jabeer,17,M,Shahbagh,Family Problems,180.0,45.0


In [6]:
# Upper-case the gender codes so values differing only in case compare equal.
df['Gender'] = df.Gender.str.upper()
df.tail(n=5)

Unnamed: 0,Name,Age,Gender,Zone_of_Residence,Reason_of_low_attention_span,Screentime_mins,Attention_span_mins
892,Shirin,30,M,Farmgate,Multitasking,75.0,45.0
893,Nayeem,27,M,Gulistan,Family Problems,180.0,4.0
894,Priya,19,F,Mohammadpur,Multitasking,180.0,1.5
895,Arko,32,M,Shahbagh,Smartphones & Social Media,75.0,45.0
896,Sumaiya,24,F,Motijheel,Stress & Anxiety,180.0,22.5


In [7]:
# Write the cleaned frame without the positional index: the frame already
# carries an 'Unnamed: 0' column from an earlier index-including export, and
# writing the index again would add one more unnamed column on every
# load/save round trip.
df.to_csv("../../data/dataset.csv", index=False)