# In [4]:
from pathlib import Path

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
def load_data(filename, data_dir='../data/raw'):
    """Load a raw CSV file into a DataFrame.

    Args:
        filename: Name of the CSV file inside ``data_dir``. A leading path
            separator is tolerated, so both ``'train.csv'`` and the
            historical ``'/train.csv'`` call style resolve to the same file.
        data_dir: Directory holding the raw files. Defaults to the
            ``../data/raw`` location this function has always read from.

    Returns:
        The parsed file as a ``pandas.DataFrame``.
    """
    # Plain string concatenation only produced a valid path when the caller
    # remembered the leading slash; stripping it and joining works either way
    # (and keeps an "absolute" filename from discarding data_dir).
    path = Path(data_dir) / filename.lstrip('/\\')
    return pd.read_csv(path)

def clean_data(data):
    """Drop incomplete rows and the ``Unnamed: 0`` column.

    Args:
        data: Raw ``pandas.DataFrame``.

    Returns:
        A new DataFrame holding only the rows that had no missing values,
        without the ``Unnamed: 0`` column. Surviving rows keep their original
        index labels (the index is not reset). The input frame is not
        modified.
    """
    # Remove any rows with missing values
    data = data.dropna()

    # 'Unnamed: 0' is the label pandas gives an unnamed first CSV column
    # (presumably a row index written by an earlier export). Ignore its
    # absence so frames loaded without that column are accepted as well.
    data = data.drop(columns=['Unnamed: 0'], errors='ignore')
    return data

def encode_categorical(data):
    """One-hot encode the ``CellName`` column and append the result.

    Args:
        data: ``pandas.DataFrame`` containing a ``CellName`` column.

    Returns:
        ``data`` joined with the one-hot indicator columns produced by
        ``OneHotEncoder`` for ``CellName``. The added columns carry the
        default integer labels ``0, 1, 2, ...``; the ``CellName`` column
        itself is kept.
    """
    # Encode categorical variables as binary features
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(data[['CellName']]).toarray()

    # DataFrame.join aligns on index labels. Building the encoded frame with
    # a default 0..n-1 index attaches indicators to the wrong rows (and leaves
    # NaN) whenever ``data`` has a different index, e.g. after dropna().
    # Reusing the caller's index keeps every indicator row with its source row.
    enc_df = pd.DataFrame(encoded, index=data.index)

    return data.join(enc_df)

def scale_numerical(data):
    """Split ``Time`` into hour/minute and standardize the numeric columns.

    Args:
        data: ``pandas.DataFrame`` with a string ``Time`` column formatted as
            ``'<hour>:<minute>'``, a ``CellName`` column and a
            ``maxUE_UL+DL`` column.

    Returns:
        A new DataFrame in which ``Time`` is replaced by ``Hour`` and
        ``Minute``, and every column except ``CellName`` has been passed
        through ``StandardScaler``. The input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    data = data.copy()

    # Split the Time column into hour and minute parts. ``n`` must be passed
    # by keyword: pandas 2.0 no longer accepts it positionally.
    data[['Hour', 'Minute']] = data['Time'].str.split(':', n=1, expand=True)
    data = data.drop(columns=['Time'])

    # Values that cannot be parsed as numbers become NaN (errors='coerce').
    for column in ('Hour', 'Minute', 'maxUE_UL+DL'):
        data[column] = pd.to_numeric(data[column], errors='coerce')

    # Scale numerical features to have zero mean and unit variance.
    # NOTE(review): every column other than CellName is scaled, which would
    # include a label/target column if one is present - confirm intended.
    numeric_features = data.drop(columns=['CellName']).columns.to_list()
    scaler = StandardScaler()
    data[numeric_features] = scaler.fit_transform(data[numeric_features])
    return data
# NOTE: save_data returns the preprocessed frame instead of saving it to disk.

def save_data(data):
    """Run the preprocessing steps on ``data`` and return the numeric result.

    Despite the name, nothing is written to disk; the processed frame is
    handed back to the caller.

    Args:
        data: Raw ``pandas.DataFrame`` to preprocess.

    Returns:
        The frame after cleaning, scaling and categorical encoding, reduced
        to its numeric-dtype columns.
    """
    # Order matters: clean first, then scale, then encode.
    steps = (clean_data, scale_numerical, encode_categorical)

    processed = data
    for step in steps:
        processed = step(processed)

    # Keep only the numerical columns for the model
    return processed.select_dtypes(['number'])

