In [9]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
def load_data(filename, data_dir='../data/raw'):
    """Load a raw CSV file into a DataFrame.

    Args:
        filename: Name of the CSV file inside ``data_dir``. A leading path
            separator (e.g. ``'/train.csv'``) is accepted and ignored, so
            callers written against the old string-concatenation behaviour
            keep working.
        data_dir: Directory holding the raw files. Defaults to
            ``'../data/raw'``, resolved relative to the current working
            directory.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    # Imported locally so this function does not need a new file-level import.
    from pathlib import Path

    # The path used to be built as '../data/raw' + filename, which pointed at
    # '../data/rawtrain.csv' unless the caller remembered the leading '/'.
    path = Path(data_dir) / str(filename).lstrip('/\\')
    return pd.read_csv(path)

def clean_data(data):
    """Drop incomplete rows and the leftover ``'Unnamed: 0'`` column.

    Args:
        data: Raw DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame without any row that contains a missing value and
        without the ``'Unnamed: 0'`` column. The surviving rows keep their
        original index labels.
    """
    # Remove any rows with missing values
    data = data.dropna()

    # 'Unnamed: 0' is not always present (e.g. a CSV written with
    # index=False has no such column), so a missing column is not an error.
    data = data.drop(columns=['Unnamed: 0'], errors='ignore')
    return data

def encode_categorical(data):
    """Append one-hot indicator columns for ``CellName``.

    Args:
        data: DataFrame containing a ``'CellName'`` column. It is not
            modified.

    Returns:
        A new DataFrame with the original columns plus one indicator column
        per distinct ``CellName`` value. The indicator columns are labelled
        with the integers ``0..k-1``.
    """
    # Encode categorical variables as binary features
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(data[['CellName']]).toarray()

    # Give the encoded frame the same index as ``data``. ``join`` aligns on
    # index labels, so a default 0..n-1 index would pair rows with the wrong
    # encodings (and leave NaNs) whenever ``data`` has gaps in its index,
    # e.g. after rows were removed with dropna().
    enc_df = pd.DataFrame(encoded, index=data.index)

    return data.join(enc_df)

def scale_numerical(data):
    """Split ``Time`` into hour/minute and standardise the numeric columns.

    Args:
        data: DataFrame with a string ``'Time'`` column and a
            ``'maxUE_UL+DL'`` column. ``'CellName'`` and ``'Unusual'`` are
            left unscaled when present. The input is not modified.

    Returns:
        A new DataFrame in which ``'Time'`` is replaced by numeric ``'Hour'``
        and ``'Minute'`` columns and every column other than ``'CellName'``
        and ``'Unusual'`` has been passed through ``StandardScaler``.
        Values that cannot be parsed as numbers become NaN.
    """
    # Work on a copy so the caller's DataFrame is not altered as a side effect.
    data = data.copy()

    # Split the Time column at its first ':' into hour and minute parts.
    # ``n`` must be passed by keyword: pandas 2.x rejects it positionally.
    data[['Hour', 'Minute']] = data['Time'].str.split(':', n=1, expand=True)
    data = data.drop(columns=['Time'])
    for column in ('Hour', 'Minute', 'maxUE_UL+DL'):
        data[column] = pd.to_numeric(data[column], errors='coerce')

    # Scale numerical features to have zero mean and unit variance.
    # errors='ignore' lets frames without the 'Unusual' label column through.
    numeric_features = data.drop(
        columns=['CellName', 'Unusual'], errors='ignore'
    ).columns.to_list()
    scaler = StandardScaler()
    data[numeric_features] = scaler.fit_transform(data[numeric_features])
    return data
#return instead of save 

def save_data(data, filename):
    """Write the preprocessed data to a CSV file and hand it back.

    Args:
        data: DataFrame to persist.
        filename: Destination passed straight to ``DataFrame.to_csv``.

    Returns:
        The same ``data`` object that was passed in.
    """
    # The index is left out of the file.
    csv_options = {'index': False}
    data.to_csv(filename, **csv_options)
    return data
if __name__ == '__main__':
    # Run the preprocessing steps in order on the training set: load the raw
    # CSV, clean it, scale the numerical features, encode the categorical
    # variables, and finally keep only the numeric columns for the model.
    data = (
        load_data('/train.csv')
        .pipe(clean_data)
        .pipe(scale_numerical)
        .pipe(encode_categorical)
        .select_dtypes(['number'])
    )




   PRBUsageUL  PRBUsageDL  meanThr_DL  meanThr_UL  maxThr_DL  maxThr_UL  \
0   -0.508514   -0.600057   -0.578180   -0.312958  -0.859869  -0.336991   
1    0.890905   -0.126799   -0.143387   -0.164438   4.092402  -0.247529   
2   -0.532630   -0.552778   -0.156051   -0.204424   0.851624  -0.189947   
3   -0.605096    1.434343    0.319549    0.098328   0.125968  -0.078568   
4   -0.629212    1.623459    1.937714    0.275410   1.261197  -0.109451   

   meanUE_DL  meanUE_UL  maxUE_DL  maxUE_UL  ...   23   24   25   26   27  \
0  -0.602636   0.663458 -0.669510 -0.765168  ...  0.0  0.0  0.0  0.0  0.0   
1  -0.016615   0.833517  0.457255  0.676191  ...  0.0  0.0  0.0  0.0  0.0   
2  -0.505773  -1.225877 -0.669510 -0.765168  ...  0.0  0.0  0.0  0.0  0.0   
3   0.961702  -1.225877  1.020637  0.676191  ...  0.0  0.0  0.0  0.0  1.0   
4   1.305565  -1.225877  1.020637  0.676191  ...  0.0  0.0  0.0  0.0  0.0   

    28   29   30   31   32  
0  0.0  0.0  0.0  1.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  