In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [2]:
# The import below needs the directory that holds the `src` package to be the
# working directory (presumably the project root, one level above this
# notebook -- confirm). The guard keeps the cell idempotent: an unconditional
# os.chdir('..') would climb one more directory on every re-run and break the
# import.
if not os.path.isdir('src'):
    os.chdir('..')
from src.loader import DataLoader

loader = DataLoader()

# Source table for the rest of the notebook.
table_name = 'xdr_data'
df = loader.load_data(table_name)


In [3]:
def analyze_data(df):
    """Print a quick structural overview of ``df``.

    Writes, in order: the shape, the ``DataFrame.info()`` summary, the first
    rows (``head()``), the ``describe()`` statistics and the number of
    duplicated rows. Nothing is returned and ``df`` is not modified.

    Args:
        df: pandas DataFrame to inspect.
    """
    print("Shape of the data:", df.shape)
    print("\nColumn info:")
    # info() writes its report itself and returns None, so wrapping it in
    # print() would append a stray "None" line to the output.
    df.info()
    print("\nColumn head:")
    print(df.head())
    print("\nColumn description:")
    print(df.describe())
    print("\nNumber of duplicate rows:", df.duplicated().sum())

# Print the shape, info(), head(), describe() and duplicate-row count of df.
analyze_data(df)


Shape of the data: (150001, 55)

Column info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 55 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 149010 non-null  float64
 1   Start                                     150000 non-null  object 
 2   Start ms                                  150000 non-null  float64
 3   End                                       150000 non-null  object 
 4   End ms                                    150000 non-null  float64
 5   Dur. (ms)                                 150000 non-null  float64
 6   IMSI                                      149431 non-null  float64
 7   MSISDN/Number                             148935 non-null  float64
 8   IMEI                                      149429 non-null  float64
 9   Last Location Name                        1488

In [10]:
def handle_missing_values(df):
    """Fill missing values in ``df`` in place and return it.

    Numeric columns are imputed with the column mean (``SimpleImputer``);
    every other column has its missing entries replaced by the string
    ``'Unknown'``. The per-column missing counts of both groups are printed
    before they are filled.

    Numeric columns without a single observed value are left untouched: they
    have no mean, and passing them to the imputer would make it return fewer
    columns than were selected, so the assignment back would fail.

    Args:
        df: pandas DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, after filling.
    """
    numeric = df.select_dtypes(include='number')
    print("\nNumeric columns with missing values:")
    print(numeric.isna().sum())

    # Only columns with at least one real value can be mean-imputed.
    imputable = numeric.loc[:, numeric.notna().any()]
    if imputable.shape[1] > 0:
        numeric_imputer = SimpleImputer(strategy='mean')
        df[imputable.columns] = numeric_imputer.fit_transform(imputable)

    non_numeric = df.select_dtypes(exclude='number')
    print("\nNon-numeric columns with missing values:")
    print(non_numeric.isna().sum())

    if non_numeric.shape[1] > 0:
        df[non_numeric.columns] = non_numeric.fillna('Unknown')
    return df
# Count the duplicated rows currently in df.
# NOTE(review): handle_missing_values is defined above but not called in this cell.
print(df.duplicated().sum())


0


In [5]:
def split_data(df, target_column, test_size=0.2, random_state=42):
    """Separate features from the target and split both into train/test parts.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_column: Label of the column to predict; it is removed from the
            feature frames.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop(columns=[target_column])
    target = df[target_column]
    features_train, features_test, target_train, target_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return (features_train, features_test, target_train, target_test)

In [9]:
def store_preprocessed_data(df, table_name, connection_string, if_exists='replace'):
    """Write ``df`` to a SQL table through a short-lived SQLAlchemy engine.

    Args:
        df: DataFrame to persist; its index is not written.
        table_name: Name of the destination table.
        connection_string: Database URL passed to ``create_engine``.
        if_exists: Forwarded to ``DataFrame.to_sql``. The default,
            ``'replace'``, overwrites an existing table of the same name.
    """
    engine = create_engine(connection_string)
    try:
        df.to_sql(table_name, con=engine, if_exists=if_exists, index=False)
    finally:
        # The engine exists only for this call; dispose of it so its
        # connection pool is closed even when the write fails.
        engine.dispose()
