Step 1: Generate and load the dummy data 

In [1]:
import pandas as pd
import numpy as np

# Build a small synthetic dataset: 100 regular rows plus 2 extra rows that
# carry a missing value and an extreme value for the cleaning steps to handle.
np.random.seed(0)  # fixed seed so the printed sample below is reproducible

# NOTE: the random draws must stay in this order (normal -> randint -> choice)
# so that the global random stream yields the same values.
feature1_values = np.random.normal(100, 10, 100).tolist()
feature1_values += [np.nan, 200]  # one missing entry and one outlier
feature2_values = np.random.randint(0, 100, 102).tolist()  # random integers in [0, 100)
category_values = ['A', 'B', 'C', 'D'] * 25
category_values += [np.nan, 'A']  # one missing label
target_values = np.random.choice([0, 1], 102).tolist()  # binary target variable

dummy_data = {
    'Feature1': feature1_values,
    'Feature2': feature2_values,
    'Category': category_values,
    'Target': target_values,
}

# Wrap the columns in a DataFrame and preview the first rows
df_dummy = pd.DataFrame(dummy_data)
print(df_dummy.head())

     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


Step 2: Define the preprocessing functions

In [12]:
from scipy import stats
from sklearn.discriminant_analysis import StandardScaler

def load_data(df):
    """Pass-through entry point of the pipeline.

    Returns the very same object it receives; no copy is made and nothing
    is read from disk.
    """
    loaded = df
    return loaded

def handle_missing_values(df):
    """Return a copy of ``df`` with missing values imputed.

    Numeric columns are filled with their column mean; every other column
    is filled with its most frequent value (the first mode).

    Columns that contain no observed value at all are left untouched,
    because neither a mean nor a mode can be derived for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape and column order as ``df``.
    """
    df = df.copy()
    # Fill numeric columns with mean (an all-NaN column has a NaN mean, so
    # fillna is a no-op for it)
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].mean())
    # Fill non-numeric columns with mode
    for col in df.select_dtypes(exclude=[np.number]).columns:
        if df[col].isnull().any():
            modes = df[col].mode()
            # mode() ignores missing values, so it is empty when the whole
            # column is missing; indexing it would raise.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])
    return df
def remove_outliers(df, threshold=3):
    """Drop rows in which any numeric column is a z-score outlier.

    The z-score of each numeric column is computed with the population
    standard deviation (``ddof=0``). A row is removed when the absolute
    z-score of at least one of its numeric values is ``>= threshold``.

    Undefined z-scores never mark a row as an outlier. They arise for
    missing values and for columns with zero variance (0 / 0). Without this
    rule a single constant or NaN-containing column would discard every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    threshold : float, default 3
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the retained rows with their original index.
    """
    numeric = df.select_dtypes(include=[np.number])
    # mean()/std() skip NaN, so one missing value does not poison the column
    z_scores = (numeric - numeric.mean()) / numeric.std(ddof=0)
    # Comparisons against NaN are False, hence undefined scores keep the row
    is_outlier = z_scores.abs().ge(threshold).any(axis=1)
    return df[~is_outlier].copy()

def scale_data(df, exclude=None):
    """Return a copy of ``df`` with its numeric columns standardized.

    Each selected numeric column is transformed with scikit-learn's
    ``StandardScaler``. Non-numeric columns are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; scaling happens on a copy.
    exclude : str or iterable of str, optional
        Numeric column name(s) to leave unscaled, e.g. a label column.
        By default every numeric column is scaled.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    scaled = df.copy()

    if exclude is None:
        excluded = set()
    elif isinstance(exclude, str):
        # a bare string would otherwise be split into single characters
        excluded = {exclude}
    else:
        excluded = set(exclude)

    numeric_cols = [
        col
        for col in scaled.select_dtypes(include=[np.number]).columns
        if col not in excluded
    ]
    # StandardScaler rejects input with zero features or zero samples, so
    # there is nothing to scale in those cases.
    if not numeric_cols or scaled.empty:
        return scaled

    # Imported from the documented public module so this function does not
    # depend on another scikit-learn module happening to expose the name.
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled[numeric_cols] = scaler.fit_transform(scaled[numeric_cols])
    return scaled

def encode_categorical(df, categorical_columns):
    """One-hot encode the given columns with ``pd.get_dummies``.

    Each column listed in ``categorical_columns`` is replaced by indicator
    columns; the remaining columns are carried over. ``df`` itself is not
    modified.
    """
    encoded = pd.get_dummies(data=df, columns=categorical_columns)
    return encoded

def save_data(df, output_filepath):
    """Write ``df`` to ``output_filepath`` as CSV, leaving out the index."""
    df.to_csv(path_or_buf=output_filepath, index=False)

Step 3: Preprocess the dummy data

In [13]:
# Run the whole cleaning pipeline on the dummy frame. Stage order:
#   load -> impute missing values -> drop outlier rows -> standardize -> one-hot encode
# NOTE: scale_data is called with its defaults here, so every numeric column
# is standardized, including the binary Target (visible in the printed output).
df_preprocessed = (
    load_data(df_dummy)
    .pipe(handle_missing_values)
    .pipe(remove_outliers)
    .pipe(scale_data)
    .pipe(encode_categorical, ['Category'])
)

# Preview the preprocessed data
print(df_preprocessed.head())

   Feature1  Feature2    Target  Category_A  Category_B  Category_C  \
0  1.698298 -0.519379  0.932936        True       False       False   
1  0.338384  0.887380  0.932936       False        True       False   
2  0.915276  1.442679 -1.071884       False       False        True   
3  2.173747 -0.556399  0.932936       False       False       False   
4  1.801501 -1.222759 -1.071884        True       False       False   

   Category_D  
0       False  
1       False  
2       False  
3        True  
4       False  


Step 4: Save the preprocessed data 

In [14]:
# Persist the final frame as CSV, then confirm on stdout
output_csv = 'preprocessed_dummy_data.csv'
save_data(df_preprocessed, output_csv)

print('Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv')

Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv


Verifying the preprocessing steps 

In [15]:
# Sanity checks on the final frame, printed in this order:
#   1. number of missing values per column
#   2. summary statistics (min/max show the value range left after outlier removal)
#   3. first rows
#   4. column labels
verification_reports = (
    df_preprocessed.isnull().sum(),
    df_preprocessed.describe(),
    df_preprocessed.head(),
    df_preprocessed.columns,
)
for report in verification_reports:
    print(report)

Feature1      0
Feature2      0
Target        0
Category_A    0
Category_B    0
Category_C    0
Category_D    0
dtype: int64
           Feature1      Feature2        Target
count  1.010000e+02  1.010000e+02  1.010000e+02
mean  -2.544032e-15 -3.407615e-17 -2.418308e-17
std    1.004988e+00  1.004988e+00  1.004988e+00
min   -2.606142e+00 -1.704018e+00 -1.071884e+00
25%   -6.930755e-01 -6.674590e-01 -1.071884e+00
50%    6.071482e-02 -1.861994e-01  9.329364e-01
75%    6.663572e-01  8.503597e-01  9.329364e-01
max    2.202524e+00  1.886919e+00  9.329364e-01
   Feature1  Feature2    Target  Category_A  Category_B  Category_C  \
0  1.698298 -0.519379  0.932936        True       False       False   
1  0.338384  0.887380  0.932936       False        True       False   
2  0.915276  1.442679 -1.071884       False       False        True   
3  2.173747 -0.556399  0.932936       False       False       False   
4  1.801501 -1.222759 -1.071884        True       False       False   

   Category_D  
