In [1]:
import pandas as pd
import numpy as np
import data_clean_functions as dc
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Pin NumPy's global RNG so the synthetic dataset is identical on every run.
np.random.seed(0)

# Synthetic dataset with 102 entries per column. The random draws are made in
# dictionary order (Feature1, Feature2, Target), so the generated values
# depend on that order being kept.
dummy_data = {
    # 100 draws from a normal distribution (mean 100, sd 10), followed by one
    # missing value and one extreme value (200) at the very end
    'Feature1': [*np.random.normal(loc=100, scale=10, size=100).tolist(), np.nan, 200],
    # 102 random integers in the half-open range [0, 100)
    'Feature2': np.random.randint(low=0, high=100, size=102).tolist(),
    # Four labels cycled 25 times, followed by one missing value and a final 'A'
    'Category': [*(['A', 'B', 'C', 'D'] * 25), np.nan, 'A'],
    # 102 binary target labels drawn from {0, 1}
    'Target': np.random.choice([0, 1], size=102).tolist(),
}

In [2]:
# Build the working DataFrame: every key of the dictionary becomes one column
df_dummy = pd.DataFrame(data=dummy_data)

In [3]:
# Sanity check of the raw dataset: the first five rows, followed by the
# summary statistics of the numeric columns
print(df_dummy.head(), df_dummy.describe(), sep="\n")

     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0
         Feature1    Feature2      Target
count  101.000000  102.000000  102.000000
mean   101.582258   45.754902    0.529412
std     14.121324   27.154649    0.501599
min     74.470102    0.000000    0.000000
25%     93.656779   27.250000    0.000000
50%    101.216750   40.500000    1.000000
75%    107.610377   69.000000    1.000000
max    200.000000   97.000000    1.000000


In [4]:
# Persist the raw dummy DataFrame to CSV so that the next cell can read it
# back through dc.load_data.
# NOTE: despite the file name, nothing has been cleaned or preprocessed at
# this point - df_dummy is written exactly as it was generated.
dc.save_data(df_dummy, 'preprocessed_dummy_data.csv')

In [5]:
# From here on the data is processed with the helpers of the
# data_clean_functions module (imported as dc).
# First step: read back the CSV file that was written above, then show the
# summary statistics and the first five rows of what was loaded
df_preprocessed = dc.load_data("preprocessed_dummy_data.csv")
print(df_preprocessed.describe(), df_preprocessed.head(), sep="\n")

         Feature1    Feature2      Target
count  101.000000  102.000000  102.000000
mean   101.582258   45.754902    0.529412
std     14.121324   27.154649    0.501599
min     74.470102    0.000000    0.000000
25%     93.656779   27.250000    0.000000
50%    101.216750   40.500000    1.000000
75%    107.610377   69.000000    1.000000
max    200.000000   97.000000    1.000000
     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [6]:
# Missing values: only a single row is affected, so that row is dropped
# instead of being imputed (any NaN in a row removes the whole row)
df_preprocessed = df_preprocessed.dropna(axis="index", how="any")
print(df_preprocessed.describe(), df_preprocessed.head(), sep="\n")

         Feature1    Feature2      Target
count  101.000000  101.000000  101.000000
mean   101.582258   45.881188    0.524752
std     14.121324   27.259966    0.501878
min     74.470102    0.000000    0.000000
25%     93.656779   27.000000    0.000000
50%    101.216750   41.000000    1.000000
75%    107.610377   69.000000    1.000000
max    200.000000   97.000000    1.000000
     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [7]:
# Outliers: the extreme value looks like a data error, so it is removed
# rather than kept.
# NOTE(review): the second argument (3) is presumably the outlier threshold
# used by dc.remove_outliers - confirm its exact meaning in the helper module.
df_preprocessed = dc.remove_outliers(df_preprocessed, 3)

# Summary statistics and first rows after the outlier removal
print(df_preprocessed.describe(), df_preprocessed.head(), sep="\n")

can't process column "Category", skipping
         Feature1    Feature2      Target
count  100.000000  100.000000  100.000000
mean   100.598080   46.160000    0.530000
std     10.129598   27.252178    0.501614
min     74.470102    0.000000    0.000000
25%     93.561433   27.750000    0.000000
50%    100.940961   41.000000    1.000000
75%    107.370774   69.250000    1.000000
max    122.697546   97.000000    1.000000
     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [8]:
# Capping: limit the "Feature1" column with the dedicated helper, using the
# 0.97 quantile as the cap
df_preprocessed = dc.cap_column_quantile(df_preprocessed, "Feature1", 0.97)
print(df_preprocessed.describe(), df_preprocessed.head(), sep="\n")

         Feature1    Feature2      Target
count  100.000000  100.000000  100.000000
mean   100.521199   46.160000    0.530000
std      9.974297   27.252178    0.501614
min     74.470102    0.000000    0.000000
25%     93.561433   27.750000    0.000000
50%    100.940961   41.000000    1.000000
75%    107.370774   69.250000    1.000000
max    118.975358   97.000000    1.000000
     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  118.975358        31        D       1
4  118.675580        13        A       0


In [9]:
# Scale the data with the helper from data_clean_functions. Judging by the
# printed frame, the original columns are kept and a "scaled-<name>" column
# is added for Feature1, Feature2 and Target.
# NOTE(review): the binary Target gets scaled as well - confirm that this is
# intended before the scaled-Target column is used for modelling.
df_preprocessed = dc.scale_data(df_preprocessed)

In [10]:
# Show the frame after scaling (original columns plus the scaled-* columns);
# pandas abbreviates the printed rows, so only the head and tail are visible
print(df_preprocessed)

      Feature1  Feature2 Category  Target  scaled-Feature1  scaled-Feature2  \
0   117.640523        32        A       1         1.724991        -0.522209   
1   104.001572        70        B       1         0.350692         0.879200   
2   109.787380        85        C       0         0.933686         1.432387   
3   118.975358        31        D       1         1.859492        -0.559088   
4   118.675580        13        A       0         1.829286        -1.222913   
..         ...       ...      ...     ...              ...              ...   
95  107.065732        25        D       0         0.659445        -0.780363   
96  100.105000        67        A       0        -0.041937         0.768562   
97  117.858705        35        B       1         1.746975        -0.411572   
98  101.269121        30        C       1         0.075363        -0.595967   
99  104.019894        29        D       1         0.352538        -0.632847   

    scaled-Target  
0        0.941697  
1        0.

In [11]:
# Encode the categorical "Category" column into indicator columns so that
# machine learning algorithms only receive numeric/boolean inputs; the result
# is stored under a new name and then inspected (first rows, then statistics)
df_processed = dc.encode_categorical(df_preprocessed, categorical_columns=["Category"])
print(df_processed.head(), df_processed.describe(), sep="\n")

     Feature1  Feature2  Target  scaled-Feature1  scaled-Feature2  \
0  117.640523        32       1         1.724991        -0.522209   
1  104.001572        70       1         0.350692         0.879200   
2  109.787380        85       0         0.933686         1.432387   
3  118.975358        31       1         1.859492        -0.559088   
4  118.675580        13       0         1.829286        -1.222913   

   scaled-Target  Category_A  Category_B  Category_C  Category_D  
0       0.941697        True       False       False       False  
1       0.941697       False        True       False       False  
2      -1.061913       False       False        True       False  
3       0.941697       False       False       False        True  
4      -1.061913        True       False       False       False  
         Feature1    Feature2      Target  scaled-Feature1  scaled-Feature2  \
count  100.000000  100.000000  100.000000     1.000000e+02     1.000000e+02   
mean   100.521199   46.16

In [12]:
# Persist the fully processed frame (cleaned, capped, scaled and encoded)
# with the same save helper that was used for the raw data
dc.save_data(df_processed, "processed_data.csv")