In [1]:
import pandas as pd
import numpy as np

# Create a dummy dataset
np.random.seed(0)
dummy_data = {
    'Feature1': np.random.normal(100, 10, 100).tolist() + [np.nan, 200],  # Normally distributed with an outlier
    'Feature2': np.random.randint(0, 100, 102).tolist(),  # Random integers
    'Category': ['A', 'B', 'C', 'D'] * 25 + [np.nan, 'A'],  # Categorical with some missing values
    'Target': np.random.choice([0, 1], 102).tolist()  # Binary target variable
}

# Convert the dictionary to a pandas DataFrame
df_dummy = pd.DataFrame(dummy_data)

# Display the first few rows of the dummy dataset
print(df_dummy.head())

     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [2]:
# Fill missing values with the mean for numeric columns
df_filled = df_dummy.fillna(df_dummy.mean())

# Fill missing categorical data with the mode (most frequent value)
df_filled['Category'].fillna(df_filled['Category'].mode()[0], inplace=True)

print(df_filled.isnull().sum())  # Verify that there are no missing values

  df_filled = df_dummy.fillna(df_dummy.mean())


In [3]:
from scipy import stats

# Calculate Z-scores for numerical features
z_scores = np.abs(stats.zscore(df_filled.select_dtypes(include=[np.number])))

# Remove rows with any Z-scores greater than 3 (commonly used threshold for outliers)
df_no_outliers = df_filled[(z_scores < 3).all(axis=1)]

print(df_no_outliers.describe())  # Verify that outliers have been removed

         Feature1    Feature2      Target
count  101.000000  101.000000  101.000000
mean   100.607824   46.029703    0.534653
std     10.079298   27.147175    0.501285
min     74.470102    0.000000    0.000000
25%     93.656779   28.000000    0.000000
50%    101.216750   41.000000    1.000000
75%    107.290906   69.000000    1.000000
max    122.697546   97.000000    1.000000


In [4]:
from sklearn.preprocessing import StandardScaler

# Scale numeric features using StandardScaler (Z-score normalization)
scaler = StandardScaler()
df_no_outliers[df_no_outliers.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(df_no_outliers.select_dtypes(include=[np.number]))

print(df_no_outliers.head())  # Verify that the data has been scaled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_outliers[df_no_outliers.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(df_no_outliers.select_dtypes(include=[np.number]))


In [5]:
# One-hot encode the categorical feature
df_encoded = pd.get_dummies(df_no_outliers, columns=['Category'])

print(df_encoded.head())  # Verify that the categorical variable has been encoded

   Feature1  Feature2    Target  Category_A  Category_B  Category_C  \
0  1.698298 -0.519379  0.932936           1           0           0   
1  0.338384  0.887380  0.932936           0           1           0   
2  0.915276  1.442679 -1.071884           0           0           1   
3  2.173747 -0.556399  0.932936           0           0           0   
4  1.801501 -1.222759 -1.071884           1           0           0   

   Category_D  
0           0  
1           0  
2           0  
3           1  
4           0  


In [6]:
# Save the preprocessed DataFrame to a CSV file
df_encoded.to_csv('preprocessed_dummy_data.csv', index=False)

print('Preprocessed data saved as preprocessed_dummy_data.csv')

Preprocessed data saved as preprocessed_dummy_data.csv
