In [28]:
import pandas as pd
import numpy as np

# Create a dummy dataset
np.random.seed(0)
dummy_data = {
    'Feature1': np.random.normal(100, 10, 100).tolist() + [np.nan, 200],  # Normally distributed with an outlier
    'Feature2': np.random.randint(0, 100, 102).tolist(),  # Random integers
    'Category': ['A', 'B', 'C', 'D'] * 25 + [np.nan, 'A'],  # Categorical with some missing values
    'Target': np.random.choice([0, 1], 102).tolist()  # Binary target variable
}

# Convert the dictionary to a pandas DataFrame
df_dummy = pd.DataFrame(dummy_data)

# Display the first few rows of the dummy dataset
print(df_dummy.head())

     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [29]:
def load_data(df):
    return df

In [30]:
def handle_missing_values(df):
    for column in df.columns:
        if df[column].dtype == 'object':  # Categorical column
            mode_value = df[column].mode()[0]  # Get the mode
            df[column].fillna(mode_value, inplace=True)  # Replace missing values with mode
        else:  # Numeric column
            median_value = df[column].median()  # Get the median
            df[column].fillna(median_value, inplace=True)  # Replace missing values with median
    return df  # Return the modified DataFrame

In [31]:
def remove_outliers(df):
    z_scores = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))
    return df[(z_scores < 3).all(axis=1)]  # Remove rows with any outliers

In [32]:
def scale_data(df):
    scaler = StandardScaler()
    df[df.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(df.select_dtypes(include=[np.number]))
    return df

In [33]:
def encode_categorical(df, categorical_columns):
    return pd.get_dummies(df, columns=categorical_columns)

In [13]:
def save_data(df, output_filepath):
    df.to_csv(output_filepath, index=False)

In [23]:
# Load the data
df_preprocessed = load_data(df_dummy)

# Handle missing values
df_preprocessed = handle_missing_values(df_preprocessed)

# Remove outliers
df_preprocessed = remove_outliers(df_preprocessed)

# Scale the data
df_preprocessed = scale_data(df_preprocessed)

# Encode categorical variables
df_preprocessed = encode_categorical(df_preprocessed, ['Category'])

# Display the preprocessed data
print(df_preprocessed.head())

NameError: name 'stats' is not defined

In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

In [35]:
# Load the data
df_preprocessed = load_data(df_dummy)

# Handle missing values
df_preprocessed = handle_missing_values(df_preprocessed)

# Remove outliers
df_preprocessed = remove_outliers(df_preprocessed)

# Scale the data
df_preprocessed = scale_data(df_preprocessed)

# Encode categorical variables
df_preprocessed = encode_categorical(df_preprocessed, ['Category'])

# Display the preprocessed data
print(df_preprocessed.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(df.select_dtypes(include=[np.number]))


In [36]:
# Save the cleaned and preprocessed DataFrame to a CSV file
save_data(df_preprocessed, 'preprocessed_dummy_data.csv')

print('Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv')

Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv


In [37]:
print(df_preprocessed.isnull().sum())

Feature1      0
Feature2      0
Target        0
Category_A    0
Category_B    0
Category_C    0
Category_D    0
dtype: int64


In [38]:
print(df_preprocessed.describe())

           Feature1      Feature2        Target  Category_A  Category_B  \
count  1.010000e+02  1.010000e+02  1.010000e+02  101.000000  101.000000   
mean  -2.502948e-15 -3.407615e-17 -2.418308e-17    0.257426    0.247525   
std    1.004988e+00  1.004988e+00  1.004988e+00    0.439397    0.433727   
min   -2.605856e+00 -1.704018e+00 -1.071884e+00    0.000000    0.000000   
25%   -6.927344e-01 -6.674590e-01 -1.071884e+00    0.000000    0.000000   
50%    6.107739e-02 -1.861994e-01  9.329364e-01    0.000000    0.000000   
75%    6.667371e-01  8.503597e-01  9.329364e-01    1.000000    0.000000   
max    2.202948e+00  1.886919e+00  9.329364e-01    1.000000    1.000000   

       Category_C  Category_D  
count  101.000000  101.000000  
mean     0.247525    0.247525  
std      0.433727    0.433727  
min      0.000000    0.000000  
25%      0.000000    0.000000  
50%      0.000000    0.000000  
75%      0.000000    0.000000  
max      1.000000    1.000000  


In [39]:
print(df_preprocessed.head())

   Feature1  Feature2    Target  Category_A  Category_B  Category_C  \
0  1.698707 -0.519379  0.932936           1           0           0   
1  0.338755  0.887380  0.932936           0           1           0   
2  0.915663  1.442679 -1.071884           0           0           1   
3  2.174170 -0.556399  0.932936           0           0           0   
4  1.801914 -1.222759 -1.071884           1           0           0   

   Category_D  
0           0  
1           0  
2           0  
3           1  
4           0  


In [40]:
print(df_preprocessed.columns)

Index(['Feature1', 'Feature2', 'Target', 'Category_A', 'Category_B',
       'Category_C', 'Category_D'],
      dtype='object')
