In [1]:
import pandas as pd

# Load the dataset
#Ensure your dataset data lesson 01.csv is in the same folder as your notebook
file_path = "data_lesson_01.csv"
data = pd.read_csv(file_path)

# Display basic information about the dataset
print(data.info())
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   numerical_var_1  9961 non-null   float64
 1   numerical_var_2  9960 non-null   float64
 2   numerical_var_3  9960 non-null   float64
 3   numerical_var_4  9960 non-null   float64
 4   categorical_var  9960 non-null   object 
dtypes: float64(4), object(1)
memory usage: 390.8+ KB
None


Unnamed: 0,numerical_var_1,numerical_var_2,numerical_var_3,numerical_var_4,categorical_var
0,-1000000000.0,-0.6784947,0.348286,-1.980572,High
1,-0.1382643,-1000000000.0,0.283324,-1.054986,High
2,0.6476885,-0.5973811,-0.93652,-0.587028,-999999999
3,1.52303,0.110418,0.579584,0.149669,High
4,-0.2341534,1.197179,-1.490083,1.024162,High


In [2]:
#What percentage of the data is missing?
# Calculate missing data percentage
#.isnull() function returns a DataFrame True or False to identify missing (or null) values in a dataset.
#True indicates a missing value (np.nan), and False indicates a non-missing value
#.sum(): Counts the number of True values in each column (i.e., the total number of missing values in that column).
#len(data): Returns the total number of rows in the dataset

missing_percentage = (data.isnull().sum() / len(data)) * 100

# Display the percentage of missing values for each column
print("Percentage of missing values in each column:")
print(missing_percentage)


Percentage of missing values in each column:
numerical_var_1    0.39
numerical_var_2    0.40
numerical_var_3    0.40
numerical_var_4    0.40
categorical_var    0.40
dtype: float64


In [3]:
# Are there any data points that need to be removed?
# Define a threshold for removal (e.g., rows with >30% missing values)
threshold = 30  # Change this based on analysis needs

#identifies rows where the number of missing values exceeds the threshold
#Check rows and columns with excessive missing values
#data.isnull().sum(axis=1): Counts the number of missing values for each row
#(len(data.columns) * (threshold / 100)) is the allowed number of missing values based on the threshold.
rows_to_remove = data[data.isnull().sum(axis=1) > (len(data.columns) * (threshold / 100))]
columns_to_remove = missing_percentage[missing_percentage > threshold]

print(f"Number of rows to remove: {len(rows_to_remove)}")
print("Columns to remove:", list(columns_to_remove.index))

#If no row in the dataset has more than 30% missing values, the output will be 0
#If all columns have less than 30% missing values, the output will be: []


Number of rows to remove: 0
Columns to remove: []


Removing rows with excessive missing values ensures that observations (data points) with incomplete information don’t bias or degrade analysis.
Removing columns with excessive missing values ensures that features (variables) with insufficient data don’t negatively impact model training or insights.

be stricter or more lenient, adjust the threshold:
Higher Threshold: Fewer rows/columns removed, tolerating more missing data.
Lower Threshold: More rows/columns removed, demanding more complete data.


In [7]:
# Is the target variable well-balanced?
#Analyze the distribution of the target variable
#target variable is in the last column of the dataset.
#Calculates the proportion of each class in the target variable as a percentage.
#normalize=True ensures the values are fractions


target_variable = data.columns[-1]
target_distribution = data[target_variable].value_counts(normalize=True) * 100

print("Target variable distribution:")
print(target_distribution)
# Sets the threshold for imbalance (e.g., 10%).
#If any class has less than 10% representation, it's flagged as imbalanced.
# Determine if it's balanced (e.g., no class <10% or >90%)
balance_threshold = 10  # Example: Consider imbalance if a class is <10%
imbalanced = target_distribution[target_distribution < balance_threshold]
# If no class meets the imbalance condition, the target variable is considered well-balanced.
if imbalanced.empty:
    print("The target variable is well-balanced.")
else:
    print(f"The target variable is imbalanced. Classes with low representation:\n{imbalanced}")


Target variable distribution:
categorical_var
High          90.321285
Low            9.668675
-999999999     0.010040
Name: proportion, dtype: float64
The target variable is imbalanced. Classes with low representation:
categorical_var
Low           9.668675
-999999999    0.010040
Name: proportion, dtype: float64


Threshold for Balance:

The choice of balance_threshold depends on your specific use case:
Strict: 10% is common. Classes with less than 10% representation are flagged as imbalanced.
Lenient: Thresholds like 5% or 20% can be used for stricter or looser definitions of balance.

Well-Balanced Target Variable:

All classes are adequately represented.
The model learns from all classes and avoids bias toward the majority class.
Example: Binary classification with a 50/50 split between classes.
Imbalanced Target Variable:

If one or more classes have very low representation, the model might:
Struggle to learn from underrepresented classes.
Predict the majority class most of the time (e.g., always predicting "High" in a 90% "High" dataset).

The target variable is imbalanced. Classes with low representation:
Low           9.67
-999999999    0.01
This indicates the target variable is dominated by "High" and lacks sufficient representation for "Low" and -999999999.


Analysis of Results:
Missing Data

Percentage of missing values: Each column has a small amount of missing data (around 0.39%-0.40%). This is negligible in a dataset of 10,000 rows, so dropping rows or columns for this level of missingness isn't necessary.
No rows or columns flagged for removal: This aligns with the minimal missingness.
Outliers and Invalid Values

The data contains extreme and likely invalid values:
numerical_var_1: A value of -1.000000e+09 (likely an invalid placeholder or error).
numerical_var_2: Similarly contains -1.000000e+09.
categorical_var: Includes -999999999, which seems invalid.
These invalid data points need to be addressed by replacing or removing them.

Target Variable

The target variable is heavily imbalanced:
Class "High": 90.32%.
Class "Low": 9.67%.
Class "-999999999": 0.01% (invalid and needs to be removed).
This imbalance suggests that resampling (e.g., oversampling "Low" or undersampling "High") may be needed for balanced training data.

In [7]:
#Clean the dataset by handling missing values and removing any invalid data points.
# .dropna() is used to remove rows that have any missing values

cleaned_data = data.dropna()
print(f"Rows after dropping missing values: {len(cleaned_data)}")
# The percentage of missing values is low (about 0.4%), so removing a small fraction of rows won't significantly impact the dataset.


Rows after dropping missing values: 9801


In [8]:
# Replace invalid numeric values with NaN, then drop these rows
# Define invalid values for numeric and categorical variables
invalid_numeric = -1.0e+09
invalid_categorical = "-999999999"

# Replace invalid numeric values with NaN and drop these rows
cleaned_data = cleaned_data.replace(invalid_numeric, pd.NA)
cleaned_data = cleaned_data.dropna()

# Remove rows with invalid categorical values
cleaned_data = cleaned_data[cleaned_data['categorical_var'] != invalid_categorical]

# Remove duplicate rows (if any)
cleaned_data = cleaned_data.drop_duplicates()

print(f"Rows after cleaning invalid values: {len(cleaned_data)}")

Rows after cleaning invalid values: 9800


In [10]:
# resample function is used to create upsampled or downsampled datasets.
!pip install scikit-learn

from sklearn.utils import resample
import sklearn
print(sklearn.__version__)

# Separate classes dynamically
class_counts = cleaned_data['categorical_var'].value_counts()#Computes the count of each unique value in the categorical_var column.
majority_class = class_counts.idxmax()  # Identifies the category with the maximum count (i.e., the majority class).
#Filters the dataset to include only rows where categorical_var equals the majority class.
majority = cleaned_data[cleaned_data['categorical_var'] == majority_class]

# Balance all other classes against the majority class
balanced_data = majority.copy()#Makes a copy of the majority class dataset to initialize balanced_data

for category in class_counts.index: #Iterates over each category (e.g., "High", "Low", "Medium") in the categorical_var column.
    if category != majority_class:#Ensures that the loop skips the majority class, as it's already included in balanced_data.
        category_data = cleaned_data[cleaned_data['categorical_var'] == category]#Filters the dataset to include only rows belonging to the current category (e.g., "Low"
        #Upsamples the category_data to match the size of the majority class (len(majority)):
        upsampled = resample(
            category_data,
            replace=True,#Allows sampling with replacement, enabling duplicates in the upsampled dataset.
            n_samples=len(majority),  # Sets the size of the upsampled dataset to equal the majority class size.
            random_state=42
        )
        #Appends the upsampled category data to the balanced_data dataframe.
        balanced_data = pd.concat([balanced_data, upsampled])

# sampling 100% of the rows in a randomized order.
# Shuffle the dataset after balancing
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Displays the count of each category in the balanced dataset to confirm all classes have the same size.
# Check class distribution
print("Class distribution after balancing:")
print(balanced_data['categorical_var'].value_counts())



1.6.1
Class distribution after balancing:
categorical_var
High    8848
Low     8848
Name: count, dtype: int64



[notice] A new release of pip is available: 23.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
#imports the train_test_split function for splitting the dataset.
from sklearn.model_selection import train_test_split

# X: Contains all columns except categorical_var, which represents the features.
# Contains only the categorical_var column, which is the target.
# Define features (X) and target (y)
X = balanced_data.drop(columns='categorical_var')
y = balanced_data['categorical_var']

# test_size=0.2: Allocates 20% of the dataset for testing.
# 
# Split into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# X_train and X_test are saved as X_train.csv and X_test.csv.
# y_train and y_test are saved as y_train.csv and y_test.csv.
# Save subsets as CSV files
X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

print("Training and test subsets saved as independent files.")


Training and test subsets saved as independent files.
