In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the dataset
data = pd.read_csv('Downloads/uncleandata.csv')

In [4]:
# Check the number of rows and columns
num_rows = data.shape[0]
num_columns = data.shape[1]
print("Number of rows:", num_rows)
print("Number of columns:", num_columns)

Number of rows: 15099
Number of columns: 11


In [5]:
# Get the column names
column_names = data.columns.tolist()
print("Column names:", column_names)

Column names: ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'work_accident', 'left', 'promotion_last_5years', 'is_smoker', 'department', 'salary']


In [6]:
# Get the number of features and their names
num_features = num_columns - 1  # Excluding the target column ('left')
feature_names = column_names[:-1]
print("Number of features:", num_features)
print("Feature names:", feature_names)

Number of features: 10
Feature names: ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'work_accident', 'left', 'promotion_last_5years', 'is_smoker', 'department']


In [7]:
# Check for duplicates
num_duplicates = data.duplicated().sum()
if num_duplicates > 0:
    print("Number of duplicates:", num_duplicates)
    data.drop_duplicates(inplace=True)
    print("Duplicates removed.")
else:
    print("No duplicates found.")

Number of duplicates: 2840
Duplicates removed.


In [8]:
# Print the distributions of each feature
feature_distributions = data.describe(include='all').T
print(feature_distributions)

                         count unique    top   freq        mean       std  \
satisfaction_level     12259.0    NaN    NaN    NaN    0.628149  0.241893   
last_evaluation        12259.0    NaN    NaN    NaN    0.716666  0.168627   
number_project         12259.0    NaN    NaN    NaN    3.804389  1.170172   
average_montly_hours   11891.0    NaN    NaN    NaN  200.511732  48.84344   
time_spend_company     12109.0    NaN    NaN    NaN    3.380048  1.356414   
work_accident          12259.0    NaN    NaN    NaN    0.153928  0.360894   
left                     12259      2     no  10144         NaN       NaN   
promotion_last_5years  12259.0    NaN    NaN    NaN    0.016886  0.128848   
is_smoker                  235      2     no    180         NaN       NaN   
department               12259     10  sales   3321         NaN       NaN   
salary                   12259      3    low   5872         NaN       NaN   

                        min    25%    50%    75%    max  
satisfaction_leve

In [9]:
# Check for missing values
num_missing_values = data.isnull().sum().sum()
missing_features = data.columns[data.isnull().any()].tolist()
print("Number of missing values:", num_missing_values)
print("Features with missing values:", missing_features)

Number of missing values: 12542
Features with missing values: ['average_montly_hours', 'time_spend_company', 'is_smoker']


In [10]:
# Identify the feature with the largest number of missing values
feature_with_most_missing = data.isnull().sum().idxmax()
print("Feature with the largest number of missing values:", feature_with_most_missing)

# Drop the feature with the largest number of missing values
data.drop(columns=feature_with_most_missing, inplace=True)
print("Feature with the largest number of missing values dropped:", feature_with_most_missing)

# Fill missing values with the mean of numeric columns
numeric_columns = data.select_dtypes(include='number').columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())
print("Missing values filled with mean for numeric columns.")

Feature with the largest number of missing values: is_smoker
Feature with the largest number of missing values dropped: is_smoker
Missing values filled with mean for numeric columns.


In [11]:
# Check for missing values again
num_missing_values_after_fill = data.isnull().sum().sum()
print("Number of missing values after fill:", num_missing_values_after_fill)

Number of missing values after fill: 0


In [12]:
# Convert 'yes' and 'no' values in 'left' column to 1 and 0
data['left'] = data['left'].map({'yes': 1, 'no': 0})

In [13]:
# Save the resulting data into a file
data.to_csv('p1_data.csv', index=False)

In [14]:
# Divide the data into training set (80%) and test set (20%)
X = data.drop(columns=['left'])
y = data['left']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Number of data points in each set
num_train_samples = X_train.shape[0]
num_test_samples = X_test.shape[0]
print("Number of samples in training set:", num_train_samples)
print("Number of samples in test set:", num_test_samples)

# Stratified sampling based on the ratio of "yes" and "no" values of the target variable 'left'
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Number of data points in each set (stratified)
num_train_samples_strat = X_train_strat.shape[0]
num_test_samples_strat = X_test_strat.shape[0]
print("Number of samples in stratified training set:", num_train_samples_strat)
print("Number of samples in stratified test set:", num_test_samples_strat)

Number of samples in training set: 9807
Number of samples in test set: 2452
Number of samples in stratified training set: 9807
Number of samples in stratified test set: 2452
