In [8]:
# Importing required libraries
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('C:\\Users\\ADMIN\\Downloads\\career_change_prediction_dataset.csv')

# Step 1: Rename 'work-life_balance' to 'work_life_balance'
df.rename(columns={'Work-Life Balance':'Work_Life_Balance'}, inplace=True)

print("Columns in the dataset:")
print(df.columns)

# Step 2: Convert 'industry_growth_rate' values from text to decimals
growth_rate_mapping = {'High': 1.0, 'Medium': 0.5, 'Low': 0.1}
df['Industry Growth Rate'] = df['Industry Growth Rate'].str.strip().replace(growth_rate_mapping)

# Step 3: Check basic information
print("Initial Dataset Overview")
print(df.head(10))
print(df.info())
print(df.describe())

# Step 4: Remove duplicates
print(f"Number of duplicate rows: {df.duplicated().sum()}")
df = df.drop_duplicates()

# Step 5: Check and handle missing values
print("\nMissing Values by Column:")
print(df.isnull().sum())

# Handle missing values in 'Family Influence'
df['Family Influence'] = df['Family Influence'].replace(np.nan, 'None')

# Step 6: Clean and standardize categorical columns
# Standardize 'Field of Study' and 'Current Occupation'
df['Field of Study'] = df['Field of Study'].str.strip().str.title()
df['Current Occupation'] = df['Current Occupation'].str.strip().str.title()

# Standardize 'Gender'
df['Gender'] = df['Gender'].str.strip().str.capitalize()

# Standardize 'Family Influence'
df['Family Influence'] = df['Family Influence'].str.strip().str.lower()

# Step 7: Validate and clean numeric columns
# Ensure values in 'Age' are within a reasonable range
df = df[(df['Age'] >= 15) & (df['Age'] <= 100)]

# Validate 'Years of Experience'
df = df[df['Years of Experience'] <= df['Age']]

# Normalize numeric columns if needed
numeric_columns = [
    'Job Satisfaction', 'Work_Life_Balance', 'Job Opportunities', 
    'Skills Gap', 'Technology Adoption', 'Career Change Events'
]
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(df[col].median())  # Fill missing values with median

# Step 8: Standardize binary columns
binary_map = {'Yes': 1, 'No': 0}
binary_columns = ['Mentorship Available', 'Certifications', 'Freelancing Experience', 'Geographic Mobility']
for col in binary_columns:
    df[col] = df[col].replace(binary_map)

# Step 9: Normalize column names for MySQL compatibility
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Step 10: Final check for missing values and unique values in key columns
print("\nFinal Missing Values Check:")
print(df.isnull().sum())

print("\nUnique Values in 'Family Influence':")
print(df['family_influence'].unique())

# Step 11: Export cleaned dataset
df.to_csv('C:\\Users\\ADMIN\\Downloads\\cleaned_dataset.csv', index=False, encoding='utf-8')

print("\nData cleaning complete. Cleaned dataset exported.")


Columns in the dataset:
Index(['Field of Study', 'Current Occupation', 'Age', 'Gender',
       'Years of Experience', 'Education Level', 'Industry Growth Rate',
       'Job Satisfaction', 'Work_Life_Balance', 'Job Opportunities', 'Salary',
       'Job Security', 'Career Change Interest', 'Skills Gap',
       'Family Influence', 'Mentorship Available', 'Certifications',
       'Freelancing Experience', 'Geographic Mobility',
       'Professional Networks', 'Career Change Events', 'Technology Adoption',
       'Likely to Change Occupation'],
      dtype='object')
Initial Dataset Overview
     Field of Study  Current Occupation  Age  Gender  Years of Experience  \
0          Medicine    Business Analyst   48    Male                    7   
1         Education           Economist   44    Male                   26   
2         Education           Biologist   21  Female                   27   
3         Education    Business Analyst   33    Male                   14   
4              Arts   

  df['Industry Growth Rate'] = df['Industry Growth Rate'].str.strip().replace(growth_rate_mapping)



Missing Values by Column:
Field of Study                    0
Current Occupation                0
Age                               0
Gender                            0
Years of Experience               0
Education Level                   0
Industry Growth Rate              0
Job Satisfaction                  0
Work_Life_Balance                 0
Job Opportunities                 0
Salary                            0
Job Security                      0
Career Change Interest            0
Skills Gap                        0
Family Influence               9632
Mentorship Available              0
Certifications                    0
Freelancing Experience            0
Geographic Mobility               0
Professional Networks             0
Career Change Events              0
Technology Adoption               0
Likely to Change Occupation       0
dtype: int64

Final Missing Values Check:
field_of_study                 0
current_occupation             0
age                            0
gend

In [5]:
df.columns

Index(['field_of_study', 'current_occupation', 'age', 'gender',
       'years_of_experience', 'education_level', 'industry_growth_rate',
       'job_satisfaction', 'work-life_balance', 'job_opportunities', 'salary',
       'job_security', 'career_change_interest', 'skills_gap',
       'family_influence', 'mentorship_available', 'certifications',
       'freelancing_experience', 'geographic_mobility',
       'professional_networks', 'career_change_events', 'technology_adoption',
       'likely_to_change_occupation'],
      dtype='object')