In [None]:
import pandas as pd
import numpy as np

In [None]:
def clean_data(input_file, output_file):
    """Clean an applicants table and write the result to ``output_file``.

    The table is loaded from ``input_file`` and the following steps are
    applied in order:

    1. The four date columns are converted to datetimes and missing values
       are replaced with the sentinel ``1900-01-01``.
    2. Characters other than letters, digits, ``@``, ``,``, ``.``, ``-`` and
       whitespace are removed from ``Applicant Name`` and ``emailAddress``.
    3. The literal string ``'[]'`` in ``fit_to_account`` is replaced with the
       string ``'None'``.
    4. Missing values in the listed text columns are replaced with the
       string ``'None'``.
    5. Fully duplicated rows are printed (they are reported, not removed).

    Args:
        input_file: Path to a ``.csv`` or ``.xlsx`` file (extension matched
            case-insensitively).
        output_file: Destination path; the ``.csv`` / ``.xlsx`` extension
            (case-insensitive) selects the output format.

    Returns:
        None. If either extension is unsupported a message is printed and
        nothing is written.

    Raises:
        KeyError: If the input lacks one of the columns referenced below
            (raised by pandas when the column is looked up).
    """
    source_ext = input_file.lower()
    if source_ext.endswith('.csv'):
        df = pd.read_csv(input_file)
    elif source_ext.endswith('.xlsx'):
        df = pd.read_excel(input_file)
    else:
        print("Unsupported file format.")
        return

    # Unify the date columns to datetime and fill missing values with the
    # sentinel date. The sentinel is loop-invariant, so build it once.
    default_datetime = pd.to_datetime('1900-01-01')
    columns_to_convert = [
        'Board interview date',
        'Created',
        'Modified',
        'Technical interview date',
    ]
    for column in columns_to_convert:
        df[column] = pd.to_datetime(df[column]).fillna(default_datetime)

    # Remove special characters.
    # NOTE(review): this pattern also strips '_' and '+', which are legal in
    # email addresses - confirm that is intended for 'emailAddress'.
    allowed_chars_pattern = r'[^a-zA-Z0-9@\,\.\-\s]'
    for column in ('Applicant Name', 'emailAddress'):
        df[column] = df[column].str.replace(allowed_chars_pattern, '', regex=True)

    # Replace the empty-list marker '[]' with 'None'. Assign the result back:
    # calling replace(..., inplace=True) on a column selected from the frame
    # acts on a temporary object under pandas Copy-on-Write and would leave
    # the frame unchanged.
    df['fit_to_account'] = df['fit_to_account'].replace('[]', 'None')

    # Fill missing values in the text columns with the string 'None'.
    columns_to_handle = [
        'Applicant Name',
        'Contact number',
        'Interviewers names',
        'Years of experience',
    ]
    for column in columns_to_handle:
        df[column] = df[column].fillna('None')

    # Report (but do not drop) fully duplicated rows.
    duplicate_rows = df[df.duplicated()]
    if not duplicate_rows.empty:
        print("Duplicate rows found:")
        print(duplicate_rows)

    # Export in the format implied by the output file extension.
    target_ext = output_file.lower()
    if target_ext.endswith('.csv'):
        df.to_csv(output_file, index=False)
    elif target_ext.endswith('.xlsx'):
        df.to_excel(output_file, index=False)
    else:
        print("Unsupported output file format.")
        return

    print(f"Cleaned data saved to {output_file}")

In [None]:
# Source workbook and the destination for its cleaned copy.
input_file, output_file = (
    '/content/Applicants.xlsx',
    '/content/Applicants (Preprocessed).xlsx',
)

# Run the cleaning pipeline on the paths above.
clean_data(input_file=input_file, output_file=output_file)