In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
pip install charset-normalizer



In [25]:
import pandas as pd
import os
from charset_normalizer import from_path

# Define directory with raw data CSV files
input_directory = '/content/drive/MyDrive/input_directory_liberator'
output_directory = '/content/drive/MyDrive/output_directory_liberator'

# Create output directory if it does not exist
os.makedirs(output_directory, exist_ok=True)

# List all CSV files in the input directory
csv_files = [file for file in os.listdir(input_directory) if file.endswith('.csv')]

# Define cleaning function
def clean_data(df):
    # Example cleaning steps
    for column in df.columns:
        if df[column].dtype == 'object':
            # Fill nulls in categorical columns with 'Unknown'
            df[column].fillna('Unknown', inplace=True)
        else:
            # Fill nulls in numerical columns with the median
            df[column].fillna(df[column].median(), inplace=True)
    return df

# Process each file
for file in csv_files:
    # Read the CSV file
    file_path = os.path.join(input_directory, file)

    result = from_path(file_path).best()
    encoding = result.encoding if result else 'utf-8'
    print(f"Detected encoding: {encoding}")

    # cp-1250 to utf-8
    # if encoding.lower() != 'utf-8':
    #     with open(file_path, 'r', encoding=encoding.lower()) as file:
    #         content = file.read()

    #     # Write content to a new file with UTF-8 encoding
    #     with open(file_path, 'w', encoding='utf-8') as file:
    #         file.write(content)

    df = pd.read_csv(file_path, encoding='utf-8')

    # Clean the data
    df_cleaned = clean_data(df)

    # Save the cleaned file
    output_path = os.path.join(output_directory, file)
    df_cleaned.to_csv(output_path, index=False)
    print(f"Cleaned data saved to {output_path}")

print("All files have been processed and cleaned.")


Detected encoding: ascii
Cleaned data saved to /content/drive/MyDrive/output_directory_liberator/Product Subcategories.csv
Detected encoding: ascii
Cleaned data saved to /content/drive/MyDrive/output_directory_liberator/Product Categories.csv
Detected encoding: ascii
Cleaned data saved to /content/drive/MyDrive/output_directory_liberator/Sales Data 2020.csv
Detected encoding: ascii
Cleaned data saved to /content/drive/MyDrive/output_directory_liberator/Product.csv
Detected encoding: ascii


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

Cleaned data saved to /content/drive/MyDrive/output_directory_liberator/Sales Data 2021.csv
Detected encoding: ascii
Cleaned data saved to /content/drive/MyDrive/output_directory_liberator/Sales Data 2022.csv
Detected encoding: utf_8


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

Cleaned data saved to /content/drive/MyDrive/output_directory_liberator/Customer.csv
All files have been processed and cleaned.
