In [1]:
import os  # Import the os module to interact with the file system
import pandas as pd  # Import pandas for data manipulation
import pandas as pd
os.environ["PYTHONUNBUFFERED"] = "1"

In [2]:
# Define the folder containing the CSV files
folder_path = "resources"

# Get a list of all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]


In [3]:
# Expected number of columns
expected_columns = 9
incorrect_files = []


In [4]:
for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)  # Get the full path of the file
    df = pd.read_csv(file_path,low_memory=False)  # Read the CSV file into a dataframe

    # Count the number of columns in the current CSV file
    num_columns = len(df.columns)

    # Print the number of columns and check if it differs from the expected number
    print(f"File: {csv_file} | Number of columns: {num_columns}")
    if num_columns > expected_columns:
        df = df.iloc[:, :expected_columns]  # Keep only the first 9 columns
        df.to_csv(file_path, index=False)  # Save the modified file
        print(f"Fixed: Removed extra column from {csv_file}")

# Print summary of files with incorrect column counts
if incorrect_files:
    print("\nSummary: The following files had an incorrect number of columns and were modified if necessary:")
    for file in incorrect_files:
        print(f"- {file}")
else:
    print("\nAll files have the correct number of columns.")

File: 2025-01 (1).csv | Number of columns: 9
File: 2024-11.csv | Number of columns: 9
File: 2024-10.csv | Number of columns: 9
File: 2024-12.csv | Number of columns: 9
File: 2024-06.csv | Number of columns: 9
File: datos_abiertos_2024_04.csv | Number of columns: 9
File: datos_abiertos_2024_07.csv | Number of columns: 9
File: 2024-09.csv | Number of columns: 9
File: 2024-08.csv | Number of columns: 9
File: 2024-05-1.csv | Number of columns: 9
File: 2025-02.csv | Number of columns: 9
File: datos_abiertos_2024_03-1-1.csv | Number of columns: 9

All files have the correct number of columns.


In [5]:
# Merge all CSV files, ignoring column headers
merged_df = pd.concat(
    [pd.read_csv(os.path.join(folder_path, f), header=None, skiprows=1, low_memory=False) for f in csv_files],
    ignore_index=True
)

# Define new column titles
new_column_names = [
    "Genero_Usuario", "Edad_Usuario", "Bici", "Ciclo_Estacion_Retiro",
    "Fecha_Retiro", "Hora_Retiro", "Ciclo_Estacion_Arribo", "Fecha_Arribo", "Hora_Arribo"
]

# Assign new column names
merged_df.columns = new_column_names

# Save the merged DataFrame as a new CSV file
merged_file_path = os.path.join(folder_path, "merged_file.csv")
merged_df.to_csv(merged_file_path, index=False)

print(f"\nMerged file created: {merged_file_path}")


Merged file created: resources/merged_file.csv


In [6]:
# Open and display the first few rows of the merged file
print("\nPreview of the merged file:")
merged_df.head()


Preview of the merged file:


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,26.0,5180930,568,31/12/2024,23:57:02,572,01/01/2025,00:00:03
1,F,54.0,3653953,283,31/12/2024,23:51:40,596,01/01/2025,00:00:41
2,M,38.0,7511322,34,31/12/2024,23:48:36,64,01/01/2025,00:00:59
3,M,41.0,3804572,258,31/12/2024,23:54:11,23,01/01/2025,00:01:08
4,M,35.0,3848405,43,31/12/2024,23:35:28,126,01/01/2025,00:01:17
