In [2]:
import pandas as pd
import os
 
# Load the raw CSV export.
df = pd.read_csv('expansion_outcome.csv')

# Display the first few rows of the dataframe to understand its structure
print("Original Data:")
print(df.head())

# Clean the data
# Impute missing values column by column: the mean for numeric columns and
# the mode (most frequent value) for every other column.
for column in df.columns:
    series = df[column]
    if pd.api.types.is_numeric_dtype(series):
        fill_value = series.mean()
    else:
        # Branching on "numeric or not" (rather than dtype == 'object') keeps
        # string-like columns stored under a non-object dtype away from
        # .mean(), which would raise on them.
        modes = series.mode()
        if modes.empty:
            # A column with no non-missing values has no mode; leave it
            # untouched instead of failing on an empty lookup.
            continue
        fill_value = modes.iloc[0]
    df[column] = series.fillna(fill_value)

# Remove duplicate rows. The explicit copy makes df_cleaned an independent
# frame, so the column assignment below cannot be flagged as a write to a
# slice of df (SettingWithCopyWarning on pandas without copy-on-write).
df_cleaned = df.drop_duplicates().copy()

# Convert score_date from day-month-year text to datetime values.
df_cleaned['score_date'] = pd.to_datetime(df_cleaned['score_date'], format='%d-%m-%Y')

# Display the cleaned data
print("\nCleaned Data:")
print(df_cleaned.head())

# Ensure the output directory exists; exist_ok avoids the check-then-create
# race of a separate os.path.exists() test.
output_dir = 'files'
os.makedirs(output_dir, exist_ok=True)

# Save the cleaned data to a new CSV file
output_path = os.path.join(output_dir, 'cleaned_expansion.csv')
df_cleaned.to_csv(output_path, index=False)

print("Data cleaning completed successfully!")
 

Original Data:
              id  score_date  current_arr  future_arr  arr_change  fx_impact  \
0   JMAN_2429072  10-02-2023      2976.00     2976.00        0.00       0.00   
1   JMAN_9043466  13-01-2023     20724.72    21474.84      499.56     250.56   
2   JMAN_2452556  24-02-2023       605.76      817.80      181.92      30.12   
3   JMAN_2455113  03-03-2023       300.00      300.00        0.00       0.00   
4  JMAN_18127937  10-02-2023      2100.00      420.00    -1680.00       0.00   

   seat_change_arr  product_change_arr  
0              0.0                0.00  
1              0.0              499.56  
2              0.0                0.00  
3              0.0                0.00  
4          -1680.0                0.00  

Cleaned Data:
              id score_date  current_arr  future_arr  arr_change  fx_impact  \
0   JMAN_2429072 2023-02-10      2976.00     2976.00        0.00       0.00   
1   JMAN_9043466 2023-01-13     20724.72    21474.84      499.56     250.56   
2   JM