In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Import the data from a CSV file
data = pd.read_csv('Breast_cancer_dataset.csv')

print("Original Data:")
print(data.head())

# Step 2: Data Cleaning and Preparation

# 2a. Drop duplicate rows
data = data.drop_duplicates()

# 2b. Fill missing values in 'Age' with the median age
if 'Age' in data.columns:
    median_age = data['Age'].median()
    data['Age'].fillna(median_age, inplace=True)

# 2c. Convert 'Date' to datetime format, coerce errors to NaT
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'], errors='coerce')

# 2d. Fill missing 'Score' with the mean score
if 'Score' in data.columns:
    mean_score = data['Score'].mean()
    data['Score'].fillna(mean_score, inplace=True)

print("\nCleaned Data:")
print(data.head())

# Step 3: Export cleaned data to a new CSV file
data.to_csv('cleaned_data.csv', index=False)
print("\nCleaned data exported to 'cleaned_data.csv'")

# Step 4: Visualization - Plot average score by month

# Extract month-year from Date for grouping (skip rows with invalid dates)
data_valid_dates = data.dropna(subset=['Date'])
data_valid_dates['Month-Year'] = data_valid_dates['Date'].dt.to_period('M')

# Group by Month-Year and calculate mean Score
monthly_avg_score = data_valid_dates.groupby('Month-Year')['Score'].mean()

# Plot
plt.figure(figsize=(10, 5))
monthly_avg_score.plot(kind='bar', color='skyblue')
plt.title('Average Score by Month-Year')
plt.xlabel('Month-Year')
plt.ylabel('Average Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


Original Data:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_wo

KeyError: ['Date']