# Cleaning Practice
Let's first practice handling missing values and duplicate data using the `cancer_data_means.csv` file.

In [35]:
# import pandas and load cancer data
import pandas as pd
df=pd.read_csv('cancer_data_means.csv')
# check which columns have missing values
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      569 non-null    int64  
 1   diagnosis               569 non-null    object 
 2   radius_mean             569 non-null    float64
 3   texture_mean            548 non-null    float64
 4   perimeter_mean          569 non-null    float64
 5   area_mean               569 non-null    float64
 6   smoothness_mean         521 non-null    float64
 7   compactness_mean        569 non-null    float64
 8   concavity_mean          569 non-null    float64
 9   concave_points_mean     569 non-null    float64
 10  symmetry_mean           504 non-null    float64
 11  fractal_dimension_mean  569 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 53.5+ KB


In [10]:
# use the mean to fill in missing values
df['texture_mean'].fillna(df['texture_mean'].mean,inplace=True)
df['smoothness_mean'].fillna(df['smoothness_mean'].mean,inplace=True)
df['symmetry_mean'].fillna(df['symmetry_mean'].mean,inplace=True)
# confirm your correction 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      569 non-null    int64  
 1   diagnosis               569 non-null    object 
 2   radius_mean             569 non-null    float64
 3   texture_mean            569 non-null    object 
 4   perimeter_mean          569 non-null    float64
 5   area_mean               569 non-null    float64
 6   smoothness_mean         569 non-null    object 
 7   compactness_mean        569 non-null    float64
 8   concavity_mean          569 non-null    float64
 9   concave_points_mean     569 non-null    float64
 10  symmetry_mean           569 non-null    object 
 11  fractal_dimension_mean  569 non-null    float64
dtypes: float64(7), int64(1), object(4)
memory usage: 53.5+ KB


In [23]:
# how many duplicates are there ?
sum(df.duplicated())

0

In [12]:
# drop duplicates
df.drop_duplicates(inplace=True)

In [13]:
# confirm correction by rechecking for duplicates in the data
sum(df.duplicated())

0

## Renaming Columns
Since we also previously changed our dataset to only include means of tumor features, the "_mean" at the end of each feature seems unnecessary. It just takes extra time to type in our analysis later. Rename the columns of the dataframe to remove "_mean".

In [37]:
# rename the columns of the dataframe (remove _mean from the name of each column if found)

remove_mean = lambda x: x.replace('_mean', '') 

df = df.rename(columns=remove_mean)

In [38]:
# display first few rows of the dataframe to confirm changes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 569 non-null    int64  
 1   diagnosis          569 non-null    object 
 2   radius             569 non-null    float64
 3   texture            548 non-null    float64
 4   perimeter          569 non-null    float64
 5   area               569 non-null    float64
 6   smoothness         521 non-null    float64
 7   compactness        569 non-null    float64
 8   concavity          569 non-null    float64
 9   concave_points     569 non-null    float64
 10  symmetry           504 non-null    float64
 11  fractal_dimension  569 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 53.5+ KB


In [39]:
# save this for later as a csv file named "cancer_data_edited.csv" and set indexing to false (why ??)
df.to_csv('cancer_data_edited.csv', index=False)
# To make it ignore the index

In [40]:
df

Unnamed: 0,id,diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave_points,symmetry,fractal_dimension
0,842302,M,17.99,,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,,0.28390,0.24140,0.10520,0.2597,0.09744
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883
...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016
