# Cleaning Practice
Let's first practice handling missing values and duplicate data using the `cancer_data_means.csv` file, which you created in a previous section.

In [4]:
# import pandas and load the cancer data (tumor feature means) into a DataFrame
import pandas as pd
df = pd.read_csv("cancer_data_means.csv")
# check which columns have missing values with info():
# a column whose non-null count is below the total number of entries has gaps
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 11 columns):
id                     569 non-null int64
diagnosis              569 non-null object
radius_mean            569 non-null float64
texture_mean           548 non-null float64
perimeter_mean         569 non-null float64
area_mean              569 non-null float64
smoothness_mean        521 non-null float64
compactness_mean       569 non-null float64
concavity_mean         569 non-null float64
concave_points_mean    569 non-null float64
symmetry_mean          504 non-null float64
dtypes: float64(9), int64(1), object(1)
memory usage: 49.0+ KB


In [7]:
# use means to fill in missing values
# Each filled column is assigned back to df rather than calling
# fillna(..., inplace=True) on df[col]: that form mutates a column selection,
# which newer pandas versions warn about and which leaves df unchanged when
# copy-on-write is enabled.
smoothness_mean = df["smoothness_mean"].mean()
df["smoothness_mean"] = df["smoothness_mean"].fillna(smoothness_mean)
symmetry_mean = df["symmetry_mean"].mean()
df["symmetry_mean"] = df["symmetry_mean"].fillna(symmetry_mean)
texture_mean = df["texture_mean"].mean()
df["texture_mean"] = df["texture_mean"].fillna(texture_mean)
# confirm your correction with info(): every column should now report the
# same non-null count as the total number of entries
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 11 columns):
id                     569 non-null int64
diagnosis              569 non-null object
radius_mean            569 non-null float64
texture_mean           569 non-null float64
perimeter_mean         569 non-null float64
area_mean              569 non-null float64
smoothness_mean        569 non-null float64
compactness_mean       569 non-null float64
concavity_mean         569 non-null float64
concave_points_mean    569 non-null float64
symmetry_mean          569 non-null float64
dtypes: float64(9), int64(1), object(1)
memory usage: 49.0+ KB


In [8]:
# check for duplicates in the data
# duplicated() returns a boolean Series with one entry per row; True marks a
# row that repeats an earlier row (the first occurrence stays False)
df.duplicated()

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
539    False
540    False
541    False
542    False
543    False
544    False
545    False
546    False
547    False
548    False
549    False
550    False
551    False
552    False
553    False
554    False
555    False
556    False
557    False
558     True
559    False
560    False
561    False
562    False
563    False
564    False
565    False
566    False
567    False
568    False
Length: 569, dtype: bool

In [11]:
# drop duplicates: keep the first occurrence of each repeated row and rebind
# df to the de-duplicated frame (original index labels are kept, not renumbered)
df = df.drop_duplicates()

In [12]:
# confirm correction by rechecking for duplicates in the data
# NOTE(review): the displayed Series is truncated to its first and last rows,
# so a remaining True in the middle would not be visible; df.duplicated().sum()
# gives a single count that can be checked against 0
df.duplicated()

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
538    False
539    False
540    False
541    False
542    False
543    False
544    False
545    False
546    False
547    False
548    False
549    False
550    False
551    False
552    False
553    False
554    False
555    False
556    False
557    False
559    False
560    False
561    False
562    False
563    False
564    False
565    False
566    False
567    False
568    False
Length: 564, dtype: bool

## Renaming Columns
Since we previously trimmed our dataset to include only the means of the tumor features, the "_mean" suffix at the end of each feature name is redundant. It just adds extra typing in our analysis later. Let's build a list of new labels to assign to our columns.

In [None]:
# remove the "_mean" suffix from column names
# Only names that END with the suffix are shortened; a substring test would
# also truncate names that merely contain "_mean" somewhere in the middle.
suffix = '_mean'
new_labels = [
    col[:-len(suffix)] if col.endswith(suffix) else col  # strip exactly the suffix (5 characters)
    for col in df.columns
]

# new labels for our columns
new_labels

In [None]:
# assign new labels to columns in dataframe
# new_labels holds one entry per column, in column order, so it lines up
# one-to-one with the existing df.columns
df.columns = new_labels

# display first few rows of dataframe to confirm changes
df.head()

In [None]:
# save this for later
# index=False leaves the row index out of the file, so only the data columns are written
df.to_csv('cancer_data_edited.csv', index=False)