In [3]:
# 1) ID number
# 2) Diagnosis (M = malignant, B = benign)
# 3-32) Real-valued features (the CSV loaded below keeps only the ten "_mean" columns)

# Ten real-valued features are computed for each cell nucleus:

# 	a) radius (mean of distances from center to points on the perimeter)
# 	b) texture (standard deviation of gray-scale values)
# 	c) perimeter
# 	d) area
# 	e) smoothness (local variation in radius lengths)
# 	f) compactness (perimeter^2 / area - 1.0)
# 	g) concavity (severity of concave portions of the contour)
# 	h) concave points (number of concave portions of the contour)
# 	i) symmetry 
# 	j) fractal dimension ("coastline approximation" - 1)

In [4]:
# Read the cancer dataset into a DataFrame
import pandas as pd
df = pd.read_csv(filepath_or_buffer='cancer_data_means.csv')

# info() lists each column's non-null count, which exposes columns with missing values
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      569 non-null    int64  
 1   diagnosis               569 non-null    object 
 2   radius_mean             569 non-null    float64
 3   texture_mean            548 non-null    float64
 4   perimeter_mean          569 non-null    float64
 5   area_mean               569 non-null    float64
 6   smoothness_mean         521 non-null    float64
 7   compactness_mean        569 non-null    float64
 8   concavity_mean          569 non-null    float64
 9   concave_points_mean     569 non-null    float64
 10  symmetry_mean           504 non-null    float64
 11  fractal_dimension_mean  569 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 53.5+ KB


There is missing data in the texture_mean, smoothness_mean, and symmetry_mean columns.

In [5]:
# Preview the first ten rows of the loaded data
df.head(n=10)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean
0,842302,M,17.99,,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,,0.2839,0.2414,0.1052,0.2597,0.09744
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883
5,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613
6,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,,0.05742
7,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451
8,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389
9,84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243


In [6]:
# Summary statistics for the numeric columns (quartiles spelled out; these are the defaults)
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean
count,569.0,569.0,548.0,569.0,569.0,521.0,569.0,569.0,569.0,504.0,569.0
mean,30514670.0,14.113021,19.293431,91.877909,653.288576,0.096087,0.104536,0.08862,0.048837,0.181091,0.062833
std,125041700.0,3.506148,4.327287,24.162787,349.476899,0.013924,0.052674,0.079011,0.038578,0.027899,0.007051
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996
25%,869104.0,11.7,16.1675,75.17,420.3,0.08605,0.06526,0.02956,0.02036,0.1618,0.0578
50%,906024.0,13.37,18.785,86.34,551.1,0.09578,0.09453,0.06155,0.0337,0.17895,0.06154
75%,8910251.0,15.78,21.825,103.8,782.7,0.1048,0.1305,0.1319,0.07404,0.19575,0.06615
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744


In [7]:
# Fill missing values with the mean of their own column.
# Some columns are not numeric (e.g. `diagnosis`), so the means are computed
# with numeric_only=True. fillna() aligns the resulting Series on column
# name, so each column is filled with its own mean and columns without
# missing values are left as they are.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

# confirm the correction with info(): every column should now be fully non-null
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      569 non-null    int64  
 1   diagnosis               569 non-null    object 
 2   radius_mean             569 non-null    float64
 3   texture_mean            569 non-null    float64
 4   perimeter_mean          569 non-null    float64
 5   area_mean               569 non-null    float64
 6   smoothness_mean         569 non-null    float64
 7   compactness_mean        569 non-null    float64
 8   concavity_mean          569 non-null    float64
 9   concave_points_mean     569 non-null    float64
 10  symmetry_mean           569 non-null    float64
 11  fractal_dimension_mean  569 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 53.5+ KB


In [8]:
# Count fully duplicated rows (vectorised sum of the boolean mask, as a plain int)
int(df.duplicated().sum())

5

In [9]:
# Drop duplicated rows, keeping the first occurrence of each.
# Reassigning instead of using inplace=True keeps the step explicit and
# chainable; the remaining rows and their index labels are the same.
df = df.drop_duplicates()


In [10]:
# Re-count duplicated rows to confirm the drop worked (expected: 0)
int(df.duplicated().sum())

0

Renaming Columns

Every column except for id and diagnosis has _mean at the end. Since we know that these are all means, this extra string seems unnecessary. Let's rename all columns to remove _mean from the end where applicable.

In [11]:
# Build the new column labels with the trailing "_mean" removed.
# The suffix is stripped only when a name actually ENDS with it; a plain
# substring test would also chop the last characters off a name that merely
# contains "_mean" somewhere in the middle.
suffix = '_mean'
new_labels = [
    col[:-len(suffix)] if col.endswith(suffix) else col
    for col in df.columns
]

# new labels for our columns
new_labels

['id',
 'diagnosis',
 'radius',
 'texture',
 'perimeter',
 'area',
 'smoothness',
 'compactness',
 'concavity',
 'concave_points',
 'symmetry',
 'fractal_dimension']

In [12]:
# Replace the DataFrame's column labels with the cleaned-up names
df.columns = new_labels

# Preview the first five rows to verify the rename took effect
df.head(n=5)

Unnamed: 0,id,diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave_points,symmetry,fractal_dimension
0,842302,M,17.99,19.293431,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,0.096087,0.2839,0.2414,0.1052,0.2597,0.09744
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [13]:
# Write the cleaned data to disk for later use; index=False leaves the row index out of the file
df.to_csv(path_or_buf='./cancer_data_edited.csv', index=False)