**Data Dictionary

State:the name of state

Cancer : Sites	the name of specific cancer

Year : the year that the number indicates

Sex	: male or female

Race :	the race of people, including American Indian or Alaska Native, Asian or Pacific Islander, Black or African American, White, and Other Races and Unknown combined

Count :	the number of cancer incidence

Population : the population of the state

Age-Adjusted Rate : "Age-adjusted rates are calculated with age distribution ratios from the Year 2000 ""standard million"" population, and the rates are shown per 100,000 population.

An age-adjusted rate is a weighted average of the age-specific (crude) rates, where the weights are the proportions of persons in the corresponding age groups of a standard million population. The potential confounding effect of age is reduced when comparing age-adjusted rates computed using the same standard million population.
"


In [1]:
import pandas as pd
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

# Import supplementary visualization code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

In [2]:
data = pd.read_csv("cancer.csv")

# Data Exploration

In [3]:
# Print the first few entries of the dataset
display(data.head())

Unnamed: 0,States,Cancer Sites,Year,Sex,Race,Count,Population,Age-Adjusted Rate
0,Alabama,Thyroid,1999,Female,Black or African American,35.0,623475.0,6.3
1,Alabama,Thyroid,1999,Female,White,150.0,1640665.0,8.7
2,Alabama,Thyroid,1999,Male,White,58.0,1570643.0,3.6
3,Alabama,Thyroid,2000,Female,Black or African American,29.0,627439.0,4.9
4,Alabama,Thyroid,2000,Female,White,144.0,1643761.0,8.3


In [4]:
# Displaying last 2 rows
display(data.tail(2))

Unnamed: 0,States,Cancer Sites,Year,Sex,Race,Count,Population,Age-Adjusted Rate
19757,Wyoming,Melanoma of the Skin,2015,Female,White,71.0,270263.0,22.7
19758,Wyoming,Melanoma of the Skin,2015,Male,White,81.0,281674.0,26.4


### Taking the Features out with Count as the outcome

In [5]:
y = data['Count']

# Remove rows or columns by specifying label names and corresponding axis, or by specifying directly index or column names. 
X = data.drop('Count', axis = 1)

# Success
print("Cancer dataset has {} data points with {} variables each.".format(*data.shape))

Cancer dataset has 19759 data points with 8 variables each.


In [6]:
X

Unnamed: 0,States,Cancer Sites,Year,Sex,Race,Population,Age-Adjusted Rate
0,Alabama,Thyroid,1999,Female,Black or African American,623475.0,6.3
1,Alabama,Thyroid,1999,Female,White,1640665.0,8.7
2,Alabama,Thyroid,1999,Male,White,1570643.0,3.6
3,Alabama,Thyroid,2000,Female,Black or African American,627439.0,4.9
4,Alabama,Thyroid,2000,Female,White,1643761.0,8.3
...,...,...,...,...,...,...,...
19754,Wyoming,Melanoma of the Skin,2013,Male,White,279024.0,25.7
19755,Wyoming,Melanoma of the Skin,2014,Female,White,268844.0,21.5
19756,Wyoming,Melanoma of the Skin,2014,Male,White,279958.0,34.8
19757,Wyoming,Melanoma of the Skin,2015,Female,White,270263.0,22.7


##### Calculate Statistics

In [7]:
# Total number of records
n_records = data.shape[0]
print("Total number of records: {}".format(n_records))

Total number of records: 19759


In [8]:
# Split the data into features and target label
cancer_count = data['Count']
features_raw = data.drop('Count', axis = 1)

# Visualize skewed continuous features of original data
#vs.distribution(data)

In [9]:
# Max Cancer Patients
n_max_count = data.max().Count
print("Maximum number of Cancer Patients: {}".format(n_max_count))

Maximum number of Cancer Patients: 20662.0


In [10]:
# TODO: Minimum price of the data
minimum_count = np.min(data.Count) 

maximum_count = np.max(data.Count)

# TODO: Mean price of the data
mean_count = np.mean(data.Count)

# TODO: Median price of the data
median_count = np.median(data.Count)

# TODO: Standard deviation of prices of the data
std_count = np.std(data.Count)

# Show the calculated statistics
print("Statistics for cancer dataset:\n")
print("Minimum count: {}".format(minimum_count)) 
print("Maximum count: {}".format(maximum_count))
print("Mean count: {}".format(mean_count))
print("Median count {}".format(median_count))
print("Standard deviation of count: {}".format(std_count))

Statistics for cancer dataset:

Minimum count: 16.0
Maximum count: 20662.0
Mean count: 627.1274355989675
Median count 149.0
Standard deviation of count: 1347.8214449465163


In [11]:
#display(data.max())

### Data Pre Processing

##### Handling the missing Data

In [12]:
# Taking care of missing data
from sklearn import preprocessing

# view columns using df.columns
X.columns



Index(['States', 'Cancer Sites', 'Year', 'Sex', 'Race', 'Population',
       'Age-Adjusted Rate'],
      dtype='object')

In [13]:
# create a LabelEncoder object and fit it to each feature in X


# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
X_2 = X.apply(le.fit_transform)
X_2.head()
X_2.tail()

Unnamed: 0,States,Cancer Sites,Year,Sex,Race,Population,Age-Adjusted Rate
19754,50,3,14,1,4,1875,251
19755,50,3,15,0,4,1846,209
19756,50,3,15,1,4,1878,342
19757,50,3,16,0,4,1849,221
19758,50,3,16,1,4,1884,258


In [15]:
from sklearn.impute import SimpleImpute

imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean', verbose = 0)
imputer = imputer.fit(X[:, 5:7])
X[:, 5:7] = imputer.transform(X[:, 5:7])

ImportError: cannot import name 'SimpleImpute' from 'sklearn.impute' (C:\Users\nichaurasia\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\impute\__init__.py)

In [21]:
# create a OneHotEncoder object, and fit it to all of X

# 1. INSTANTIATE
enc = preprocessing.OneHotEncoder()

#X_2 = X_2.copy(order='C')
# 2. FIT
enc.fit(X_2)


# 3. Transform
onehotlabels = enc.transform(X_2,categories='auto').toarray()
onehotlabels.shape


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


TypeError: transform() got an unexpected keyword argument 'categories'

##### Encoding Categorical Variables

In [23]:
# Encoding the Independent Variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X = LabelEncoder()
X = labelencoder_X.fit_transform(X)

onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()


ValueError: bad input shape (19759, 7)

In [None]:
X[:, 5]


nitin



In [25]:
# Split the data into features and target label
count_raw = data['Count']
features_raw = data.drop('Count', axis = 1)

# Visualize skewed continuous features of original data
#vs.distribution(data)


KeyError: 'income'

In [28]:
# The "cross_validation" name is now deprecated and was replaced by "model_selection" 
# inside the new anaconda versions

# from sklearn.cross_validation import train_test_split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))


Training set has 15807 samples.
Testing set has 3952 samples.


In [27]:
X_train, X_test, y_train, y_test 

(                     States                           Cancer Sites  Year  \
 18047              Illinois                   Melanoma of the Skin  2010   
 11774              New York                       Colon and Rectum  2007   
 3223               Colorado  Urinary Bladder, invasive and in situ  2008   
 11442                Nevada                       Colon and Rectum  2006   
 11917        North Carolina                       Colon and Rectum  2012   
 ...                     ...                                    ...   ...   
 13123         West Virginia                       Colon and Rectum  2009   
 19648         West Virginia                   Melanoma of the Skin  2000   
 9845   District of Columbia                       Colon and Rectum  2005   
 10799              Maryland                       Colon and Rectum  2012   
 2732             Washington                                Thyroid  2004   
 
           Sex                              Race  Population  Age-Adjusted

In [30]:
TP = np.sum(Count) # Counting the ones as this is the naive case. Note that 'income' is the 'income_raw' data encoded to numerical values done in the data preprocessing step.
FP = Count.count() - TP # Specific to the naive case

TN = 0 # No predicted negatives in the naive case
FN = 0 # No predicted negatives in the naive case

# TODO: Calculate accuracy, precision and recall
accuracy = TP/income.count()
recall = TP/(TP+FP)
precision = TP/(TP+FN)

# TODO: Calculate F-score using the formula above for beta = 0.5 and correct values for precision and recall.
beta = 0.5
fscore = (1+(beta**2)) * ((precision*recall)/((precision*(beta**2))+recall))

# Print the results 
print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))

NameError: name 'Count' is not defined