In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import datetime as dt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

In [2]:
data = pd.read_csv('Big_Cities_Health_Data_Inventory.csv')

In [3]:
data.shape


(13512, 11)

In [4]:
data=data.drop_duplicates()


In [5]:
data.shape

(13511, 11)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13511 entries, 0 to 13511
Data columns (total 11 columns):
Indicator Category            13511 non-null object
Indicator                     13511 non-null object
Year                          13511 non-null object
Gender                        13511 non-null object
Race/ Ethnicity               13511 non-null object
Value                         13498 non-null float64
Place                         13511 non-null object
BCHC Requested Methodology    13004 non-null object
Source                        11221 non-null object
Methods                       4232 non-null object
Notes                         3541 non-null object
dtypes: float64(1), object(10)
memory usage: 1.2+ MB


In [7]:
data.isna().sum()

Indicator Category               0
Indicator                        0
Year                             0
Gender                           0
Race/ Ethnicity                  0
Value                           13
Place                            0
BCHC Requested Methodology     507
Source                        2290
Methods                       9279
Notes                         9970
dtype: int64

In [8]:
data.head()

Unnamed: 0,Indicator Category,Indicator,Year,Gender,Race/ Ethnicity,Value,Place,BCHC Requested Methodology,Source,Methods,Notes
0,HIV/AIDS,"AIDS Diagnoses Rate (Per 100,000 people)",2013,Both,All,30.4,"Atlanta (Fulton County), GA","AIDS cases diagnosed in 2012, 2013, 2014 (as a...",Diagnoses numbers were obtained from the Georg...,,
1,HIV/AIDS,"AIDS Diagnoses Rate (Per 100,000 people)",2012,Both,All,39.6,"Atlanta (Fulton County), GA","AIDS cases diagnosed in 2012, 2013, 2014 (as a...",Diagnoses numbers were obtained from the Georg...,,
2,HIV/AIDS,"AIDS Diagnoses Rate (Per 100,000 people)",2011,Both,All,41.7,"Atlanta (Fulton County), GA","AIDS cases diagnosed in 2012, 2013, 2014 (as a...",Diagnoses numbers were obtained from the Georg...,,
3,Cancer,All Types of Cancer Mortality Rate (Age-Adjust...,2013,Male,All,195.8,"Atlanta (Fulton County), GA","2012, 2013, 2014; per 100,000 population using...","National Center for Health Statistics (NCHS), CDC",,
4,Cancer,All Types of Cancer Mortality Rate (Age-Adjust...,2013,Female,All,135.5,"Atlanta (Fulton County), GA","2012, 2013, 2014; per 100,000 population using...","National Center for Health Statistics (NCHS), CDC",,


In [9]:
data['Indicator Category'].unique()

array(['HIV/AIDS', 'Cancer', 'Maternal and Child Health',
       'Life Expectancy and Death Rate (Overall)',
       'Nutrition, Physical Activity, & Obesity',
       'Behavioral Health/Substance Abuse', 'Injury and Violence',
       'Demographics', 'Infectious Disease', 'Tobacco', 'Food Safety'],
      dtype=object)

In [10]:
def validate_date(x):
    try:
        return 2019 - int(x)
    except:
        dlist=x.split("-")
        return int(dlist[1]) - int(dlist[0])

In [11]:
data['NO_Year'] = data['Year'].apply(lambda x:validate_date(x))

In [12]:
data['NO_Year'].unique()

array([ 6,  7,  8,  9,  2,  5,  4, 10,  1], dtype=int64)

In [13]:
data['Gender'].unique()

array(['Both', 'Male', 'Female'], dtype=object)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13511 entries, 0 to 13511
Data columns (total 12 columns):
Indicator Category            13511 non-null object
Indicator                     13511 non-null object
Year                          13511 non-null object
Gender                        13511 non-null object
Race/ Ethnicity               13511 non-null object
Value                         13498 non-null float64
Place                         13511 non-null object
BCHC Requested Methodology    13004 non-null object
Source                        11221 non-null object
Methods                       4232 non-null object
Notes                         3541 non-null object
NO_Year                       13511 non-null int64
dtypes: float64(1), int64(1), object(10)
memory usage: 1.3+ MB


In [15]:
data['Race/ Ethnicity'].unique()

array(['All', 'Black', 'White', 'Asian/PI', 'Hispanic', 'Multiracial',
       'Other', 'American Indian/Alaska Native', 'Native American'],
      dtype=object)

In [16]:
data['Place'].unique()

array(['Atlanta (Fulton County), GA', 'Cleveland, OH', 'Baltimore, MD',
       'Boston, MA', 'Portland (Multnomah County), OR', 'Chicago, IL',
       'San Diego County, CA', 'Dallas, TX', 'Denver, CO', 'Detroit, MI',
       'Kansas City, MO', 'Fort Worth (Tarrant County), TX',
       'Houston, TX', 'Seattle, WA', 'Washington, DC', 'Los Angeles, CA',
       'Las Vegas (Clark County), NV', 'Miami (Miami-Dade County), FL',
       'San Jose, CA', 'Minneapolis, MN', 'New York, NY',
       'Philadelphia, PA', 'Oakland, CA', 'Phoenix, AZ', 'Sacramento, CA',
       'San Antonio, TX', 'San Francisco, CA', 'U.S. Total',
       'Long Beach, CA'], dtype=object)

In [17]:
data['BCHC Requested Methodology'].fillna(data['BCHC Requested Methodology'].mode()[0],inplace = True)

In [18]:
data['BCHC Requested Methodology'].isna().count()

13511

In [19]:
data['Source'].isna().count()

13511

In [20]:
 date ='2003-2012'

In [21]:
data['Methods'].isna().count()

13511

In [22]:
data['Notes'].isna().count()

13511

In [23]:
data=pd.DataFrame(data)

In [24]:
data['Place'].apply(lambda x: x[:5]).unique()

array(['Atlan', 'Cleve', 'Balti', 'Bosto', 'Portl', 'Chica', 'San D',
       'Dalla', 'Denve', 'Detro', 'Kansa', 'Fort ', 'Houst', 'Seatt',
       'Washi', 'Los A', 'Las V', 'Miami', 'San J', 'Minne', 'New Y',
       'Phila', 'Oakla', 'Phoen', 'Sacra', 'San A', 'San F', 'U.S. ',
       'Long '], dtype=object)

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13511 entries, 0 to 13511
Data columns (total 12 columns):
Indicator Category            13511 non-null object
Indicator                     13511 non-null object
Year                          13511 non-null object
Gender                        13511 non-null object
Race/ Ethnicity               13511 non-null object
Value                         13498 non-null float64
Place                         13511 non-null object
BCHC Requested Methodology    13511 non-null object
Source                        11221 non-null object
Methods                       4232 non-null object
Notes                         3541 non-null object
NO_Year                       13511 non-null int64
dtypes: float64(1), int64(1), object(10)
memory usage: 1.3+ MB


In [26]:
data_preprocessed = data[data['Value'].notna()]

In [27]:
data_preprocessed.shape

(13498, 12)

In [28]:
catagorical_value = data_preprocessed[['Indicator Category','Gender','Race/ Ethnicity','Place','Indicator','BCHC Requested Methodology','Source','Year']]

In [29]:
catagorical_value

Unnamed: 0,Indicator Category,Gender,Race/ Ethnicity,Place,Indicator,BCHC Requested Methodology,Source,Year
0,HIV/AIDS,Both,All,"Atlanta (Fulton County), GA","AIDS Diagnoses Rate (Per 100,000 people)","AIDS cases diagnosed in 2012, 2013, 2014 (as a...",Diagnoses numbers were obtained from the Georg...,2013
1,HIV/AIDS,Both,All,"Atlanta (Fulton County), GA","AIDS Diagnoses Rate (Per 100,000 people)","AIDS cases diagnosed in 2012, 2013, 2014 (as a...",Diagnoses numbers were obtained from the Georg...,2012
2,HIV/AIDS,Both,All,"Atlanta (Fulton County), GA","AIDS Diagnoses Rate (Per 100,000 people)","AIDS cases diagnosed in 2012, 2013, 2014 (as a...",Diagnoses numbers were obtained from the Georg...,2011
3,Cancer,Male,All,"Atlanta (Fulton County), GA",All Types of Cancer Mortality Rate (Age-Adjust...,"2012, 2013, 2014; per 100,000 population using...","National Center for Health Statistics (NCHS), CDC",2013
4,Cancer,Female,All,"Atlanta (Fulton County), GA",All Types of Cancer Mortality Rate (Age-Adjust...,"2012, 2013, 2014; per 100,000 population using...","National Center for Health Statistics (NCHS), CDC",2013
5,Cancer,Both,All,"Atlanta (Fulton County), GA",All Types of Cancer Mortality Rate (Age-Adjust...,"2012, 2013, 2014; per 100,000 population using...","National Center for Health Statistics (NCHS), CDC",2013
6,Cancer,Male,All,"Atlanta (Fulton County), GA",All Types of Cancer Mortality Rate (Age-Adjust...,"2012, 2013, 2014; per 100,000 population using...","National Center for Health Statistics (NCHS), CDC",2012
7,Cancer,Female,All,"Atlanta (Fulton County), GA",All Types of Cancer Mortality Rate (Age-Adjust...,"2012, 2013, 2014; per 100,000 population using...","National Center for Health Statistics (NCHS), CDC",2012
8,Cancer,Both,All,"Atlanta (Fulton County), GA",All Types of Cancer Mortality Rate (Age-Adjust...,"2012, 2013, 2014; per 100,000 population using...","National Center for Health Statistics (NCHS), CDC",2012
9,Cancer,Male,All,"Atlanta (Fulton County), GA",All Types of Cancer Mortality Rate (Age-Adjust...,"2012, 2013, 2014; per 100,000 population using...","National Center for Health Statistics (NCHS), CDC",2011


In [30]:
one_hot_encoding = pd.get_dummies(catagorical_value)

In [31]:
one_hot_encoding

Unnamed: 0,Indicator Category_Behavioral Health/Substance Abuse,Indicator Category_Cancer,Indicator Category_Demographics,Indicator Category_Food Safety,Indicator Category_HIV/AIDS,Indicator Category_Infectious Disease,Indicator Category_Injury and Violence,Indicator Category_Life Expectancy and Death Rate (Overall),Indicator Category_Maternal and Child Health,"Indicator Category_Nutrition, Physical Activity, & Obesity",...,Year_2007-2012,Year_2008-2012,Year_2010,Year_2011,Year_2011-2012,Year_2011-2013,Year_2012,Year_2013,Year_2014,Year_2015
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [32]:
x = pd.concat([one_hot_encoding,data_preprocessed['NO_Year']],axis="columns")

In [33]:
x.shape

(13498, 530)

In [34]:
y = data_preprocessed['Value']

In [35]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [36]:
def model_fit(x_train,x_test,y_train,y_test,model):
    model.fit(x_train,y_train);
    y_train_predicated=model.predict(x_train);
    model.fit(x_test,y_test);
    y_test_predicated=model.predict(x_test)
    print("Mean Square Error : train :  ",mean_squared_error(y_train,y_train_predicated))
    print("Mean Square Error : test :  ",mean_squared_error(y_test,y_test_predicated))
    print("Root Mean Square Error : train :  ",np.sqrt(mean_squared_error(y_train,y_train_predicated)))
    print("Root Mean Square Error : test :  ",np.sqrt(mean_squared_error(y_test,y_test_predicated)))


In [37]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=84)

In [38]:
x_train.shape

(9448, 530)

In [39]:
x_test.shape

(4050, 530)

In [40]:
x.shape

(13498, 530)

In [41]:
linear = LinearRegression()

In [42]:
model_fit(x_train,x_test,y_train,y_test,linear)

Mean Square Error : train :   554606.4102255107
Mean Square Error : test :   600703.1065703704
Root Mean Square Error : train :   744.7190142768685
Root Mean Square Error : test :   775.0503896975798


In [43]:
poly_reg = PolynomialFeatures(degree=2)
x_train_poly=poly_reg.fit_transform(x_train)
x_test_poly=poly_reg.fit_transform(x_test)
poly_linear = LinearRegression()
model_fit(x_train_poly,x_test_poly,y_train,y_test,poly_linear);

Mean Square Error : train :   8389.115569787518
Mean Square Error : test :   345.72146789308385
Root Mean Square Error : train :   91.59211521625384
Root Mean Square Error : test :   18.593586740946026
