# Import Libraries and Dataset 

In [1]:
# Import libraries

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# NOTE(review): confusion_matrix is a classification metric; calling it on the continuous
# targets of a regression raises "ValueError: continuous is not supported". It is not
# used in the cells visible here -- confirm whether this import is still needed.
from sklearn.metrics import confusion_matrix

In [2]:
# Read the CSV straight from the zip archive (pandas decompresses it while reading)

dataset = 'archive.zip'
df = pd.read_csv(
    dataset,
    compression='zip',
    header=0,
    sep=',',
    quotechar='"',
)
df.head(10)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Total,Total,OVR,OVERALL
1,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Gender,Male,GEN,MALE
2,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Gender,Female,GEN,FEMALE
3,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Education,Less than high school,EDU,EDUHS
4,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Education,High school graduate,EDU,EDUHSGRAD
5,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Education,Some college or technical school,EDU,EDUCOTEC
6,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Education,College graduate,EDU,EDUCOGRAD
7,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Age (years),18 - 24,AGEYR,AGEYR1824
8,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Age (years),25 - 34,AGEYR,AGEYR2534
9,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Age (years),35 - 44,AGEYR,AGEYR3544


In [3]:
df.describe()

Unnamed: 0,YearStart,YearEnd,Data_Value_Unit,Data_Value,Data_Value_Alt,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,LocationID
count,53392.0,53392.0,0.0,48346.0,48346.0,48346.0,48346.0,48346.0,53392.0
mean,2013.281465,2013.281465,,31.156681,31.156681,26.892227,35.989997,3889.19286,30.282215
std,1.6933,1.6933,,10.247033,10.247033,10.038584,11.205813,19829.42129,16.821318
min,2011.0,2011.0,,0.9,0.9,0.3,3.0,50.0,1.0
25%,2012.0,2012.0,,24.1,24.1,20.0,28.2,566.0,17.0
50%,2013.0,2013.0,,30.7,30.7,26.45,35.6,1209.0,30.0
75%,2015.0,2015.0,,37.0,37.0,32.9,42.2,2519.0,44.0
max,2016.0,2016.0,,77.6,77.6,69.5,87.7,476876.0,78.0


# Data Preprocessing

In [4]:
# Function to remove outliers using the interquartile range (IQR) rule
def remove_outliers(df, multiplier=2.5):
    """Drop, in place, every row that holds an outlier in any numeric column.

    Numeric columns are processed one after another. For each column, rows
    whose value lies outside ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]``
    are dropped from ``df``. Because rows are removed as the loop goes, the
    quartiles of a later column are computed on the rows that survived the
    earlier columns.

    Missing values never compare as below/above a bound, so rows with NaN in
    a column are kept by that column's check.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is modified in place.
    multiplier : float, default 2.5
        Width of the fences in IQR units (1.5 is the classic Tukey fence;
        the default of 2.5 is more permissive).

    Returns
    -------
    None
        The frame is mutated; nothing is returned.

    Notes
    -----
    Rows are dropped by index label, so the index is assumed to be unique;
    with duplicated labels every row sharing an outlier's label is removed.
    """
    for col in df.select_dtypes(include=np.number).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (multiplier * iqr)
        upper_bound = q3 + (multiplier * iqr)
        outlier_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
        df.drop(df.index[outlier_mask], inplace=True)
        

In [5]:
# check for null values
df.isnull().sum()

YearStart                         0
YearEnd                           0
LocationAbbr                      0
LocationDesc                      0
Datasource                        0
Class                             0
Topic                             0
Question                          0
Data_Value_Unit               53392
Data_Value_Type                   0
Data_Value                     5046
Data_Value_Alt                 5046
Data_Value_Footnote_Symbol    48346
Data_Value_Footnote           48346
Low_Confidence_Limit           5046
High_Confidence_Limit          5046
Sample_Size                    5046
Total                         51485
Age(years)                    41954
Education                     45764
Gender                        49578
Income                        40043
Race/Ethnicity                38136
GeoLocation                    1008
ClassID                           0
TopicID                           0
QuestionID                        0
DataValueTypeID             

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53392 entries, 0 to 53391
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   YearStart                   53392 non-null  int64  
 1   YearEnd                     53392 non-null  int64  
 2   LocationAbbr                53392 non-null  object 
 3   LocationDesc                53392 non-null  object 
 4   Datasource                  53392 non-null  object 
 5   Class                       53392 non-null  object 
 6   Topic                       53392 non-null  object 
 7   Question                    53392 non-null  object 
 8   Data_Value_Unit             0 non-null      float64
 9   Data_Value_Type             53392 non-null  object 
 10  Data_Value                  48346 non-null  float64
 11  Data_Value_Alt              48346 non-null  float64
 12  Data_Value_Footnote_Symbol  5046 non-null   object 
 13  Data_Value_Footnote         504

In [7]:
# check for unique values in Topic
df['Topic'].unique()

array(['Obesity / Weight Status', 'Fruits and Vegetables - Behavior',
       'Physical Activity - Behavior'], dtype=object)

In [8]:
# check for unique values in Class
df['Class'].unique()

array(['Obesity / Weight Status', 'Fruits and Vegetables',
       'Physical Activity'], dtype=object)

In [9]:
# check for unique values in Datasource
df['Datasource'].unique()

array(['Behavioral Risk Factor Surveillance System'], dtype=object)

In [10]:
# check for unique values in Data_Value_Unit
df['Data_Value_Unit'].unique()

array([nan])

In [11]:
# check for unique values in QuestionID
df['QuestionID'].unique()

array(['Q036', 'Q037', 'Q018', 'Q019', 'Q046', 'Q043', 'Q044', 'Q045',
       'Q047'], dtype=object)

In [12]:
# check for unique values in Data_Value_Footnote_Symbol
df['Data_Value_Footnote_Symbol'].unique()


array([nan, '~'], dtype=object)

In [13]:
# check for unique values in Data_Value_Footnote
df['Data_Value_Footnote'].unique()

array([nan, 'Data not available because sample size is insufficient.',
       'Data not available because sample size is insufficient.  If data only missing for the confidence interval, the confidence interval was not calculated.'],
      dtype=object)

In [14]:
# check for unique values in StratificationCategoryId1
df['StratificationCategoryId1'].unique()    



array(['OVR', 'GEN', 'EDU', 'AGEYR', 'INC', 'RACE'], dtype=object)

In [15]:
# check for unique values in StratificationID1
df['StratificationID1'].unique()

array(['OVERALL', 'MALE', 'FEMALE', 'EDUHS', 'EDUHSGRAD', 'EDUCOTEC',
       'EDUCOGRAD', 'AGEYR1824', 'AGEYR2534', 'AGEYR3544', 'AGEYR4554',
       'AGEYR5564', 'AGEYR65PLUS', 'INCLESS15', 'INC1525', 'INC2535',
       'INC3550', 'INC5075', 'INC75PLUS', 'INCNR', 'RACEWHT', 'RACEBLK',
       'RACEHIS', 'RACEASN', 'RACEHPI', 'RACENAA', 'RACE2PLUS', 'RACEOTH'],
      dtype=object)

In [16]:
# Drop the columns that are not needed for the analysis below.
# NOTE(review): 'High_Confidence_Limit ' ends with a space on purpose -- presumably that is
# how the raw CSV header spells it (drop() raises KeyError for an unknown label).
# Verify against df.columns before "fixing" the name.
df = df.drop(
    columns=[
        'Low_Confidence_Limit',
        'High_Confidence_Limit ',
        'YearEnd',
        'Topic',
        'Class',
        'Datasource',
        'Data_Value_Unit',
        'QuestionID',
        'ClassID',
        'TopicID',
        'DataValueTypeID',
        'Data_Value_Type',
        'Data_Value_Footnote_Symbol',
        'Data_Value_Footnote',
        'StratificationCategoryId1',
        'StratificationID1',
    ]
)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53392 entries, 0 to 53391
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   YearStart                53392 non-null  int64  
 1   LocationAbbr             53392 non-null  object 
 2   LocationDesc             53392 non-null  object 
 3   Question                 53392 non-null  object 
 4   Data_Value               48346 non-null  float64
 5   Data_Value_Alt           48346 non-null  float64
 6   Sample_Size              48346 non-null  float64
 7   Total                    1907 non-null   object 
 8   Age(years)               11438 non-null  object 
 9   Education                7628 non-null   object 
 10  Gender                   3814 non-null   object 
 11  Income                   13349 non-null  object 
 12  Race/Ethnicity           15256 non-null  object 
 13  GeoLocation              52384 non-null  object 
 14  LocationID            

In [18]:
df.head(10)

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,32.0,32.0,7304.0,Total,,,,,,"(32.84057112200048, -86.63186076199969)",1,Total,Total
1,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,32.3,32.3,2581.0,,,,Male,,,"(32.84057112200048, -86.63186076199969)",1,Gender,Male
2,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,31.8,31.8,4723.0,,,,Female,,,"(32.84057112200048, -86.63186076199969)",1,Gender,Female
3,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,33.6,33.6,1153.0,,,Less than high school,,,,"(32.84057112200048, -86.63186076199969)",1,Education,Less than high school
4,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,32.8,32.8,2402.0,,,High school graduate,,,,"(32.84057112200048, -86.63186076199969)",1,Education,High school graduate
5,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,33.8,33.8,1925.0,,,Some college or technical school,,,,"(32.84057112200048, -86.63186076199969)",1,Education,Some college or technical school
6,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,26.4,26.4,1812.0,,,College graduate,,,,"(32.84057112200048, -86.63186076199969)",1,Education,College graduate
7,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,16.3,16.3,356.0,,18 - 24,,,,,"(32.84057112200048, -86.63186076199969)",1,Age (years),18 - 24
8,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,35.2,35.2,598.0,,25 - 34,,,,,"(32.84057112200048, -86.63186076199969)",1,Age (years),25 - 34
9,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,35.5,35.5,865.0,,35 - 44,,,,,"(32.84057112200048, -86.63186076199969)",1,Age (years),35 - 44


In [19]:
# Find Unique values for StratificationCategory1
df['StratificationCategory1'].unique()


array(['Total', 'Gender', 'Education', 'Age (years)', 'Income',
       'Race/Ethnicity'], dtype=object)

In [20]:
# Find Unique values for Stratification1
df['Stratification1'].unique()


array(['Total', 'Male', 'Female', 'Less than high school',
       'High school graduate', 'Some college or technical school',
       'College graduate', '18 - 24', '25 - 34', '35 - 44', '45 - 54',
       '55 - 64', '65 or older', 'Less than $15,000', '$15,000 - $24,999',
       '$25,000 - $34,999', '$35,000 - $49,999', '$50,000 - $74,999',
       '$75,000 or greater', 'Data not reported', 'Non-Hispanic White',
       'Non-Hispanic Black', 'Hispanic', 'Asian',
       'Hawaiian/Pacific Islander', 'American Indian/Alaska Native',
       '2 or more races', 'Other'], dtype=object)

In [21]:
# Create one dataframe per stratification of interest: gender, education and income

DataFrame_Gender = df.loc[df['Stratification1'].isin(['Male', 'Female'])]
DataFrame_Education = df.loc[df['StratificationCategory1'] == 'Education']
DataFrame_Income = df.loc[df['StratificationCategory1'] == 'Income']

In [22]:
# Give each stratified dataframe a fresh 0..n-1 index (the old index is discarded)

DataFrame_Gender, DataFrame_Education, DataFrame_Income = (
    frame.reset_index(drop=True)
    for frame in (DataFrame_Gender, DataFrame_Education, DataFrame_Income)
)

In [23]:
DataFrame_Education.head(10)

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,33.6,33.6,1153.0,,,Less than high school,,,,"(32.84057112200048, -86.63186076199969)",1,Education,Less than high school
1,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,32.8,32.8,2402.0,,,High school graduate,,,,"(32.84057112200048, -86.63186076199969)",1,Education,High school graduate
2,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,33.8,33.8,1925.0,,,Some college or technical school,,,,"(32.84057112200048, -86.63186076199969)",1,Education,Some college or technical school
3,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,26.4,26.4,1812.0,,,College graduate,,,,"(32.84057112200048, -86.63186076199969)",1,Education,College graduate
4,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,33.2,33.2,1153.0,,,Less than high school,,,,"(32.84057112200048, -86.63186076199969)",1,Education,Less than high school
5,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,34.1,34.1,2402.0,,,High school graduate,,,,"(32.84057112200048, -86.63186076199969)",1,Education,High school graduate
6,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,35.0,35.0,1925.0,,,Some college or technical school,,,,"(32.84057112200048, -86.63186076199969)",1,Education,Some college or technical school
7,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,36.8,36.8,1812.0,,,College graduate,,,,"(32.84057112200048, -86.63186076199969)",1,Education,College graduate
8,2011,AL,Alabama,Percent of adults who report consuming fruit l...,36.1,36.1,1844.0,,,College graduate,,,,"(32.84057112200048, -86.63186076199969)",1,Education,College graduate
9,2011,AL,Alabama,Percent of adults who report consuming fruit l...,41.2,41.2,1912.0,,,Some college or technical school,,,,"(32.84057112200048, -86.63186076199969)",1,Education,Some college or technical school


In [24]:
DataFrame_Income.head(10)

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,38.5,38.5,1112.0,,,,,"Less than $15,000",,"(32.84057112200048, -86.63186076199969)",1,Income,"Less than $15,000"
1,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,34.8,34.8,1367.0,,,,,"$15,000 - $24,999",,"(32.84057112200048, -86.63186076199969)",1,Income,"$15,000 - $24,999"
2,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,35.8,35.8,757.0,,,,,"$25,000 - $34,999",,"(32.84057112200048, -86.63186076199969)",1,Income,"$25,000 - $34,999"
3,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,32.3,32.3,861.0,,,,,"$35,000 - $49,999",,"(32.84057112200048, -86.63186076199969)",1,Income,"$35,000 - $49,999"
4,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,34.1,34.1,785.0,,,,,"$50,000 - $74,999",,"(32.84057112200048, -86.63186076199969)",1,Income,"$50,000 - $74,999"
5,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,28.8,28.8,1125.0,,,,,"$75,000 or greater",,"(32.84057112200048, -86.63186076199969)",1,Income,"$75,000 or greater"
6,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,23.8,23.8,1297.0,,,,,Data not reported,,"(32.84057112200048, -86.63186076199969)",1,Income,Data not reported
7,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,28.4,28.4,1112.0,,,,,"Less than $15,000",,"(32.84057112200048, -86.63186076199969)",1,Income,"Less than $15,000"
8,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,31.9,31.9,1367.0,,,,,"$15,000 - $24,999",,"(32.84057112200048, -86.63186076199969)",1,Income,"$15,000 - $24,999"
9,2011,AL,Alabama,Percent of adults aged 18 years and older who ...,35.7,35.7,757.0,,,,,"$25,000 - $34,999",,"(32.84057112200048, -86.63186076199969)",1,Income,"$25,000 - $34,999"


In [25]:
# find unique values for LocationDesc
df['LocationDesc'].unique()

array(['Alabama', 'National', 'Alaska', 'Arizona', 'Arkansas',
       'California', 'Connecticut', 'Colorado', 'Delaware', 'Florida',
       'District of Columbia', 'Georgia', 'Guam', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina',
       'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
       'Virginia', 'West Virginia', 'Washington', 'Wisconsin', 'Wyoming',
       'Virgin Islands'], dtype=object)

In [26]:
# find unique value for Question
df['Question'].unique()

array(['Percent of adults aged 18 years and older who have obesity',
       'Percent of adults aged 18 years and older who have an overweight classification',
       'Percent of adults who report consuming fruit less than one time daily',
       'Percent of adults who report consuming vegetables less than one time daily',
       'Percent of adults who engage in muscle-strengthening activities on 2 or more days a week',
       'Percent of adults who achieve at least 150 minutes a week of moderate-intensity aerobic physical activity or 75 minutes a week of vigorous-intensity aerobic activity (or an equivalent combination)',
       'Percent of adults who achieve at least 150 minutes a week of moderate-intensity aerobic physical activity or 75 minutes a week of vigorous-intensity aerobic physical activity and engage in muscle-strengthening activities on 2 or more days a week',
       'Percent of adults who achieve at least 300 minutes a week of moderate-intensity aerobic physical activity 

In [27]:
# Questions kept for the gender analysis.
Q = [
    'Percent of adults aged 18 years and older who have obesity',
    'Percent of adults aged 18 years and older who have an overweight classification',
    'Percent of adults who engage in no leisure-time physical activity',
]

# Keep only the rows whose question is listed in Q. Series.isin is the vectorised
# membership test and replaces a per-row Python lambda.
DataFrame_Gender = DataFrame_Gender[DataFrame_Gender['Question'].isin(Q)]

# Integer code for each retained question (0, 1, 2 in the order listed).
QD = dict(
    zip(
        (
            'Percent of adults aged 18 years and older who have obesity',
            'Percent of adults aged 18 years and older who have an overweight classification',
            'Percent of adults who engage in no leisure-time physical activity',
        ),
        range(3),
    )
)

# Integer code for each location name. The codes follow the order in which the names
# are listed below, which matches the df['LocationDesc'].unique() output shown earlier
# in the notebook (note: not alphabetical, e.g. 'Connecticut' precedes 'Colorado').
locationDict = {
    name: code
    for code, name in enumerate(
        (
            'Alabama',
            'National',
            'Alaska',
            'Arizona',
            'Arkansas',
            'California',
            'Connecticut',
            'Colorado',
            'Delaware',
            'Florida',
            'District of Columbia',
            'Georgia',
            'Guam',
            'Hawaii',
            'Idaho',
            'Illinois',
            'Indiana',
            'Iowa',
            'Kansas',
            'Kentucky',
            'Louisiana',
            'Maine',
            'Maryland',
            'Massachusetts',
            'Michigan',
            'Minnesota',
            'Mississippi',
            'Missouri',
            'Montana',
            'Nebraska',
            'Nevada',
            'New Hampshire',
            'New Jersey',
            'New Mexico',
            'New York',
            'North Carolina',
            'North Dakota',
            'Ohio',
            'Oklahoma',
            'Oregon',
            'Pennsylvania',
            'Puerto Rico',
            'Rhode Island',
            'South Carolina',
            'South Dakota',
            'Tennessee',
            'Texas',
            'Utah',
            'Vermont',
            'Virginia',
            'West Virginia',
            'Washington',
            'Wisconsin',
            'Wyoming',
            'Virgin Islands',
        )
    )
}



In [28]:
# Start with DataFrame_Gender.

# Drop the columns the gender model does not use: the stratification columns
# (the other stratification columns are empty in the gender rows previewed earlier)
# and the location abbreviation / coordinates.
DataFrame_Gender = DataFrame_Gender.drop(
    columns=[
        'StratificationCategory1',
        'Stratification1',
        'Race/Ethnicity',
        'Income',
        'Education',
        'Age(years)',
        'Total',
        'LocationAbbr',
        'GeoLocation',
    ]
)


In [29]:
DataFrame_Gender.head(10)

Unnamed: 0,YearStart,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Gender,LocationID
0,2011,Alabama,Percent of adults aged 18 years and older who ...,32.3,32.3,2581.0,Male,1
1,2011,Alabama,Percent of adults aged 18 years and older who ...,31.8,31.8,4723.0,Female,1
2,2011,Alabama,Percent of adults aged 18 years and older who ...,39.0,39.0,2581.0,Male,1
3,2011,Alabama,Percent of adults aged 18 years and older who ...,30.5,30.5,4723.0,Female,1
14,2012,National,Percent of adults aged 18 years and older who ...,27.4,27.4,257190.0,Female,59
17,2011,Alabama,Percent of adults who engage in no leisure-tim...,29.6,29.6,2531.0,Male,1
18,2011,Alabama,Percent of adults who engage in no leisure-tim...,35.3,35.3,4939.0,Female,1
19,2012,Alabama,Percent of adults aged 18 years and older who ...,31.8,31.8,3016.0,Male,1
20,2012,Alabama,Percent of adults aged 18 years and older who ...,34.1,34.1,5635.0,Female,1
21,2012,Alabama,Percent of adults who engage in no leisure-tim...,30.4,30.4,5958.0,Female,1


In [30]:
DataFrame_Gender.shape

(1918, 8)

In [31]:
# Encode the categorical data
labelencoder = LabelEncoder()

# LabelEncoder numbers the classes in sorted order, so:
# Female = 0
# Male = 1
DataFrame_Gender['Gender'] = labelencoder.fit_transform(DataFrame_Gender['Gender'])

# One-hot encode the question text and the location name, then append both sets of
# indicator columns to the right of the existing columns.
# NOTE(review): every dummy level is kept (no drop_first), so each set of indicators
# sums to 1 and is collinear with a model intercept -- confirm this is intended.
DataFrame_Gender = pd.concat(
    [
        DataFrame_Gender,
        pd.get_dummies(DataFrame_Gender['Question'], dtype=int),
        pd.get_dummies(DataFrame_Gender['LocationDesc'], dtype=int),
    ],
    axis=1,
)

DataFrame_Gender

Unnamed: 0,YearStart,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Gender,LocationID,Percent of adults aged 18 years and older who have an overweight classification,Percent of adults aged 18 years and older who have obesity,...,Tennessee,Texas,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2011,Alabama,Percent of adults aged 18 years and older who ...,32.3,32.3,2581.0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2011,Alabama,Percent of adults aged 18 years and older who ...,31.8,31.8,4723.0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2011,Alabama,Percent of adults aged 18 years and older who ...,39.0,39.0,2581.0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2011,Alabama,Percent of adults aged 18 years and older who ...,30.5,30.5,4723.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
14,2012,National,Percent of adults aged 18 years and older who ...,27.4,27.4,257190.0,0,59,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3809,2016,Guam,Percent of adults who engage in no leisure-tim...,34.5,34.5,877.0,0,66,0,0,...,0,0,0,0,0,0,0,0,0,0
3810,2016,Puerto Rico,Percent of adults who engage in no leisure-tim...,34.5,34.5,2160.0,1,72,0,0,...,0,0,0,0,0,0,0,0,0,0
3811,2016,Puerto Rico,Percent of adults who engage in no leisure-tim...,48.1,48.1,3630.0,0,72,0,0,...,0,0,0,0,0,0,0,0,0,0
3812,2016,Virgin Islands,Percent of adults who engage in no leisure-tim...,23.5,23.5,483.0,1,78,0,0,...,0,0,0,0,1,0,0,0,0,0


In [32]:
# The text columns are now represented by their indicator columns, and Data_Value_Alt
# is dropped alongside them; rebind the name instead of mutating in place.
DataFrame_Gender = DataFrame_Gender.drop(
    columns=['Question', 'LocationDesc', 'Data_Value_Alt']
)
DataFrame_Gender

Unnamed: 0,YearStart,Data_Value,Sample_Size,Gender,LocationID,Percent of adults aged 18 years and older who have an overweight classification,Percent of adults aged 18 years and older who have obesity,Percent of adults who engage in no leisure-time physical activity,Alabama,Alaska,...,Tennessee,Texas,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2011,32.3,2581.0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2011,31.8,4723.0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2011,39.0,2581.0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2011,30.5,4723.0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
14,2012,27.4,257190.0,0,59,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3809,2016,34.5,877.0,0,66,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3810,2016,34.5,2160.0,1,72,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3811,2016,48.1,3630.0,0,72,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3812,2016,23.5,483.0,1,78,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [33]:
DataFrame_Gender.shape


(1918, 63)

In [34]:
# Count fully duplicated rows (this only reports them; nothing is removed)
print(DataFrame_Gender.duplicated().sum())
# No duplicate rows exist

0


In [35]:
DataFrame_Gender.isnull().sum()

YearStart        0
Data_Value       0
Sample_Size      0
Gender           0
LocationID       0
                ..
Virginia         0
Washington       0
West Virginia    0
Wisconsin        0
Wyoming          0
Length: 63, dtype: int64

In [36]:
DataFrame_Gender.head(10)


Unnamed: 0,YearStart,Data_Value,Sample_Size,Gender,LocationID,Percent of adults aged 18 years and older who have an overweight classification,Percent of adults aged 18 years and older who have obesity,Percent of adults who engage in no leisure-time physical activity,Alabama,Alaska,...,Tennessee,Texas,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2011,32.3,2581.0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2011,31.8,4723.0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2011,39.0,2581.0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2011,30.5,4723.0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
14,2012,27.4,257190.0,0,59,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,2011,29.6,2531.0,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
18,2011,35.3,4939.0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
19,2012,31.8,3016.0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
20,2012,34.1,5635.0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
21,2012,30.4,5958.0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Split the data into train and test sets (80/20), stratified on YearStart so that
# every survey year is represented proportionally in both splits.
# pop() removes the target column from DataFrame_Gender in place, leaving only the
# feature columns; re-running this cell on the same frame raises KeyError because
# 'Data_Value' is already gone.
y = DataFrame_Gender.pop('Data_Value')
x_train,x_test,y_train,y_test = train_test_split(
    DataFrame_Gender,y,test_size=0.2,random_state=0,stratify=DataFrame_Gender['YearStart']) 



In [38]:
# Linear regression model: ordinary least squares fitted on the training split


Model = LinearRegression()

Model.fit(x_train,y_train)


In [39]:
# Predict Data_Value for the held-out test features
y_pred = Model.predict(x_test)


In [40]:
# R^2 (coefficient of determination) of the model on the held-out test split
Model.score(x_test,y_test)

0.5701788629115581

In [41]:
# Calculate the mean squared error on the test split
# (Data_Value is a percentage, so this is in squared percentage points)
mse = mean_squared_error(y_test,y_pred)
print(mse)


18.356353689742985


In [42]:
# Calculate the root mean squared error (same units as Data_Value: percentage points)
rmse = np.sqrt(mse)
print(rmse)


4.2844315480286275


ValueError: continuous is not supported