EDIT OVERVIEW OF PROJECT

In [29]:
# Import packages needed for analysis of data
import pandas as pd
import plotly.express as px
import re
from matplotlib import pyplot as plt 
import numpy as np 
from scipy import stats 
import seaborn as sns

In [30]:
# Print info of DataFrame to get all attributes of table at once
DF = pd.read_csv("Life Expectancy Data.csv")
print(DF.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [31]:
# It is good practice to ensure all columns follow the same name structure, so we need to ensure they are all string, lowercase, and without any white spaces.
DF.columns = DF.columns.str.strip().str.lower()

In [32]:
# Ensure that columns are all strings, lower cases, and have no white spaces.
print(DF.columns)

Index(['country', 'year', 'status', 'life expectancy', 'adult mortality',
       'infant deaths', 'alcohol', 'percentage expenditure', 'hepatitis b',
       'measles', 'bmi', 'under-five deaths', 'polio', 'total expenditure',
       'diphtheria', 'hiv/aids', 'gdp', 'population', 'thinness  1-19 years',
       'thinness 5-9 years', 'income composition of resources', 'schooling'],
      dtype='object')


In [33]:
# Now that leading and trailing spaces have been removed, can rename all columns
DF = DF.rename(
    columns={
        'life expectancy': 'life_expect',
        'adult mortality': 'adult_mortal',
        'infant deaths': 'infant_death',
        'percentage expenditure': 'percent_expend',
        'hepatitis_b': 'hep_b',
        'measles_': 'measles',
        '_bmi_': 'bmi',
        'under-five deaths': 'under_five_deaths',
        'total expenditure': 'total_expend',
        'diptheria ': 'diptheria',
        '_hiv/aids': 'hiv/aids',
        'thinness  1-19 years' : 'thinness_1-19_yrs',
        'thinness 5-9 years': 'thinness_5-9_yrs',
        'income composition of resources': 'income_comp_resources'

    }
)

In [34]:
print(DF.columns)

Index(['country', 'year', 'status', 'life_expect', 'adult_mortal',
       'infant_death', 'alcohol', 'percent_expend', 'hepatitis b', 'measles',
       'bmi', 'under_five_deaths', 'polio', 'total_expend', 'diphtheria',
       'hiv/aids', 'gdp', 'population', 'thinness_1-19_yrs',
       'thinness_5-9_yrs', 'income_comp_resources', 'schooling'],
      dtype='object')


In [35]:
print(DF.isna().sum())
# Due to the enormous amount of data that is not missing from these columns, we will not drop any values as they is a significant amount of data provided, nor will we put 'N/A' or NaN as this may cause complications later when applying statistical analysis

country                    0
year                       0
status                     0
life_expect               10
adult_mortal              10
infant_death               0
alcohol                  194
percent_expend             0
hepatitis b              553
measles                    0
bmi                       34
under_five_deaths          0
polio                     19
total_expend             226
diphtheria                19
hiv/aids                   0
gdp                      448
population               652
thinness_1-19_yrs         34
thinness_5-9_yrs          34
income_comp_resources    167
schooling                163
dtype: int64


In [None]:
# Examine if there are duplicates
print(DF.duplicated().count())

2938


In [None]:
# These results make sense seeing as the time period is from 2000-2015 and look at each country as an individual entry for each particular year. We will not drop any of these duplicates as that is not necessary and would hinder an accurate assessment of data.

In [38]:
# Create dataframes for each year, analyze the data and then make an overall analysis considering all dataframes

In [44]:
# Create DF for the year 2000
DF_2000 = DF[DF['year'] == 2000]

# Print sample of DF_2000
print(DF_2000.sample(10))

                               country  year      status  life_expect  \
2809          United States of America  2000   Developed         76.8   
2424                       South Sudan  2000  Developing         48.9   
1892                             Niger  2000  Developing         50.0   
63                              Angola  2000  Developing         45.3   
255                            Belgium  2000   Developed         77.6   
383                  Brunei Darussalam  2000  Developing         74.4   
736   Democratic Republic of the Congo  2000  Developing         51.3   
1795                           Myanmar  2000  Developing         62.1   
2648                             Tonga  2000  Developing         71.6   
495                           Cameroon  2000  Developing         51.4   

      adult_mortal  infant_death  alcohol  percent_expend  hepatitis b  \
2809         114.0            28     8.21        0.000000          9.0   
2424          38.0            31      NaN       

In [None]:
# After creating the new DF_2000, the column names have reverted back to their originals. To keep consistency for all DFs created, we will rename them so as to be able to refer to one singular name for all.
DF_2000 = DF.rename(
    columns={
        'life expectancy': 'life_expect',
        'adult mortality': 'adult_mortal',
        'infant deaths': 'infant_death',
        'percentage expenditure': 'percent_expend',
        'hepatitis_b': 'hep_b',
        'measles_': 'measles',
        '_bmi_': 'bmi',
        'under-five deaths': 'under_five_deaths',
        'total expenditure': 'total_expend',
        'diptheria ': 'diptheria',
        '_hiv/aids': 'hiv/aids',
        'thinness  1-19 years' : 'thinness_1-19_yrs',
        'thinness 5-9 years': 'thinness_5-9_yrs',
        'income composition of resources': 'income_comp_resources'

    }
)

print(DF_2000.columns)

Index(['country', 'year', 'status', 'life_expect', 'adult_mortal',
       'infant_death', 'alcohol', 'percent_expend', 'hepatitis b', 'measles',
       'bmi', 'under_five_deaths', 'polio', 'total_expend', 'diphtheria',
       'hiv/aids', 'gdp', 'population', 'thinness_1-19_yrs',
       'thinness_5-9_yrs', 'income_comp_resources', 'schooling'],
      dtype='object')


In [None]:
# Look at how various predicting factors, already chosen, affect the life expectancy; then determine the most influential factors on the life expectancy for this chosen time period.
# Analyze Biological Factors first, these include: alcohol, hep_b, measles, polio, diptheria, hiv/aids, both categories for thinness, as well as bmi
# Assess these biological factors against the adult_mortal and infant_death columns, and determine if there is a positive or negative correlation. 


In [None]:
# How does consumption of alochol per capita (liters of pure alcohol) impact infant deaths, adult mortality, and the overall life expectancy