EDIT OVERVIEW OF PROJECT

In [1]:
# Import packages needed for analysis of data
import pandas as pd
import plotly.express as px
import re
from matplotlib import pyplot as plt 
import numpy as np 
from scipy import stats 
import seaborn as sns

In [2]:
# Print info of DataFrame to get all attributes of table at once
DF = pd.read_csv("Life Expectancy Data.csv")
print(DF.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [3]:
# It is good practice to ensure all columns follow the same name structure, so we need to ensure they are all string, lowercase, and without any white spaces.
DF.columns = DF.columns.str.strip().str.lower()

In [4]:
# Ensure that columns are all strings, lower cases, and have no white spaces.
print(DF.columns)

Index(['country', 'year', 'status', 'life expectancy', 'adult mortality',
       'infant deaths', 'alcohol', 'percentage expenditure', 'hepatitis b',
       'measles', 'bmi', 'under-five deaths', 'polio', 'total expenditure',
       'diphtheria', 'hiv/aids', 'gdp', 'population', 'thinness  1-19 years',
       'thinness 5-9 years', 'income composition of resources', 'schooling'],
      dtype='object')


In [None]:
# Now that leading and trailing spaces have been removed, can rename all columns
DF = DF.rename(
    columns={
        'life expectancy': 'life_expect',
        'adult mortality': 'adult_mortal',
        'infant deaths': 'infant_death',
        'percentage expenditure': 'percent_expend',
        'hepatitis_b': 'hep_b',
        'measles_': 'measles',
        '_bmi_': 'bmi',
        'under-five deaths': 'under_five_deaths',
        'total expenditure': 'total_expend',
        'diptheria ': 'diptheria',
        '_hiv/aids': 'hiv/aids',
        'thinness  1-19 years' : 'thinness_1-19_yrs',
        'thinness 5-9 years': 'thinness_5-9_yrs',
        'income composition of resources': 'income_comp_resources'

    }
)

In [27]:
print(DF.columns)

Index(['country', 'year', 'status', 'life_expect', 'adult_mortal',
       'infant_death', 'alcohol', 'percent_expend', 'hepatitis b', 'measles',
       'bmi', 'under_five_deaths', 'polio', 'total_expend', 'diphtheria',
       'hiv/aids', 'gdp', 'population', 'thinness_1-19_yrs',
       'thinness_5-9_yrs', 'income_comp_resources', 'schooling'],
      dtype='object')


In [8]:
print(DF.isna().sum())
# Due to the enormous amount of data that is not missing from these columns, we will not drop any values as they are all significant, nor will we put 'N/A' or NaN as this may cause complications later when applying statistical analysis

country                              0
year                                 0
status                               0
life expectancy                     10
adult mortality                     10
infant deaths                        0
alcohol                            194
percentage expenditure               0
hepatitis b                        553
measles                              0
bmi                                 34
under-five deaths                    0
polio                               19
total expenditure                  226
diphtheria                          19
hiv/aids                             0
gdp                                448
population                         652
thinness  1-19 years                34
thinness 5-9 years                  34
income composition of resources    167
schooling                          163
dtype: int64


In [9]:
print(DF['life_expect'].min())

KeyError: 'life_expect'

In [None]:
print(DF['life_expect'].max())

In [None]:
# Create dataframes for each year, analyze the data and then make an overall analysis considering all dataframes

In [None]:
# Create DF for the year 2000
DF_2000 = DF[DF['year'] == 2000]

# Print sample of DF_2000
print(DF_2000.sample(10))

In [None]:
# Do various predicting factors already chosen affect the life expectancy? 
# Analyze Biological Factors first


