## Function

In [2]:
#Importing the four main libraries
import numpy as np # for maths
import seaborn as sns # to augment matplotlib/visuals
import matplotlib.pyplot as plt # visuals
import pandas as pd # data

# We use train/test split for our data
from sklearn.model_selection import train_test_split

# Scaling
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# Modelling library : stats models
import statsmodels.api as sm # contains the model
import statsmodels.tools # contains the metrics


In [None]:
# Import test/train, feature engineering, robust scaler and linear regression model
import import_ipynb
import report

In [None]:
# Load the life-expectancy dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='Life Expectancy Data.csv')

In [None]:
# Model columns for the variant that uses sensitive (protected) data:
# the intercept, the one-hot region indicators, then the numeric predictors.
feature_cols_sens = (
    ['const']
    + [
        f'Region_{region_name}'
        for region_name in (
            'Asia',
            'Central America and Caribbean',
            'European Union',
            'Middle East',
            'North America',
            'Oceania',
            'Rest of Europe',
            'South America',
        )
    ]
    + [
        'Under_five_deaths',
        'Adult_mortality',
        'Hepatitis_B',
        'GDP_per_capita',
        'Schooling',
        'Economy_status_Developed',
    ]
)

In [None]:
# Model columns for the base variant (no sensitive data): intercept plus three predictors
feature_cols = [
    'const',
    'Adult_mortality',
    'GDP_per_capita',
    'Economy_status_Developed',
]

In [None]:
# Life Expectancy
def life_expectancy(features, coeff):
    """Return the linear-model prediction for one observation.

    Each coefficient is multiplied by the feature value carrying the same
    index label and the products are summed. Pairing is done by label, not
    by position, so the order of either Series does not matter and feature
    labels that have no coefficient are ignored.

    Parameters
    ----------
    features : pandas.Series
        Feature values indexed by feature name.
    coeff : pandas.Series
        Model coefficients indexed by feature name.

    Returns
    -------
    float
        The prediction rounded to 2 decimal places.

    Raises
    ------
    ValueError
        If a coefficient label is missing from ``features``.
    """
    # Sorting both Series and zipping them only lines up when the two label
    # sets are identical; check explicitly instead of mispairing silently.
    missing = coeff.index.difference(features.index)
    if len(missing) > 0:
        raise ValueError(
            f'features is missing values for coefficient(s): {list(missing)}'
        )
    # Put the feature values in coefficient order so positions correspond to labels.
    aligned = features.reindex(coeff.index)
    prediction = sum(
        value * weight
        for value, weight in zip(aligned.to_numpy(), coeff.to_numpy())
    )
    return round(float(prediction), 2)

# Prompt the user for each feature, fit the matching model and print the prediction.
# NOTE(review): X_train_f and y_train_f are not defined in any cell shown here; they
# must already exist in the kernel before this cell runs - confirm where they come from.

def _ask_float(prompt):
    """Ask the user for a number, re-prompting until the reply parses as a float."""
    while True:
        reply = input(prompt).strip()
        try:
            return float(reply)
        except ValueError:
            print(f'"{reply}" is not a number, please try again.')


def _resolve_region_column(region, columns):
    """Return the entry of `columns` that matches `region`, or None if there is none.

    The region may be given with or without the 'Region_' prefix, in any letter case.
    """
    wanted = region.strip().lower()
    if not wanted.startswith('region_'):
        wanted = 'region_' + wanted
    for column in columns:
        if column.lower() == wanted:
            return column
    return None


def _fit_and_predict(features, columns):
    """Scale the user's row with the training data, fit OLS on `columns`, and predict.

    Returns a tuple (prediction, results) where `results` is whatever
    report.ols_lin returns and `prediction` comes from life_expectancy().
    """
    user_row = features.to_frame().T
    # Keep the engineered frame under its own local name; rebinding X_train_f
    # would make a second run of this cell engineer already-engineered data.
    X_train_eng = report.feature_eng(X_train_f)
    # Append the user's row so it goes through the same scaling call as the training rows.
    combined = pd.concat([X_train_eng, user_row], axis=0)
    combined = report.scaler_rob(combined)
    # Keep only the user's row; columns the user did not supply are NaN and become 0.
    # NOTE(review): 'const' is never supplied for the user's row. If it is still NaN
    # here, fillna(0) zeroes the intercept term - confirm report.scaler_rob sets it.
    user_scaled = combined.tail(1).fillna(0)
    # Collapse the one-row frame, restricted to the model's columns, into a Series.
    user_series = user_scaled[columns].iloc[0]
    X_train_scaled = report.scaler_rob(X_train_eng)
    fitted = report.ols_lin(y_train_f, X_train_scaled, columns)
    return life_expectancy(user_series, fitted.params), fitted


sensitive = input('Do you consent to using advance population data, which may include protected data, for a more accurate model? Y/N').strip().lower()
# Explicit dtype: an empty Series without one is deprecated / object-typed depending on pandas version.
features = pd.Series(dtype=float)
if sensitive == 'y':
    Region = input('What is your region' )
    region_column = _resolve_region_column(Region, feature_cols_sens)
    if region_column is None:
        # An unrecognised name would otherwise become a column that is dropped later.
        print(f'Region "{Region}" does not match any region column of the model; no region indicator will be set.')
    else:
        features[region_column] = 1.0
    features['Under_five_deaths'] = _ask_float('What is the rate of under five deaths?')
    features['Adult_mortality'] = _ask_float('What is the rate of adult mortality?')
    features['Hepatitis_B'] = _ask_float('What is the vaccine rate of hepatitis b?')
    features['GDP_per_capita'] = _ask_float('What is the gdp per capita?')
    features['Schooling'] = _ask_float('What is the average schooling?')
    features['Economy_status_Developed'] = _ask_float('Is the country developed (1 for yes and 0 for no)?')
    prediction, results_sens = _fit_and_predict(features, feature_cols_sens)
    print(f'The average life expectancy for the more accurate model is: {prediction}')

elif sensitive == 'n':
    features['Adult_mortality'] = _ask_float('What is the rate of adult mortality?')
    features['GDP_per_capita'] = _ask_float('What is the gdp per capita?')
    features['Economy_status_Developed'] = _ask_float('Is the country developed (1 for yes and 0 for no)?')
    prediction, results = _fit_and_predict(features, feature_cols)
    print(f'The average life expectancy for the base model is: {prediction}')

else:
    # Any other reply used to fall through both branches with no output at all.
    print('Please answer Y or N.')

In [None]:
# Preview the first three rows whose Economy_status_Developed flag equals 1
df.loc[df['Economy_status_Developed'].eq(1)].head(3)