In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sms

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [4]:
from warnings import filterwarnings
# Ignore every warning category from this point on (unless the filters are
# changed again later).
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries imported above; consider narrowing the filter by category or
# module.
filterwarnings("ignore")

Dataset provided by the WHO, the UN, and the World Bank.

In [5]:
import os

# The CSV is expected at <current working directory>/data/life_expectancy_who_un.csv.
datafile = os.path.join(os.getcwd(), "data", "life_expectancy_who_un.csv")
if not os.path.exists(datafile):
    # Name the missing path so the problem can be diagnosed from the cell output;
    # a bare "Error" gives no hint about what is missing or where it was looked for.
    # This check only reports: the later pd.read_csv(datafile) is not guarded by it.
    print(f"Error: data file not found: {datafile}")

Read the data into a pandas DataFrame, drop the rows that contain NaNs, and make deep copies for statistical processing and filtering.

In [6]:
# Load the CSV once; both names are bound to the same DataFrame object.
life_expectancy_data = pd.read_csv(datafile)
life_data = life_expectancy_data

In [7]:
# Build the cleaned frame without touching life_data: dropna() returns a new
# DataFrame holding only the rows that have no missing values.
# (Alternative to dropping: impute with fillna(), e.g. using the column means.)
data = life_data.dropna()

In [8]:
# One independent copy of the cleaned data per working name, so that changes
# made through one name are not visible through the others.
lindata, multidata, polydata, rf_data, logdata = (data.copy() for _ in range(5))

Fit a line through scatter data via [Linear Regression](http://www.stat.yale.edu/Courses/1997-98/101/linreg.htm)

In [9]:
# Print the column names, non-null counts and dtypes of the working copy.
lindata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1649 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          1649 non-null   object 
 1   Year                             1649 non-null   int64  
 2   Status                           1649 non-null   object 
 3   Life expectancy                  1649 non-null   float64
 4   Adult Mortality                  1649 non-null   float64
 5   infant deaths                    1649 non-null   int64  
 6   Alcohol                          1649 non-null   float64
 7   percentage expenditure           1649 non-null   float64
 8   Hepatitis B                      1649 non-null   float64
 9   Measles                          1649 non-null   int64  
 10   BMI                             1649 non-null   float64
 11  under-five deaths                1649 non-null   int64  
 12  Polio               

In [10]:
# Preview the first rows of the working copy.
lindata.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [11]:
# Display the Country column on its own.
lindata["Country"]

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
2933       Zimbabwe
2934       Zimbabwe
2935       Zimbabwe
2936       Zimbabwe
2937       Zimbabwe
Name: Country, Length: 1649, dtype: object

In [12]:
# Disabled scratch code, kept for reference.
# NOTE(review): `test` is not defined in any cell above, so the first snippet
# would raise NameError if re-enabled as written.
#pt_data = test[test["Country"] == "Sweden"]
#pt_data.head()

# Disabled: print the values of every column as a plain Python list.
#for i in list(lindata):
#    # show the list of values
#    print(lindata[i].tolist())

In [13]:
# Separate copy used for the country lookup and the SQLite export below.
df = lindata.copy()

In [14]:
# Rows whose Country value matches the pattern "Portugal". str.match anchors
# the pattern at the start of the string, so this is a prefix match rather than
# an exact comparison.
df.loc[df["Country"].str.match("Portugal")]

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
2056,Portugal,2014,Developed,89.0,78.0,0,9.88,271.254553,98.0,0,...,98.0,9.5,98.0,0.1,2277.53613,14162.0,0.7,0.5,0.837,16.8
2057,Portugal,2013,Developed,86.0,79.0,0,10.0,2698.01817,98.0,1,...,98.0,9.55,98.0,0.1,21618.73534,1457295.0,0.7,0.5,0.827,16.3
2058,Portugal,2012,Developed,83.0,81.0,0,11.96,331.457035,98.0,23,...,98.0,9.74,98.0,0.1,2577.4264,1514844.0,0.7,0.5,0.824,16.3
2059,Portugal,2011,Developed,82.0,85.0,0,11.92,3108.288623,97.0,2,...,97.0,1.7,97.0,0.1,23196.18375,155756.0,0.7,0.5,0.818,16.2
2060,Portugal,2010,Developed,79.6,88.0,0,12.25,3119.349824,97.0,5,...,97.0,1.44,98.0,0.1,22538.6548,15731.0,0.7,0.5,0.812,16.0
2061,Portugal,2009,Developed,79.3,9.0,0,12.03,337.102352,96.0,3,...,96.0,1.42,96.0,0.1,2363.97161,1568247.0,0.7,0.5,0.809,16.0
2062,Portugal,2008,Developed,79.0,92.0,0,12.35,3652.869332,97.0,1,...,97.0,9.9,97.0,0.1,24815.6884,1558177.0,0.7,0.5,0.804,15.8
2063,Portugal,2007,Developed,78.7,94.0,0,12.57,333.356912,97.0,0,...,96.0,9.62,97.0,0.1,2278.5845,1542964.0,0.7,0.5,0.797,15.4
2064,Portugal,2006,Developed,78.5,96.0,0,13.11,2884.020194,97.0,0,...,97.0,9.67,97.0,0.1,19821.44463,1522288.0,0.7,0.5,0.793,15.4
2065,Portugal,2005,Developed,77.7,11.0,0,13.33,2813.985285,94.0,7,...,93.0,9.98,93.0,0.1,18784.9485,15333.0,0.7,0.6,0.79,15.4


In [15]:
#help(df.to_sql)

Export to SQLite via SQLAlchemy

In [16]:
from sqlalchemy import create_engine

In [17]:
# Engine for the SQLite database file save_pandas.db; echo=True makes SQLAlchemy
# log the SQL statements it emits (visible in the output of the export cell).
engine = create_engine("sqlite:///save_pandas.db", echo=True)
# Single connection, opened here and reused by the cells that follow.
sqlite_connection = engine.connect()

In [20]:
# Name of the destination table for the export.
sqlite_table = "Life_Expectancy_UN_WHO2"

In [21]:
# Write df (including its index) to the SQLite table over the open connection.
# if_exists="replace" drops and recreates the table when it already exists, so
# this cell can be re-run. With "fail", every run after the first raises
# ValueError because the table is already there.
# NOTE(review): "replace" overwrites any previous contents of the table.
df.to_sql(sqlite_table, sqlite_connection, if_exists="replace")

2021-11-17 11:36:00,145 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Life_Expectancy_UN_WHO2")
2021-11-17 11:36:00,147 INFO sqlalchemy.engine.Engine [raw sql] ()
2021-11-17 11:36:00,148 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("Life_Expectancy_UN_WHO2")
2021-11-17 11:36:00,148 INFO sqlalchemy.engine.Engine [raw sql] ()
2021-11-17 11:36:00,151 INFO sqlalchemy.engine.Engine 
CREATE TABLE "Life_Expectancy_UN_WHO2" (
	"index" BIGINT, 
	"Country" TEXT, 
	"Year" BIGINT, 
	"Status" TEXT, 
	"Life expectancy " FLOAT, 
	"Adult Mortality" FLOAT, 
	"infant deaths" BIGINT, 
	"Alcohol" FLOAT, 
	"percentage expenditure" FLOAT, 
	"Hepatitis B" FLOAT, 
	"Measles " BIGINT, 
	" BMI " FLOAT, 
	"under-five deaths " BIGINT, 
	"Polio" FLOAT, 
	"Total expenditure" FLOAT, 
	"Diphtheria " FLOAT, 
	" HIV/AIDS" FLOAT, 
	"GDP" FLOAT, 
	"Population" FLOAT, 
	" thinness  1-19 years" FLOAT, 
	" thinness 5-9 years" FLOAT, 
	"Income composition of resources" FLOAT, 
	"Schooling" FLOAT
)


2021-11-1

In [22]:
# Close the database connection that was opened for the export.
sqlite_connection.close()