## General methods

In [3]:
import numpy as np
import pandas as pd
import geopandas as gpd
import gdown
import datetime as dt
import sqlite3
import kaggle
import scipy.stats as sts
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pylab
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from math import sqrt

### Download Kaggle dataset

In [None]:
def kaggle_dataset_download(kaggle_path, kaggle_path_Name, kaggle_zip_file):
    """Authenticate with the Kaggle API and download a dataset's files.

    Args:
        kaggle_path: Passed as the first positional argument of
            ``kaggle.api.dataset_download_files`` (the dataset to fetch).
        kaggle_path_Name: Passed as the second positional argument
            (presumably the destination path -- confirm against the
            installed kaggle package).
        kaggle_zip_file: When truthy, the download is requested with
            ``unzip=True``; otherwise the ``unzip`` keyword is not passed.
    """
    api = kaggle.api
    api.authenticate()

    # Only pass ``unzip`` when it was requested, so the API default applies
    # in the other case.
    extra_options = {"unzip": True} if kaggle_zip_file else {}
    api.dataset_download_files(kaggle_path, kaggle_path_Name, **extra_options)

### General Methods

In [None]:
def get_nulls_data(df):
    """Summarise data quality per column of ``df``.

    Returns:
        A DataFrame indexed by the column names of ``df``, ordered by null
        count (descending), with two columns:

        * ``"Total"``: number of null values in the column.
        * ``"PercNotNull"``: 100 minus the null percentage, where the null
          percentage is rounded to 2 decimals.
    """
    null_counts = df.isnull().sum().sort_values(ascending=False)

    null_share = null_counts / len(df) * 100
    not_null_share = 100 - round(null_share, 2)

    summary = pd.concat([null_counts, not_null_share], axis=1)
    summary.columns = ["Total", "PercNotNull"]
    return summary

In [None]:
def get_compound_acceptance_index(row):
    """Translate the ``'Sentiment'`` label of ``row`` into a numeric score.

    The score ranges from 1 ('Extremely Positive') down to -1
    ('Extremely Negative') in steps of 0.5. Any label that is not one of the
    five known values yields 0.
    """
    score_by_sentiment = (
        ('Extremely Positive', 1),
        ('Positive', 0.5),
        ('Neutral', 0),
        ('Negative', -0.5),
        ('Extremely Negative', -1),
    )
    for label, score in score_by_sentiment:
        if row['Sentiment'] == label:
            return score
    # Unknown labels are treated like a neutral sentiment.
    return 0

In [None]:
def ttest_hypothesis_determination(pval, p_alpha):
    """Print whether H0 is rejected, given a p-value and a significance level.

    H0 is reported as rejected when ``pval`` is strictly smaller than
    ``p_alpha``. The printed message includes the confidence and the
    significance expressed as percentages.

    Args:
        pval: p-value of the test.
        p_alpha: Significance level as a fraction (e.g. 0.05 means 5%
            significance, i.e. 95% confidence).
    """
    confidence_perc = 100 * (1 - p_alpha)
    significance_perc = 100 * p_alpha

    if pval < p_alpha:
        template = "I have enough evidence to reject H0. Therefore, I assume H1 with a confidence of {0}% and significance of {1}%"
    else:
        template = "I don't have enough evidence to reject H0. So we accept is true with a confidence of {0}% and significance of {1}%"

    print(template.format(confidence_perc, significance_perc))

In [None]:
def build_geodf(df, lat_col_name='latitude', lon_col_name='longitude'):
    """Build a GeoDataFrame with point geometry from latitude/longitude columns.

    The input frame is copied first, so ``df`` itself is not modified.

    Args:
        df: Frame containing the coordinate columns.
        lat_col_name: Name of the latitude column.
        lon_col_name: Name of the longitude column.

    Returns:
        A ``geopandas.GeoDataFrame`` over the copied data whose geometry is
        produced by ``gpd.points_from_xy(longitude, latitude)``.
    """
    frame = df.copy()
    latitudes, longitudes = frame[lat_col_name], frame[lon_col_name]
    # points_from_xy expects x (longitude) first, then y (latitude).
    points = gpd.points_from_xy(longitudes, latitudes)
    return gpd.GeoDataFrame(frame, geometry=points)

In [None]:
def get_clinical_dementia_rating(row):
    """Translate the ``'CDR'`` score of ``row`` into a textual rating.

    Reference for the scale:
    https://www.sciencedirect.com/topics/medicine-and-dentistry/clinical-dementia-rating

    Scores 3, 4 and 5 all map to "Severe Dementia"; any score that is not
    listed yields "Normal".
    """
    rating_by_score = (
        (0, "Normal"),
        (0.5, "Very Mild Dementia"),
        (1, "Mild Dementia"),
        (2, "Moderate Dementia"),
        (3, "Severe Dementia"),
        (4, "Severe Dementia"),
        (5, "Severe Dementia"),
    )
    for score, rating in rating_by_score:
        if row['CDR'] == score:
            return rating
    # Unlisted scores fall back to the same label as a score of 0.
    return "Normal"

In [None]:
def chi_square_hypothesis_determination(p_alpha, p_chi2, p_dof, pval, p_critical_value):
    """Print the outcome of a chi-square test using two decision rules.

    Two validations are printed, each stating whether H0 is rejected:

    1. Test statistic rule: H0 is rejected when
       ``abs(p_chi2) >= p_critical_value``.
    2. p-value rule: H0 is rejected when ``pval <= p_alpha``.

    Args:
        p_alpha: Significance level as a fraction (e.g. 0.05 means 5%
            significance, i.e. 95% confidence).
        p_chi2: Chi-square test statistic.
        p_dof: Degrees of freedom (only reported in the message).
        pval: p-value of the test.
        p_critical_value: Critical value the statistic is compared against.
    """
    confidence_perc = 100 * (1 - p_alpha)
    p_alpha_perc = 100 * p_alpha

    str_H1 = "I have enough evidence to reject H0 (There is a relationship between the categorical variables). Therefore, I assume H1 with a confidence of {0}%, significance of {1}%, statistic of {2}, degree of freedom of {3}, p_value of {4} and  crital value of {5}."
    str_H0 = "I don't have enough evidence to reject H0 (There is no relationship between 2 categorical variables). So we accept is true with a confidence of {0}%, significance of {1}%, statistic of {2}, degree of freedom of {3}, p_value of {4} and  crital value of {5}."

    # Both messages take the same values; use only the function's own
    # parameters rather than names from an enclosing/global scope.
    report_values = (confidence_perc, p_alpha_perc, p_chi2, p_dof, pval, p_critical_value)
    msg_H1 = str_H1.format(*report_values)
    msg_H0 = str_H0.format(*report_values)

    print("Validation_1:\n")
    if abs(p_chi2) >= p_critical_value:
        print("Validating chi2>=critical_value: " + msg_H1)
    else:
        print("Validating chi2>=critical_value: " + msg_H0)

    print("\nValidation_2:\n")
    if pval <= p_alpha:
        print("Validating pval<=alpha: " + msg_H1)
    else:
        print("Validating pval<=alpha: " + msg_H0)

In [None]:
def datasetColsExplanation():
    """Return a human-readable description of every dataset column.

    The explanation of the columns was taken from:
    https://www.kaggle.com/data/60603

    Returns:
        A tuple of strings, one per column, each formatted as
        ``"<column name> --> <description>"``. Every entry uses the same
        `` --> `` separator so the tuple can be split uniformly.
    """
    return (
        "Country --> Country",
        "Year --> Year",
        "Status --> Developed or Developing status",
        "Life expectancy --> Life Expectancy in age",
        "Adult Mortality --> Adult Mortality Rates of both sexes (probability of dying between 15 and 60 years per 1000 population)",
        "infant deaths --> Number of Infant Deaths per 1000 population",
        "Alcohol --> Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol)",
        # Separator fixed: this entry previously used "--" instead of "-->".
        "percentage expenditure --> Expenditure on health as a percentage of Gross Domestic Product per capita(%)",
        "Hepatitis B --> Hepatitis B (HepB) immunization coverage among 1-year-olds (%)",
        "Measles --> Measles - number of reported cases per 1000 population",
        "BMI --> Average Body Mass Index of entire population",
        "under-five deaths --> Number of under-five deaths per 1000 population",
        "Polio --> Polio (Pol3) immunization coverage among 1-year-olds (%)",
        "Total expenditure --> General government expenditure on health as a percentage of total government expenditure (%)",
        "Diphtheria --> Diphtheria tetanus toxoid and pertussis (DTP3) immunization coverage among 1-year-olds (%)",
        "HIV/AIDS --> Deaths per 1 000 live births HIV/AIDS (0-4 years)",
        "GDP --> Gross Domestic Product per capita (in USD)",
        "Population --> Population of the country",
        "thinness 1-19 years --> Prevalence of thinness among children and adolescents for Age 10 to 19 (% )",
        "thinness 5-9 years --> Prevalence of thinness among children for Age 5 to 9(%)",
        "Income composition of resources --> Human Development Index in terms of income composition of resources (index ranging from 0 to 1)",
        "Schooling --> Number of years of Schooling(years)",
    )

In [None]:
def fixing_col_nulls(df):
    """Fill missing values in each column of ``df`` with that column's median.

    The frame is modified in place and also returned. Columns without missing
    values are not touched. Columns whose dtype has no median (for example
    text columns) are left unchanged instead of aborting the whole operation.

    Args:
        df: DataFrame to impute.

    Returns:
        The same ``df`` object, with missing values filled where a median
        could be computed.
    """
    for label, content in df.items():
        if not content.isnull().any():
            continue
        try:
            fill_value = content.median()
        except TypeError:
            # No median exists for this dtype; keep the column as it is so
            # the remaining columns can still be imputed.
            continue
        df[label] = content.fillna(fill_value)
    return df

In [None]:
def _print_split_metrics(title, y_true, y_pred):
    """Print MAE, MSE, RMSE, RMSLE and R2 for one data split under ``title``."""
    print(title)
    print("MAE:", mean_absolute_error(y_true, y_pred))
    print("MSE:", mean_squared_error(y_true, y_pred))
    print("RMSE:", sqrt(mean_squared_error(y_true, y_pred)))
    # RMSLE is the square root of the mean squared log error, mirroring how
    # RMSE is derived from MSE on the line above.
    print("RMSLE:", sqrt(mean_squared_log_error(y_true, y_pred)))
    print("R2 Score:", r2_score(y_true, y_pred))


def print_sklean_eval_metrics(Y_train, Y_train_pred, Y_test, Y_test_pred):
    """Print regression evaluation metrics for the training and testing sets.

    For each split the following are printed: MAE, MSE, RMSE, RMSLE and the
    R2 score, computed with the scikit-learn metric functions.

    Args:
        Y_train: True target values of the training set.
        Y_train_pred: Predicted target values for the training set.
        Y_test: True target values of the testing set.
        Y_test_pred: Predicted target values for the testing set.
    """
    _print_split_metrics("Training set performance", Y_train, Y_train_pred)
    print("\n")
    _print_split_metrics("Testing set performance", Y_test, Y_test_pred)