# Experimentacion - AGREGANDO DATA EXTERNA

## Modelo - Ubicacion (Latitud y Longitud, Continente)

### Voy a experimentar con las características principales de los países, como su ubicación, su status y su capacidad de gasto: todo ese tipo de atributos que los encasillan en una etiqueta en particular. La zona donde se ubica un país puede explicar varias cosas sobre él. Si pensamos en un país ubicado en el hemisferio norte, es probable que demos con uno desarrollado; mientras que si, por ejemplo, pensamos en un país ubicado en África, es probable que su expectativa de vida sea menor a la media. Vamos a ver cómo esto se condice con los datos y el análisis de regresión.

### Importo todos los modulos, funciones y datasets que me son de ayuda para la experimentacion

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence as influence

def rmse(fitted, target):
    """Return the root-mean-squared error between ``target`` and ``fitted``.

    Both arguments are forwarded to ``mean_squared_error`` (observed values
    first, fitted values second) and the square root of its result is returned.
    """
    return np.sqrt(mean_squared_error(target, fitted))

def adjusted_r2(p, fitted, target):
    """Return the adjusted R^2 of ``fitted`` against ``target``.

    ``p`` is the number of predictors used for the penalty and ``n`` is
    ``len(target)``; the value is ``1 - (1 - R^2) * (n - 1) / (n - p - 1)``.
    The denominator is not guarded, so ``n == p + 1`` is not supported.
    """
    n_obs = len(target)
    unexplained = 1 - r2_score(target, fitted)
    return 1 - unexplained * (n_obs - 1) / (n_obs - p - 1)

def standarized_residual_plot(predicted, actual):
    """Scatter the standardized residuals against the predictions.

    The residuals ``actual - predicted`` are divided by their ``np.std`` and
    drawn with pyplot, together with a horizontal reference line at zero that
    spans the range of ``predicted``.

    Returns the standardized residuals.
    """
    raw = actual - predicted
    scaled = raw / np.std(raw)

    plt.scatter(predicted, scaled)
    plt.xlabel("Predicciones")
    plt.ylabel("Residuos standarizados")
    left, right = min(predicted), max(predicted)
    plt.hlines(0, xmin=left, xmax=right)
    plt.title("Residuos standarizados")
    return scaled

def ols_influence(lm):
    """Show the statsmodels influence plot for the fitted model ``lm``.

    Point sizes follow Cook's distance (``criterion="cooks"``) and ``alpha``
    is passed through as 0.05. The plot is drawn on a 30x30 inch figure and
    displayed with ``plt.show()``.
    """
    # influence_plot opens its own default-sized figure when it is not given
    # an axes, which left the 30x30 figure created here empty. Passing the
    # axes explicitly makes the plot use the intended figure.
    _, ax = plt.subplots(figsize=(30, 30))
    sm.graphics.influence_plot(lm, alpha=0.05, criterion="cooks", ax=ax)
    plt.show()

    
def run_analysis(predictors, dataset):
    """Fit an OLS model of 'Life expectancy' on ``predictors`` and report it.

    Prints the VIF of each predictor (only when there is more than one) and
    the statsmodels summary, then shows the standardized-residual scatter,
    the influence plot and a histogram of the raw residuals.

    Parameters
    ----------
    predictors : list
        Column names of ``dataset`` used as regressors.
    dataset : pandas.DataFrame
        Must contain 'Life expectancy' and every column in ``predictors``.

    Returns
    -------
    dict
        In-sample metrics under the keys 'r2', 'rmse' and 'adjusted'
        (adjusted R^2).
    """
    target = dataset['Life expectancy']
    features = dataset[predictors].to_numpy()

    # NOTE(review): VIF is evaluated on the predictor matrix before the
    # intercept column is added below; confirm this is the intended variant.
    if len(predictors) > 1:
        for i, name in enumerate(predictors):
            print(f"VIF for {name} {variance_inflation_factor(features, i)}")

    features = sm.add_constant(features)
    regr = sm.OLS(target, features).fit()
    fitted = regr.predict(features)
    print(regr.summary())

    # Called for its plot only; the returned residuals are not needed here.
    standarized_residual_plot(fitted, target)
    ols_influence(regr)
    plt.figure()
    plt.hist(regr.resid)
    plt.show()

    return {
        'r2': r2_score(target, fitted),
        'rmse': rmse(fitted, target),
        'adjusted': adjusted_r2(len(predictors), fitted, target),
    }

# Standardized residuals of the OLS fit
def residuos(predictors, dataset):
    """Return the standardized residuals of an OLS fit as a NumPy array.

    Fits 'Life expectancy' on ``predictors`` (plus an intercept), prints the
    VIF of each predictor when there is more than one, and draws the
    standardized-residual scatter as a side effect of calling
    ``standarized_residual_plot``.

    Parameters
    ----------
    predictors : list
        Column names of ``dataset`` used as regressors.
    dataset : pandas.DataFrame
        Must contain 'Life expectancy' and every column in ``predictors``.
    """
    target = dataset['Life expectancy']
    features = dataset[predictors].to_numpy()

    if len(predictors) > 1:
        for i, name in enumerate(predictors):
            print(f"VIF for {name} {variance_inflation_factor(features, i)}")

    features = sm.add_constant(features)
    regr = sm.OLS(target, features).fit()
    fitted = regr.predict(features)
    return np.array(standarized_residual_plot(fitted, target))


# Cook's distance of every row (country)
def cooksDist(predictors, dataset):
    """Return Cook's distances and their p-values for an OLS fit.

    Fits 'Life expectancy' on ``predictors`` (plus an intercept) and prints
    the VIF of each predictor when there is more than one.

    Parameters
    ----------
    predictors : list
        Column names of ``dataset`` used as regressors.
    dataset : pandas.DataFrame
        Must contain 'Life expectancy' and every column in ``predictors``.

    Returns
    -------
    tuple
        ``(distances, p_values)``, the two elements of statsmodels'
        ``OLSInfluence.cooks_distance``, one entry per row of ``dataset``.
    """
    target = dataset['Life expectancy']
    features = dataset[predictors].to_numpy()

    if len(predictors) > 1:
        for i, name in enumerate(predictors):
            print(f"VIF for {name} {variance_inflation_factor(features, i)}")

    features = sm.add_constant(features)
    regr = sm.OLS(target, features).fit()
    # cooks_distance is a pair: the distances and the associated p-values.
    distances, p_values = influence(regr).cooks_distance
    return distances, p_values

# Leverage of every row (country)
def getLeverage(predictors, dataset):
    """Return the leverage (hat-matrix diagonal) of each row for an OLS fit.

    Fits 'Life expectancy' on ``predictors`` (plus an intercept) and prints
    the VIF of each predictor when there is more than one.

    Parameters
    ----------
    predictors : list
        Column names of ``dataset`` used as regressors.
    dataset : pandas.DataFrame
        Must contain 'Life expectancy' and every column in ``predictors``.
    """
    target = dataset['Life expectancy']
    features = dataset[predictors].to_numpy()

    if len(predictors) > 1:
        for i, name in enumerate(predictors):
            print(f"VIF for {name} {variance_inflation_factor(features, i)}")

    features = sm.add_constant(features)
    regr = sm.OLS(target, features).fit()
    return influence(regr).hat_matrix_diag


# Life-expectancy table; its "Country" column is used below to match names.
df_exp = pd.read_csv("expectativa_de_vida.csv")
# Per-country coordinates ("Country", "Latitude", "Longitude" are read below).
df_coord = pd.read_csv("average-latitude-longitude-countries.csv")

ModuleNotFoundError: No module named 'statsmodels'

### Armo el data set con la ubicacion en el mapa de cada pais (como se realiza en el archivo 'Mapa.ipynb')

In [None]:
from difflib import SequenceMatcher

# Pair every country name of the life-expectancy table with the most similar
# name of the coordinates table. Each entry of `paises` is [name, match];
# match is None when no candidate passes the filter.
nombres1 = df_exp["Country"].tolist()
nombres2 = df_coord["Country"].tolist()

paises = []
for n1 in nombres1:
    candidatos = []

    for n2 in nombres2:
        seq = SequenceMatcher(None, n1, n2)
        comun = seq.find_longest_match(0, len(n1), 0, len(n2)).size
        r = seq.ratio()

        # Keep n2 when one name is contained in the other, or when they share
        # a block of more than 3 characters and are similar enough overall.
        if comun == len(n1) or comun == len(n2) or (comun > 3 and r > 0.6):
            candidatos.append([r, n1, n2])

    if candidatos:
        # Highest ratio wins (ties broken by name order, as with a sort).
        mejor = max(candidatos)
        paises.append([mejor[1], mejor[2]])
    else:
        # Always store a pair: a bare string here made the later
        # `paises[i][1]` lookup pick the second character of the name.
        paises.append([n1, None])

In [None]:
# NOTE: `df` is an alias of df_exp (not a copy), so the columns added below
# also appear on df_exp.
df = df_exp

# Build the name -> (latitude, longitude) table once instead of scanning
# df_coord twice per country. setdefault keeps the first row of each name.
coordenadas = {}
for nombre, lat, lon in zip(df_coord["Country"], df_coord["Latitude"], df_coord["Longitude"]):
    coordenadas.setdefault(nombre, (lat, lon))

latitudes = []
longitudes = []
for par in paises:
    # Unmatched countries may be stored as a bare string; treat them as
    # "no match" instead of indexing a character of the name.
    nombre_coord = par[1] if isinstance(par, list) else None
    # NOTE(review): unmatched countries get (0, 0), which is a real location;
    # consider whether these rows should be excluded from the regression.
    lat, lon = coordenadas.get(nombre_coord, (0, 0))
    latitudes.append(lat)
    longitudes.append(lon)

df["Latitude"] = latitudes
df["Longitude"] = longitudes

In [None]:
# Encode the status as a number: 0 for 'Developing', 1 for anything else.
df['Status_Float'] = [0 if estado == 'Developing' else 1 for estado in df['Status']]


In [None]:
# Countries labelled 'Developing' whose life expectancy is above 80.
df[(df["Life expectancy"] > 80) & (df["Status"] == "Developing")]

In [None]:
# Countries labelled 'Developed' whose life expectancy is below 75.
df[(df["Life expectancy"] < 75) & (df["Status"] == "Developed")]

## Seteo correctamente el status (desarrollado / en desarrollo) de un par de países mal categorizados

In [None]:
# Manual corrections of the status flag, keyed by row label of `df`.
# NOTE(review): the labels are hard-coded and depend on the row order of the
# source CSV.
status_corregido = {
    30: 1,   # Canada
    57: 1,   # Finland
    58: 1,   # France
    64: 1,   # Greece
    79: 1,   # Israel
    130: 1,  # Republic of Korea
    24: 0,   # Bulgaria
    72: 0,   # Hungary
    95: 0,   # Lithuania
    132: 0,  # Romania
    145: 0,  # Slovakia
}
for fila, valor in status_corregido.items():
    df.loc[fila, 'Status_Float'] = valor


In [None]:
# Column names, dtypes and non-null counts after adding the new columns.
df.info()


### Ahora agrego la zona donde se encuentra cada Estado

In [None]:
# Country/continent table ("Country_Name" and "Continent_Name" are read below).
df_zona = pd.read_csv("country-and-continent-codes-list.csv")

In [None]:
# Column names, dtypes and non-null counts of the continent table.
df_zona.info()

### Equivalencia entre paises

In [5]:
# Imported here as well so the cell does not depend on an earlier cell.
from difflib import SequenceMatcher

# Pair every country name in `df` with the most similar name of the continent
# table. Each entry of `paises` is [name, match]; match is None when no
# candidate passes the filter.
nombres1 = df["Country"].tolist()
nombres2 = df_zona["Country_Name"].tolist()

paises = []
for n1 in nombres1:
    candidatos = []

    for n2 in nombres2:
        seq = SequenceMatcher(None, n1, n2)
        comun = seq.find_longest_match(0, len(n1), 0, len(n2)).size
        r = seq.ratio()

        # Keep n2 when one name is contained in the other, or when they share
        # a block of more than 3 characters and are similar enough overall.
        if comun == len(n1) or comun == len(n2) or (comun > 3 and r > 0.6):
            candidatos.append([r, n1, n2])

    if candidatos:
        # Highest ratio wins (ties broken by name order, as with a sort).
        mejor = max(candidatos)
        paises.append([mejor[1], mejor[2]])
    else:
        # Always store a pair: a bare string here made the later
        # `paises[i][1]` lookup pick the second character of the name.
        paises.append([n1, None])

NameError: name 'df' is not defined

In [None]:
# NOTE: `df_new` is an alias of `df` (not a copy), so the column added below
# also appears on `df`.
df_new = df

# Build the name -> continent table once instead of scanning df_zona for every
# country. setdefault keeps the first row of each name.
continente_por_pais = {}
for nombre, continente in zip(df_zona["Country_Name"], df_zona["Continent_Name"]):
    continente_por_pais.setdefault(nombre, continente)

zonas = []
for par in paises:
    # Unmatched countries may be stored as a bare string; treat them as
    # "no match" instead of indexing a character of the name.
    nombre_zona = par[1] if isinstance(par, list) else None
    # 'null' marks countries without a continent; they are fixed by hand later.
    zonas.append(continente_por_pais.get(nombre_zona, 'null'))

df_new["Continente"] = zonas

In [None]:
# Continent assigned to each row.
df_new["Continente"]

### Veo aquellos paises que quedaron sin zona y los completamos

In [None]:
# Rows that still carry the 'null' placeholder instead of a continent.
df_new[df_new["Continente"] == 'null']

### Les asigno sus respectivos continentes

In [None]:
# Continents filled in by hand, keyed by row label of `df_new`.
# NOTE(review): the labels are hard-coded and depend on the row order of the
# source CSV.
continente_manual = {
    19: 'South America',  # Bolivia
    27: 'Africa',         # Cabo Verde
    42: 'Europe',         # Czechia
    88: 'Asia',           # Kyrgyzstan
    179: 'Asia',          # Viet Nam
}
for fila, continente in continente_manual.items():
    df_new.loc[fila, 'Continente'] = continente

In [6]:
# Spot-check: continent now stored for Bolivia.
es_bolivia = df_new['Country'] == 'Bolivia (Plurinational State of)'
df_new.loc[es_bolivia, 'Continente']

NameError: name 'df_new' is not defined

## Convierto el continente en variables dummy

In [None]:
# One indicator column per continent.
# pandas >= 2.0 returns boolean dummies by default; combined with numeric
# predictors these presumably yield an object-dtype matrix in `to_numpy()`,
# which the regression helpers cannot use. Requesting uint8 (the default of
# older pandas versions) keeps the columns numeric on every version.
dummies = pd.get_dummies(df_new["Continente"], dtype="uint8")
print(dummies)
# NOTE: running this cell twice fails, because the dummy columns already exist.
df_new = df_new.join(dummies)


### Sabiendo las coordenadas de cada país, analizamos cómo estas influyen en su expectativa de vida

In [None]:
# Model 1: life expectancy explained by latitude and longitude only.
run_analysis(['Latitude', 'Longitude'], df_new)

### Por lo pronto no hay un valor considerable en el ajuste. Y si anadimos su zona? Nos proveera mas informacion esto?

### Agregamos el continente de un pais

In [None]:
# Model 2: life expectancy explained by the continent indicator columns.
# NOTE(review): run_analysis adds an intercept; if every row belongs to exactly
# one of these six continents, the indicators sum to the intercept column and
# the design is perfectly collinear (the VIFs and individual coefficients are
# then not meaningful). Consider dropping one continent as the baseline.
run_analysis(['Africa', 'Asia', 'Europe', 'North America', 'Oceania',
       'South America'], df_new)

### Ahora si, mejora considerablemente. Que pasa si agregamos el status de un pais tambien?

In [7]:
# Model 3: continent indicators plus the development-status flag.
predictores_status = [
    'Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America',
    'Status_Float',
]
run_analysis(predictores_status, df_new)

NameError: name 'run_analysis' is not defined

## Calculo residuos

In [None]:
# Standardized residuals of the continent model with and without the status.
continentes = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']
residuos_con_status = residuos(continentes + ['Status_Float'], df_new)
residuos_sin_status = residuos(continentes, df_new)

In [None]:
# Distribution of the standardized residuals (model without status), saved to
# disk. NOTE(review): seaborn has deprecated distplot; consider histplot.
ejes = sns.distplot(residuos_sin_status)
ejes.set(xlabel='Residuo standarizados', ylabel='Densidad')
ejes.get_figure().savefig('residuos_sin_status')

In [None]:
# Distribution of the standardized residuals (model with status), saved to
# disk. NOTE(review): seaborn has deprecated distplot; consider histplot.
ejes = sns.distplot(residuos_con_status)
ejes.set(xlabel='Residuo standarizados', ylabel='Densidad')
ejes.get_figure().savefig('residuos_con_status')

### Genera una mejora. Finalicemos agregando el gasto por país; esto seguro nos dará un mayor contexto, mezclando no solo la geografía de los países sino también sus finanzas. Estos atributos van de la mano con el 'Status' de un país. ¿Estarán tan correlacionados con la zona como este último?

### Saco los paises con percentage expenditure que no consideramos validos

In [8]:
# Keep only rows whose percentage expenditure is at most 1000.
gasto_valido = df_new["percentage expenditure"] <= 1000
df_new = df_new[gasto_valido]

NameError: name 'df_new' is not defined

In [None]:
# Drop rows without expenditure. The explicit copy makes df_new an independent
# frame, so adding a column below does not write into a slice of the previous
# one (pandas SettingWithCopyWarning).
df_new = df_new[df_new['percentage expenditure'].notna()].copy()

# Z-score of the expenditure: subtract the mean and divide by the standard
# deviation.
gasto = df_new['percentage expenditure']
df_new['Normalizado_Gasto'] = (gasto - np.mean(gasto)) / np.std(gasto)

In [9]:
# Model 4: continent indicators plus the normalized expenditure.
predictores_gasto = [
    'Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America',
    'Normalizado_Gasto',
]
run_analysis(predictores_gasto, df_new)

NameError: name 'run_analysis' is not defined

In [None]:
# Standardized residuals of the continents + expenditure model.
predictores_gasto = [
    'Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America',
    'Normalizado_Gasto',
]
residuos_con_pe = residuos(predictores_gasto, df_new)

In [None]:
# Distribution of the standardized residuals (model with expenditure), saved
# to disk. NOTE(review): seaborn has deprecated distplot; consider histplot.
ejes = sns.distplot(residuos_con_pe)
ejes.set(xlabel='Residuo standarizados', ylabel='Densidad')
ejes.get_figure().savefig('residuos_con_gasto')

### Subio el ajuste, pero no tanto. Sera porque la zona de un pais y su gasto estan demasiado correlacionados?

## Saco outliers

### Residuos

In [None]:
# Standardized residuals of the continents + expenditure model, used below to
# flag outliers.
residuos_out = residuos(['Africa', 'Asia', 'Europe', 'North America', 'Oceania',
       'South America', 'Normalizado_Gasto'], df_new)

In [None]:
# Attach the standardized residuals to a new frame; df_new itself is left
# untouched. (The stray empty print() that followed was removed.)
df_new_out = df_new.assign(residuos_out=residuos_out)

#### Veamos cuales quedan afuera

In [10]:
# Rows whose standardized residual exceeds 2 in absolute value.
display(df_new_out[df_new_out["residuos_out"].abs() > 2])

NameError: name 'df_new_out' is not defined

##### Los sacamos

In [None]:
# Keep only rows whose standardized residual is within 2 in absolute value.
df_new_out = df_new_out[df_new_out["residuos_out"].abs() <= 2]

### Distancia de Cook

In [None]:
# Cook's distance (and its p-value) for every remaining row.
predictores_gasto = [
    'Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America',
    'Normalizado_Gasto',
]
cook_out, p_out = cooksDist(predictores_gasto, df_new_out)

In [11]:
# Attach the Cook's distances and list the rows from least to most influential.
df_new_out = df_new_out.assign(cook_out=cook_out)
ordenado_por_cook = df_new_out.sort_values('cook_out')
display(ordenado_por_cook)

NameError: name 'df_new_out' is not defined

In [None]:
# Rows whose Cook's distance is above the 0.05 cut-off used in this notebook.
display(df_new_out[df_new_out["cook_out"] > 0.05])

In [None]:
# Keep every row that is not above the 0.05 cut-off. `<=` (not `<`) makes this
# the exact complement of the `> 0.05` rows shown as outliers.
df_new_out = df_new_out[df_new_out["cook_out"] <= 0.05]

### Leverage

In [None]:
# Leverage (hat-matrix diagonal) for every remaining row.
predictores_gasto = [
    'Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America',
    'Normalizado_Gasto',
]
leverage_out = getLeverage(predictores_gasto, df_new_out)

In [None]:
# Attach the leverage values as a new column.
df_new_out = df_new_out.assign(leverage_out = leverage_out)

In [12]:
# Rows whose leverage is above the 0.11 cut-off used in this notebook.
display(df_new_out[df_new_out["leverage_out"] > 0.11])

NameError: name 'df_new_out' is not defined

In [None]:
# Keep every row that is not above the 0.11 cut-off. `<=` (not `<`) makes this
# the exact complement of the `> 0.11` rows shown as outliers.
df_new_out = df_new_out[df_new_out["leverage_out"] <= 0.11]

In [None]:
# Rows of df_new that were removed as outliers. The filters above preserve the
# row labels, so the removed rows are the labels missing from df_new_out.
# (Concatenating both frames and dropping duplicates never matched anything,
# because df_new_out carries the extra residuos_out / cook_out / leverage_out
# columns, so every row of both frames was returned.)
df_new.loc[~df_new.index.isin(df_new_out.index)]


### Rehago analisis

In [None]:
# Model 4 again, fitted on the data without the flagged outliers.
predictores_gasto = [
    'Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America',
    'Normalizado_Gasto',
]
run_analysis(predictores_gasto, df_new_out)

In [None]:
# Standardized residuals of the model fitted without the flagged outliers.
predictores_gasto = [
    'Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America',
    'Normalizado_Gasto',
]
residuos_sin_outliers = residuos(predictores_gasto, df_new_out)

In [None]:
# Distribution of the standardized residuals (model without outliers), saved
# to disk. NOTE(review): seaborn has deprecated distplot; consider histplot.
ejes = sns.distplot(residuos_sin_outliers)
ejes.set(xlabel='Residuo standarizados', ylabel='Densidad')
ejes.get_figure().savefig('residuos_sin_outliers')

In [13]:
# Fit a straight line with the project's `metnum` least-squares solver and
# draw it.
# NOTE(review): `df_virus` and `ax_virus` are not defined anywhere in this
# notebook (the recorded output is a NameError), and the column names and the
# output file refer to a different experiment. This cell looks like a leftover
# and needs to be adapted to df_new_out or removed.
import metnum

lsq = metnum.LeastSquareMethod()

# Design matrix: a column of ones (intercept) next to the single regressor.
unos = np.ones(len(df_virus))
x = df_virus["enfermedades_all"].values.reshape(-1, 1)
b = df_virus["Life expectancy"].values.reshape(-1, 1)
A = np.column_stack((unos, x))

# The solution is unpacked as (intercept, slope), following the column order
# of A.
(intercept, slope) = lsq.ajustar(A, b)

# Two points are enough to draw the fitted line over [-3.5, 12].
puntos_x = np.linspace(-3.5,12,2)
puntos_y = puntos_x*slope + intercept
plt.plot(puntos_x, puntos_y)

ax_virus.set(xlabel='Virus', ylabel='Expectativa de vida')
plt.savefig("virus1.png", bbox_inches='tight')

NameError: name 'df_virus' is not defined

### Grafico regresion

### Dio valores mucho mejores