In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.model_selection import train_test_split
import math

df = pd.read_csv('water_potability.csv', sep=',')

# ph => 491 nans
# sulfate => 781 nans
# trihalomethanes => 162 nans

In [3]:
# Funciones

# Normalizacion
def normalizacion(data_frame):
    # Se agrego esto, sino se esta modificando el dataframe original, y no quiero eso. Se modifica en todas las funciones
    new_df = data_frame.copy() # <-
    columnas = new_df.columns.to_list()
    new_val = []
    for item in columnas[:-1]:
        max_value = new_df[item].max()
        for value in new_df[item]:   
            new_val.append(value/max_value)
        new_df[item] = new_val
        new_val.clear()
    return new_df

# Funcion de descarte
# Asigna a un nuevo dataframe, otro pero con los indices dropeados de las filas que cumplan la condicion pasada
# Ejemplo: Si quiero armar un nuevo dataframe sin las filas donde la potabilidad es 0:
# df_1 = descarte(df, 'Potability', 1) => Esto se guarda todas las filas donde Potabilidad es 1
def descarte(data_frame, columna, dato):
    new_df = data_frame.copy()
    indexes = new_df[new_df[columna] != dato].index
    return new_df.drop(indexes)

# Cargadora de NaNs
def carga_nans(data_frame, data_ceros, data_unos):
    # Aca por las dudas hago lo mismo, igual se lo voy a asignar al original
    new_df = data_frame.copy()
    columnas = new_df.columns.to_list()
    # Esto devuelve:
    # index = indice de la fila
    # fila = todos los datos de la fila en un [[formato]]
    for index, fila in new_df.iterrows():
        # Y por cada fila del dataframe, itero en las columnas
        for i in range(9):
            # Cuando i = 0 > fila['ph'], cuando i = 1 > fila['Hardness']
            if math.isnan(fila.iloc[i]) and fila.iloc[9] == 0:
                new_df.loc[index, columnas[i]] = data_ceros[columnas[i]].median()
            elif math.isnan(fila.iloc[i]) and fila.iloc[9] == 1:
                new_df.loc[index, columnas[i]] = data_unos[columnas[i]].median()
    return new_df

# NUEVO! <- El anterior hacia cualquier cosa
def atipicos(valores_columna):
    ordered = sorted(valores_columna)
    n = len(valores_columna)
    Q1 = ordered[n // 4]
    Q2 = (ordered[n // 2 - 1] + ordered[n // 2]) / 2
    Q3 = ordered[3 * n // 4]
    iqr = Q3 - Q1
    # print('Max value: ', Q3 + (1.5 * iqr))
    # print('Min value: ', Q1 - (1.5 * iqr))
    # print('\n')
    # Entonces lo que quiero hacer es: Primero identificar los atipicos. Despues buscar esos atipicos en el dataframe, y eliminarlos. NO se hace con el indice xq son indices dis-
    # tintos, la columna no esta ordenada, 'ordered' si.
    values = []
    for value in ordered:
        if ((value > Q3 + (1.5 * iqr)) or (value < Q1 - (1.5 * iqr))):
            values.append(value)
    return values, iqr

def limpieza(data_frame):
    new_df = data_frame.copy()
    columnas = new_df.columns.to_list()
    for item in columnas[:1]:
        indices = []
        valores_at = atipicos(new_df[item].to_list())[0]
        print(valores_at)
        for value in valores_at:
            # Guarda en la lista los indices de las filas que sean iguales al value
            indices.append(new_df[new_df[item] == value].index[0])
        # Y despues las tira todas a la bosta
        new_df = new_df.drop(indices)
    return new_df


[3.4450618643852127, 3.641629777473381, 3.71608007538699, 3.902475685915096, 4.758439424671477, 5.115817063771218, 5.331940479018537, 5.400301780729467, 5.584086638456089, 5.618064405909149, 5.702925976824722, 6.347271760539316, 6.514415093251676, 6.660212026118103, 6.953864225511059, 7.051785800016845, 7.119824384264552, 7.145771545218821, 7.181448580829175, 7.360640105838258, 7.371050302429531, 7.414148196336244, 7.49623220797336, 7.80963189801941, 7.974521648923869, 8.099124189298397, 8.316765884214679, 8.635848718500734, 8.975464347533963, 9.092223456290965, 9.181560007151536, 10.223862164528772, 9.267187530763549, 9.920691058768153, 10.43329098280438, 11.180284470721592, 8.757257397440991, 9.82548990813439, 10.68296642580598, 1.844538366498842, 3.514545528745279, 3.72250048593372, 3.9060783549561697, 4.270716194733736, 4.999413810796919, 5.058108868565138, 5.488314309012512, 5.519125656437607, 5.704764776213017, 5.976769618087598, 6.099646705468387, 6.140877556947864, 6.2039777022

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
8,,118.988579,14285.583854,7.804174,268.646941,389.375566,12.706049,53.928846,3.595017,0
13,,150.174923,27331.361962,6.838223,299.415781,379.761835,19.370807,76.509996,4.413974,0
20,,227.435048,22305.567414,10.333918,,554.820086,16.331693,45.382815,4.133423,0
22,,215.977859,17107.224226,5.607060,326.943978,436.256194,14.189062,59.855476,5.459251,0
...,...,...,...,...,...,...,...,...,...,...
3224,,198.218700,31081.735264,7.419106,,517.925946,11.711419,85.428785,3.345543,1
3229,,203.204659,10643.186771,6.828936,,384.597711,16.011328,72.911573,3.065910,1
3231,,225.754109,28194.452646,5.892830,366.201583,418.272901,17.306832,103.912548,3.855895,1
3245,,188.536608,24711.414927,7.129520,,555.548534,16.959269,56.038702,4.331691,1


In [5]:
# Pruebas para ver que este cargando nans correctamente

df_0 = descarte(df, 'Potability', 0)
print('CERO\n')
print(df_0['ph'].median())
print(df_0['Sulfate'].median())
print(df_0['Trihalomethanes'].median())
df_1 = descarte(df, 'Potability', 1)
print('UNO\n')
print(df_1['ph'].median())
print(df_1['Sulfate'].median())
print(df_1['Trihalomethanes'].median())
df = carga_nans(df, df_0, df_1)

'''
DA UNO MAS EN TODOS, POR QUE QUEDA LA MEDIANA INICIAL CON EL VALOR DE POTABILITY YA SEA 0 O 1
PH: 
314 => 7.035455515887571
178 => 7.036752103833548

SULFATE
488 => 333.38942610169323
294 => 331.8381671295742

TRIHALOMETHANES
108 => 66.54219804427565
56 => 66.6782137100115


'''
i = 0
for value in df['Trihalomethanes']:
    if value == 66.6782137100115:
        i += 1
print('\n i: ',i)
    

CERO

7.035455515887571
333.38942610169323
66.54219804427565
UNO

7.036752103833548
331.8381671295742
66.6782137100115

 i:  56


In [6]:
limpieza(df)

[0.0, 0.2274990502021987, 0.975577989772022, 0.9899122128791388, 1.4317815547427415, 1.757037115490783, 1.844538366498842, 1.985383359263048, 2.1285314339651724, 2.376768075959951, 2.538115773481364, 2.5581027992200664, 2.5692435620279186, 2.612035914817261, 2.690831240408815, 2.798549098862777, 2.803563057437167, 2.9251743203391, 2.9454690611226875, 2.974429410478532, 3.1020755653395566, 3.14871228517616, 3.2309731059187423, 3.2616697873910208, 3.2728200975185464, 3.3376289979634506, 3.3445885334830234, 3.388090610523891, 3.41035967073924, 3.4223941642163864, 3.4264503676355647, 3.433874059145088, 3.434855759978651, 3.4450618643852127, 3.514545528745279, 3.551579176967154, 3.5908223640059966, 3.623165848258656, 3.629922064880713, 3.633162828862528, 3.637170625355805, 3.641629777473381, 3.657123132566738, 3.664710561719395, 3.6768449933847727, 3.678431807207628, 3.681076270548827, 3.6919326929347136, 3.715171359884693, 3.71608007538699, 3.717703934990307, 3.719791644371925, 3.722500485

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,7.035456,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
2,8.099124,224.236259,19909.541732,9.275884,333.389426,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
5,5.584087,188.313324,28748.687739,7.544869,326.678363,280.467916,8.399735,54.917862,2.559708,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,331.838167,392.449580,19.903225,66.678214,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,331.838167,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,331.838167,402.883113,11.168946,77.488213,4.708658,1


In [101]:
# Pruebas de la normalizacion
# Descomentar los prints para ver
# ESTO ESTABA MODIFICANDO EL DF ORIGINAL, SE ARREGLO <-
df_n = normalizacion(df)
# print('Data frame normalizado: \n',df_n)
# print('Data frame original: \n',df)
for item in df_n:
    df_n[item] = df_n[item]*df[item].max()
# print('Data frame normalizado multiplicado por el maximo de cada columna vuelve al original: \n', df_n)


In [102]:
# Limpieza de atipicos
# Sin atipicos
df = limpieza(df)

6.277742483390635
7.035455515887571
7.870853412371953


indices [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 3201, 3202, 3203, 3204, 3205, 3206, 3207, 3208, 3209, 3210, 3211, 3212, 3213, 3214, 3215, 3216, 3217, 3218, 3219, 3220, 3221, 3222, 3223, 3224, 3225, 3226, 3227, 3228, 3229, 3230, 3231, 3232, 3233, 3234, 3235, 3236, 3237, 3238, 3239, 3240, 3241, 3242, 3243, 3244, 3245, 3246, 3247, 3248, 3249, 3250, 3251, 3252, 3253, 3254, 3255, 3256, 3257, 3258, 3259, 3260, 3261, 3262, 3263, 3264, 3265, 3266, 3267, 3268, 3269, 3270, 3271, 3272, 3273, 3274, 3275]
indices: 
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 5