# Desafío - Conceptos previos a Big Data

In [1]:
import random
from collections import Counter

def create_random_row():
    """Simulate one customer record.

    Returns:
        tuple: ``(age, income, employment_status, use_in_minutes,
        debt_status, churn_status)``.

        * ``age`` is an int between 18 and 90, both ends included.
        * ``income`` and ``use_in_minutes`` are ints from 10000 up to
          999000 in steps of 1000.
        * ``employment_status`` is 'Unemployed' or 'Employed'.
        * ``debt_status`` is 'Debt' or 'No Debt'.
        * ``churn_status`` is 'Churn' or 'No Churn'.
    """

    def draw_thousands():
        # Shared generator for the two columns that use the same numeric range.
        return random.randrange(10000, 1000000, 1000)

    # The draws happen in this exact order so that a seeded generator
    # yields the same row as before; only the returned order differs
    # (usage minutes is placed before the debt flag).
    years = random.randint(18, 90)
    earnings = draw_thousands()
    job = random.choice(('Unemployed', 'Employed'))
    debt = random.choice(('Debt', 'No Debt'))
    minutes = draw_thousands()
    churn = random.choice(('Churn', 'No Churn'))

    return (years, earnings, job, minutes, debt, churn)

## Ejercicio 1

In [2]:
# Size of the simulated dataset: ten million independently drawn rows.
N_ROWS = 10_000_000
random_database = [create_random_row() for _ in range(N_ROWS)]

## Ejercicio 2

```python
employment_income_looped = 0

for i in random_database:
    if i[2] == 'Employed':
        employment_income_looped += i[1]

# retorno
2523162067000
```

### ¿Qué retornará la variable `employment_income_looped`?

El total del ingreso de las personas con empleo

### ¿Cómo sería una implementación del código utilizando map y filter ?

In [3]:
%%time

def employed_only(row):
    """Return True when the row's employment status (index 2) is 'Employed'."""
    return row[2] == 'Employed'


def to_income(row):
    """Return the income stored at index 1 of the row."""
    return row[1]


# Lazily keep the employed rows, project each one to its income, and add them up.
employed_rows = filter(employed_only, random_database)
employment_income = sum(map(to_income, employed_rows))

CPU times: user 1.39 s, sys: 24.2 ms, total: 1.42 s
Wall time: 1.42 s


### ¿Son iguales los resultados?

Primero hay que ejecutar el código:

In [4]:
%%time

# Baseline loop from the exercise statement: accumulate the income (index 1)
# of every row whose employment status (index 2) is 'Employed'.
employment_income_looped = 0

for i in random_database:
    if i[2] == 'Employed':
        employment_income_looped += i[1]

CPU times: user 1.28 s, sys: 5.38 ms, total: 1.28 s
Wall time: 1.28 s


Luego se compara con el resultado anterior:

In [5]:
# True when the map/filter total equals the loop-based total.
employment_income == employment_income_looped

True

Sí, son iguales los resultados

## Ejercicio 3

Desde la gerencia le solicitan mejorar la siguiente línea de código:

In [6]:
%%time

# Baseline loop from the exercise statement: scan every field of every row
# and count how many of them hold the literal 'Debt'.
count_debts_looped = 0

for i in random_database:
    for j in i:
        # Exact comparison: the value 'No Debt' does not match 'Debt'.
        if j == 'Debt':
            count_debts_looped += 1

CPU times: user 3.9 s, sys: 11 ms, total: 3.91 s
Wall time: 3.92 s


### ¿Cuál será el retorno de la variable `count_debts_looped` ?

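La cantidad de personas que están endeudadas: el ciclo cuenta cada campo cuyo valor es exactamente `'Debt'`, y ese valor solo aparece en la columna de deuda.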

### ¿Cuál es la complejidad algorítmica del código?

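Es $O(n)$: el ciclo externo recorre las $n$ filas y el interno las 6 columnas de cada una, es decir, $6n$ comparaciones (filas * columnas).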

### ¿Cómo sería una implementación del código utilizando `map` y `filter`?

In [7]:
%%time

# Step 1: project every row to its debt-status field (index 4).
debt_statuses = map(lambda row: row[4], random_database)
# Step 2: keep only the fields that are exactly 'Debt'.
indebted = filter(lambda status: status == 'Debt', debt_statuses)
# Step 3: turn each surviving field into a 1 and add them up to get the count.
count_debts = sum(map(lambda _status: 1, indebted))

CPU times: user 1.47 s, sys: 6.15 ms, total: 1.48 s
Wall time: 1.48 s


### ¿Son iguales los resultados de ambas operaciones?

In [8]:
# True when the map/filter count equals the loop-based count.
count_debts == count_debts_looped

True

Sí, son iguales

## Ejercicio 4

Desde la gerencia le solicitan mejorar la siguiente línea de código:

In [9]:
%%time

# Baseline loop from the exercise statement: collect the rows into two lists
# depending on whether one of their fields equals 'Churn' or 'No Churn'.
churn_subset, no_churn_subset = [], []

for i in random_database:
    # First pass over the row's fields: look for the exact value 'Churn'.
    for j in i:
        if j == 'Churn':
            churn_subset.append(i)
    # Second pass over the same fields: look for the exact value 'No Churn'.
    for j in i:
        if j == 'No Churn':
            no_churn_subset.append(i)

CPU times: user 7.67 s, sys: 44.2 ms, total: 7.71 s
Wall time: 7.72 s


### ¿Cuál será el retorno de la variable `churn_subset` y `no_churn_subset`?

`churn_subset` es una lista con las observaciones que se cambiaron recientemente, y `no_churn_subset` es una lista con las observaciones que no se cambiaron recientemente. Es decir, el código particiona `random_database` en base al `churn`.

### ¿Cuál es la complejidad algorítmica del código?

Es $O(n)$: por cada una de las $n$ filas se recorren dos veces sus 6 columnas, es decir, $12n$ comparaciones (filas * 2 * columnas).

### ¿Cómo sería una implementación del código utilizando `map` y `filter`?

In [10]:
%%time 
def churn_status_is(expected):
    """Build a predicate that compares a row's churn field (index 5) with ``expected``."""
    return lambda row: row[5] == expected


# Two independent filter passes, one per churn label.
churn = list(filter(churn_status_is('Churn'), random_database))
no_churn = list(filter(churn_status_is('No Churn'), random_database))

CPU times: user 1.85 s, sys: 38.5 ms, total: 1.89 s
Wall time: 1.89 s


### ¿Son iguales los resultados de ambas operaciones?

Para comparar si son iguales, primero compararé los largos de las listas y luego item a item

In [11]:
def are_equal(s1, s2):
    """Tell whether two sequences hold equal items at every position.

    Args:
        s1: First sized iterable.
        s2: Second sized iterable.

    Returns:
        bool: False when the lengths differ or when any pair of items at
        the same position compares unequal; True otherwise (two empty
        sequences are equal).
    """
    if len(s1) != len(s2):
        return False

    # any() stops at the first mismatch, so no list of mismatches is built
    # and unequal inputs are rejected without scanning the remaining items.
    return not any(theirs != ours for ours, theirs in zip(s1, s2))

In [12]:
# Position-by-position comparison of the filter-based list with the loop-built list.
are_equal(churn, churn_subset)

True

In [13]:
# Position-by-position comparison of the filter-based list with the loop-built list.
are_equal(no_churn, no_churn_subset)

True

Sí, son iguales los resultados.

### Estime la media, la varianza, el mínimo y el máximo de la edad para ambos subsets, sin utilizar librerías externas.

In [13]:
def report(subset):
    """Print the mean, population variance, minimum and maximum of the ages.

    Args:
        subset: Non-empty iterable of rows whose first element (index 0)
            is the age.

    Raises:
        ValueError: If ``subset`` contains no rows, because none of the
            statistics is defined in that case.
    """
    # Extract the ages once instead of re-walking the rows for every statistic.
    ages = [row[0] for row in subset]
    n = len(ages)
    if n == 0:
        raise ValueError("report() necesita un subset con al menos una fila")

    mean = sum(ages) / n
    # Two-pass population variance: averaging squared deviations avoids the
    # cancellation of the E[x^2] - mean^2 shortcut and can never go negative.
    variance = sum((age - mean) ** 2 for age in ages) / n

    print("La edad media es", mean)
    print("La varianza es", variance)
    print("La edad mínima es", min(ages))
    print("La edad máxima es", max(ages))

In [14]:
# Age statistics for the rows whose churn field is 'No Churn'.
report(no_churn)

La edad media es 53.99098075534424
La varianza es 443.76326474241705
La edad mínima es 18
La edad máxima es 90


In [15]:
# Age statistics for the rows whose churn field is 'Churn'.
report(churn)

La edad media es 53.99351309519286
La varianza es 443.96613159336766
La edad mínima es 18
La edad máxima es 90


## Ejercicio 5

In [16]:
# Baseline loop: count the rows in each of the eight combinations of
# employment status (index 2), debt status (index 4) and churn status (index 5).
# One counter per combination, all starting at zero.
unemployed_debt_churn = 0
unemployed_nodebt_churn = 0
unemployed_debt_nochurn = 0
unemployed_nodebt_nochurn = 0
employed_debt_churn = 0
employed_nodebt_churn = 0
employed_debt_nochurn = 0
employed_nodebt_nochurn = 0

for i in random_database:
    # Independent `if` statements (not `elif`): all eight conditions are
    # evaluated for every row, although they are mutually exclusive.
    if i[2] == 'Unemployed' and i[4] == 'Debt' and i[5] == 'Churn':
        unemployed_debt_churn += 1
    if i[2] == 'Unemployed' and i[4] == 'No Debt' and i[5] == 'Churn':
        unemployed_nodebt_churn += 1
    if i[2] == 'Unemployed' and i[4] == 'Debt' and i[5] == 'No Churn':
        unemployed_debt_nochurn += 1
    if i[2] == 'Unemployed' and i[4] == 'No Debt' and i[5] == 'No Churn':
        unemployed_nodebt_nochurn += 1
    if i[2] == 'Employed' and i[4] == 'Debt' and i[5] == 'Churn':
        employed_debt_churn += 1
    if i[2] == 'Employed' and i[4] == 'No Debt' and i[5] == 'Churn':
        employed_nodebt_churn += 1
    if i[2] == 'Employed' and i[4] == 'Debt' and i[5] == 'No Churn':
        employed_debt_nochurn += 1
    if i[2] == 'Employed' and i[4] == 'No Debt' and i[5] == 'No Churn':
        employed_nodebt_nochurn += 1

# Print one line per combination, in the same order as the counters above.
print("Unemployed, Debt, Churn:", unemployed_debt_churn)
print("Unemployed, No Debt, Churn:", unemployed_nodebt_churn)
print("Unemployed, Debt, No Churn:", unemployed_debt_nochurn)
print("Unemployed, No Debt, No Churn: ", unemployed_nodebt_nochurn)
print("Employed, Debt, Churn: ", employed_debt_churn)
print("Employed, No Debt, Churn:", employed_nodebt_churn)
print("Employed, Debt, No Churn:", employed_debt_nochurn)
print("Employed, No Debt, No Churn:", employed_nodebt_nochurn)

Unemployed, Debt, Churn: 1249865
Unemployed, No Debt, Churn: 1250859
Unemployed, Debt, No Churn: 1249457
Unemployed, No Debt, No Churn:  1249155
Employed, Debt, Churn:  1249506
Employed, No Debt, Churn: 1250460
Employed, Debt, No Churn: 1251184
Employed, No Debt, No Churn: 1249514


### ¿Cómo sería una implementación utilizando map ?

In [19]:
# (employment, debt, churn) combinations, in the order in which `result`
# reports their counts.
groups = [
    ('Unemployed', 'Debt', 'Churn'),
    ('Unemployed', 'No Debt', 'Churn'),
    ('Unemployed', 'Debt', 'No Churn'),
    ('Unemployed', 'No Debt', 'No Churn'),
    ('Employed', 'Debt', 'Churn'),
    ('Employed', 'No Debt', 'Churn'),
    ('Employed', 'Debt', 'No Churn'),
    ('Employed', 'No Debt', 'No Churn'),
]

# One predicate per combination, derived from `groups` instead of being
# written out by hand. Kept so that `filters` remains defined for any cell
# that still refers to it; the counting below no longer needs it.
# `group=group` freezes the current combination inside each lambda.
filters = [lambda i, group=group: (i[2], i[4], i[5]) == group for group in groups]

# Single pass over the data: map every row to its (employment, debt, churn)
# key and let Counter tally the keys, instead of re-scanning all rows once
# per combination.
group_counts = Counter(map(lambda row: (row[2], row[4], row[5]), random_database))

# Counter returns 0 for a combination that never occurs, so `result` always
# has one entry per group, in the order of `groups`.
result = [group_counts[group] for group in groups]

### ¿Son iguales los resultados de ambas operaciones?

In [17]:
# Human-readable label for each combination. The order matters: entry k is
# printed next to `result[k]`, so it must follow the order in which `result`
# is computed.
texts = [
    'Unemployed, Debt, Churn',
    'Unemployed, No Debt, Churn',
    'Unemployed, Debt, No Churn',
    'Unemployed, No Debt, No Churn',
    'Employed, Debt, Churn',
    'Employed, No Debt, Churn',
    'Employed, Debt, No Churn',
    'Employed, No Debt, No Churn'
]

In [20]:
# Show every label next to the count stored at the same position of `result`.
for position in range(len(texts)):
    label, count = texts[position], result[position]
    print(f"{label}: {count}")

Unemployed, Debt, Churn: 1249865
Unemployed, No Debt, Churn: 1250859
Unemployed, Debt, No Churn: 1249457
Unemployed, No Debt, No Churn: 1249155
Employed, Debt, Churn: 1249506
Employed, No Debt, Churn: 1250460
Employed, Debt, No Churn: 1251184
Employed, No Debt, No Churn: 1249514


Sí, lo son 😊