<div style="background-color:#173267;">
    <br>
    <img src="logo-uc-01.svg" alt="Logo UC" style="display:block;margin:0 auto;width:20%;">
    <p style="text-align:center;font-family:Trebuchet MS;color:white;font-size:40pt;font-weight:bold;margin:50px">
        AYUDANTÍA 7
    </p>
    <p style="text-align:center;font-family:Trebuchet MS;color:white;font-size:20pt;font-weight:bold;margin:50px">
        Introducción a la Ciencia de Datos
    </p>
    <p style="text-align:center;font-family:Trebuchet MS;color:white;font-size:12pt;">
        Felipe Gutiérrez - figutier@uc.cl<br>
        Nicolas Mendicoa - nmendicoa@uc.cl
    </p>
    <p style="text-align:center;font-family:Trebuchet MS;color:white;font-size:12pt;">        
        Notebook modificado a partir del original de:
        Vicente Agüero - vicenteaguero@uc.cl
        <br>27 de septiembre de 2022
    </p>
    <br><br>
</div>

<p style="text-align:center;font-family:Arial;color:#173267;font-size:20pt;font-weight:bold;">
    Preliminares
</p>

In [None]:
from matplotlib import pyplot as plt
import geopandas as gpd
import pandas as pd
import numpy as np
import ipywidgets

In [None]:
# Limit how much of a DataFrame pandas renders: at most 20 rows and 100 columns.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 100

<p style="text-align:center;font-family:Arial;color:#173267;font-size:20pt;font-weight:bold;">
    Spatial Join
</p>

<p style="text-align:left;font-family:Arial;color:#173267;font-size:14pt;">
    Cargar datos
</p>

In [None]:
# Load the crimes CSV into a plain (non-spatial) DataFrame.
crimes_data = pd.read_csv('data/dataset-limpo.csv')
# Show (rows, columns), then preview the first record as the cell output.
display(crimes_data.shape)
crimes_data.head(1)

In [None]:
# Turn the longitude/latitude columns into point geometries (declared as
# EPSG:4326), keep three descriptive columns, and reproject to EPSG:5641.
crimes = gpd.GeoDataFrame(
    crimes_data[['bairro', 'descricao', 'endereco']],
    geometry=gpd.points_from_xy(crimes_data.longitude, crimes_data.latitude),
    crs='epsg:4326',
).to_crs(epsg='5641')

# Size of the table plus one random row as a sanity check.
display(crimes.shape, crimes.sample(1))

# Map of the crime points, coloured by the 'bairro' column.
fig, ax = plt.subplots(figsize=(10, 10))
crimes.plot(column='bairro', ax=ax)
plt.show()

In [None]:
# Read the municipality layer and reproject it to EPSG:5641, the same CRS the
# crime points were projected to.
sao = gpd.read_file('data/Sao Paulo Municipalities/').to_crs(epsg='5641')

# Size of the table plus one random row as a sanity check.
display(sao.shape, sao.sample(1))

# Crime points (red, zorder=1) drawn over the municipalities (zorder=0),
# which are coloured by their 'CODMESO' value.
fig, ax = plt.subplots(figsize=(10, 10))
crimes.plot(color='red', zorder=1, ax=ax)
sao.plot(column='CODMESO', zorder=0, ax=ax)
plt.show()

In [None]:
# Spatial inner join with sjoin's default predicate: one output row per
# (municipality, crime point) match. Only the municipality name and the
# geometry column are kept.
sjoin = gpd.sjoin(sao, crimes, how='inner')[['NOMEMUNICP', 'geometry']]

In [None]:
# Number of joined crime rows per municipality name.
crimes_per_municipality = sjoin.groupby('NOMEMUNICP')['geometry'].count()

# Keep one representative row per municipality name and attach its count by
# name. Assigning a positional array onto a random `groupby(...).sample(1)`
# made the cell non-deterministic and only correct as long as both results
# happened to come back in the same order; mapping by key removes both issues.
gdf = sjoin.drop_duplicates(subset='NOMEMUNICP').copy()
gdf['Cantidad de Crímenes'] = gdf['NOMEMUNICP'].map(crimes_per_municipality)

fig, ax = plt.subplots(figsize=(20, 8))
# vmax=10 caps the colour scale: counts of 10 or more share the darkest red.
gdf.plot(ax=ax, column='Cantidad de Crímenes', cmap='Reds', legend=True, vmax=10, zorder=0)
# Outline every municipality, and again the ones that have crimes.
sao.geometry.boundary.plot(ax=ax, color='black', zorder=1, linewidth=0.5)
gdf.geometry.boundary.plot(ax=ax, color='black', zorder=1, linewidth=0.5)
plt.show()

<p style="text-align:center;font-family:Arial;color:#173267;font-size:20pt;font-weight:bold;">
    Describe
</p>

In [None]:
# Load the players table from CSV.
players = pd.read_csv('data/players_fifa22.csv')
# Show (rows, columns), then preview the first record as the cell output.
display(players.shape)
players.head(1)

In [None]:
# Load the teams table from CSV.
teams = pd.read_csv('data/teams_fifa22.csv')
# Show (rows, columns), then preview the first record as the cell output.
display(teams.shape)
teams.head(1)

In [None]:
# Summary statistics of `players` with pandas' default settings.
players.describe()

In [None]:
# Same summary with an explicit, finer list of percentiles, from 0 (the
# minimum) to 1 (the maximum).
players.describe(percentiles=[0, 0.005, 0.01, 0.05, 0.1, 0.2, 0.25, 0.5, 0.75, 0.90, 0.99, 1])

In [None]:
# Summary statistics for the single 'Age' column.
players['Age'].describe()

In [None]:
# Column-wise mean, restricted to the numeric columns.
players.mean(numeric_only=True)

In [None]:
# Column-wise standard deviation, restricted to the numeric columns.
players.std(numeric_only=True)

In [None]:
# Mean of every numeric column per nationality. `numeric_only=True` is needed
# on pandas >= 2.0, where averaging non-numeric columns raises a TypeError
# instead of silently dropping them; it also matches the frame-level
# `players.mean(numeric_only=True)` call used in this notebook.
players.groupby('Nationality').mean(numeric_only=True)

In [None]:
%%time
# Summary statistics per (Nationality, Age) group on a random sample of 20
# players; the %%time magic reports how long the cell takes.
players.sample(20).groupby(['Nationality', 'Age']).describe()

<p style="text-align:center;font-family:Arial;color:#173267;font-size:20pt;font-weight:bold;">
    Value Counts
</p>

In [None]:
# List the column labels of the players table.
players.columns

In [None]:
# Number of rows per nationality, using the Series method.
players['Nationality'].value_counts()

In [None]:
# The same count through the DataFrame method, selecting the column by name.
players.value_counts('Nationality')

In [None]:
%%timeit
# Timed by %%timeit: the Series.value_counts variant.
players['Nationality'].value_counts()

In [None]:
%%timeit
# Timed by %%timeit: the DataFrame.value_counts variant.
players.value_counts('Nationality')

In [None]:
# Frequency of each (Nationality, Age) combination.
players.value_counts(['Nationality', 'Age'])

<p style="text-align:center;font-family:Arial;color:#173267;font-size:20pt;font-weight:bold;">
    Random
</p>

In [None]:
# Evaluate the module object itself so the notebook displays its repr.
np.random

In [None]:
# 4 samples drawn uniformly from [0, 1).
np.random.rand(4)

In [None]:
# A 2x2 array of uniform [0, 1) samples.
np.random.rand(2, 2)

In [None]:
# An array of shape (2, 2, 3) of uniform [0, 1) samples.
np.random.rand(2, 2, 3)

In [None]:
# Histogram of 1000 uniform samples, using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(np.random.rand(1000))
plt.show()

In [None]:
# 10 samples from the standard normal distribution.
np.random.randn(10)

In [None]:
# Histogram (100 bins) of one million standard-normal samples.
fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(np.random.randn(1000000), bins=100)
plt.show()

In [None]:
# One random integer in [0, 100); the upper bound is exclusive.
np.random.randint(0, 100)

In [None]:
# 10 random integers in [0, 100).
np.random.randint(0, 100, 10)

In [None]:
# 10 uniform floats in [0, 1).
np.random.random(10)

In [None]:
# The 27 letters of the Spanish alphabet, one per array element. The previous
# literal skipped 'w' ("...uvxyz"), so that letter could never be drawn by the
# sampling examples that use this array.
a = np.array(list('abcdefghijklmnñopqrstuvwxyz'))
a

In [None]:
# Draw 10 elements from `a` with replacement, so repeats are allowed.
np.random.choice(a, size=10, replace=True)

In [None]:
# Returns a shuffled copy; `a` itself is not modified.
np.random.permutation(a)

In [None]:
# Display `a` to check that `permutation` left it untouched.
a

In [None]:
# Shuffles `a` in place and returns None, so this cell shows no output.
np.random.shuffle(a)

In [None]:
# Display `a` again: it now reflects the in-place shuffle.
a

In [None]:
np.random.seed(2305) # Seed the global RandomState
# 5 uniform samples drawn right after seeding.
np.random.rand(5)

In [None]:
# Re-seeding with the same value makes the next draw reproduce the same 5 numbers.
np.random.seed(2305)
np.random.rand(5)

In [None]:
# No re-seed here: the generator continues its stream, so these are the next
# 5 values rather than a repeat of the seeded draw.
np.random.rand(5)

<p style="text-align:center;font-family:Arial;color:#173267;font-size:20pt;font-weight:bold;">
    Normal Distribution (Gaussian)
</p>

<p style="text-align:left;font-family:Arial;color:#173267;font-size:14pt;">
    $$X\sim N(0,1)$$
    $$f(x)=\frac{1}{\sqrt{2 \pi}}\cdot \exp \left\{-\frac{x^2}{2}\right\}$$
    <br><br>
    $$X\sim N(\mu, \sigma)$$
    $$f(x)=\frac{1}{\sqrt{2 \pi \sigma^2}}\cdot \exp \left\{-\frac{(x - \mu)^2}{2\sigma^2}\right\}$$
    <br><br>
    $$X\sim N(\mu, \sigma) \Rightarrow Y = \frac{X - \mu}{\sigma} \sim N(0, 1)$$
</p>

In [None]:
# Reproducible sample: seed the global generator, then draw 200 values from
# the standard normal distribution.
np.random.seed(0)
X = np.random.standard_normal(size=200)

In [None]:
# Line plot of the sample values against their index.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(X)
plt.show()

In [None]:
# Histogram of the sample with 21 bins (raw counts).
fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(X, bins=21)
plt.show()

In [None]:
def f(t, mu=0.0, sigma=1.0):
    """Probability density of a normal distribution evaluated at `t`.

    Parameters
    ----------
    t : scalar or array-like
        Point(s) at which the density is evaluated.
    mu : float, optional
        Mean of the distribution. Defaults to 0.
    sigma : float, optional
        Standard deviation of the distribution (must be non-zero).
        Defaults to 1.

    Returns
    -------
    Same shape as `t`
        ``1 / sqrt(2*pi*sigma**2) * exp(-(t - mu)**2 / (2*sigma**2))``.
        With the defaults this is the standard normal density, i.e. exactly
        what the previous one-argument lambda computed.
    """
    # Standardize first so the same expression covers the general case.
    z = (t - mu) / sigma
    return (1 / (sigma * np.sqrt(2 * np.pi))) * np.exp(-z**2 / 2)

In [None]:
# Evaluation grid for the density curve; `t` is reused by later cells.
t = np.linspace(-3.5, 3.5, num=1000)

# Normalized histogram of the sample with the density f drawn on top in red.
fig, ax = plt.subplots(figsize=(10, 10))
ax.hist(X, bins=21, density=True)
ax.plot(t, f(t), color='red', linewidth=10)
plt.show()

In [None]:
@ipywidgets.interact(n=(10, 100000, 1000), bins=(10, 100, 5))
def normal_plot(n=10, bins=10):
    """Compare a normalized histogram of `n` standard-normal draws with f.

    Both arguments are driven by sliders built from the (min, max, step)
    tuples given to `interact`. The generator is re-seeded with 0 on every
    call, so a given `n` always produces the same sample. Relies on the
    notebook-level `t` grid and density function `f`.
    """
    np.random.seed(0)
    samples = np.random.randn(n)

    plt.figure(figsize=(8, 8))
    plt.hist(samples, bins=bins, density=True, zorder=0)
    plt.plot(t, f(t), color='red', linewidth=3, zorder=1)
    # Fixed axes so the picture stays comparable while the sliders move.
    plt.xlim((-3.5, 3.5))
    plt.ylim((0, 1))
    plt.show()

In [None]:
@ipywidgets.interact(n=np.logspace(0, 6, num=20, dtype=int))
def normal_plot(n=10):
    """Scatter `n` points whose two coordinates are independent standard normals.

    Points are coloured by their distance to the origin. The sample size is
    picked from 20 log-spaced integer options between 1 and 10**6.

    NOTE(review): this reuses the name of the earlier histogram widget and
    therefore shadows it. Also, the default 10 does not appear to be one of
    the log-spaced options, so confirm which value the widget starts on.
    """
    xs = np.random.randn(n)
    ys = np.random.randn(n)
    distance_to_origin = np.sqrt(xs**2 + ys**2)

    plt.figure(figsize=(10, 8))
    plt.scatter(xs, ys, c=distance_to_origin, cmap='jet', s=10)
    plt.xlim((-3.5, 3.5))
    plt.ylim((-3.5, 3.5))
    plt.colorbar()
    plt.grid(alpha=0.25)
    plt.title(n)
    plt.show()

<p style="text-align:left;font-family:Arial;color:#173267;font-size:14pt;">
    Normalización Estándar
    $$X\sim N(\mu, \sigma) \Rightarrow Y = \frac{X - \mu}{\sigma} \sim N(0, 1)$$
</p>

In [None]:
# Random sample of 5 rows from the teams table.
teams.sample(5)

In [None]:
# Summary statistics of the teams table.
teams.describe()

In [None]:
# Standardize 'Overall': subtract its mean, then divide by its standard deviation.
teams_normal = (
    teams['Overall']
    .sub(teams['Overall'].mean())
    .div(teams['Overall'].std())
)

In [None]:
# Normalized histogram of the standardized 'Overall' values, with the density
# f evaluated on the grid `t` drawn on top for comparison.
fig, ax = plt.subplots(figsize=(10, 10))
ax.hist(teams_normal, bins=25, density=True)
ax.plot(t, f(t), color='red', linewidth=3)
plt.show()

In [None]:
# Teams whose standardized 'Overall' exceeds 2.5, i.e. more than 2.5 standard
# deviations above the mean.
teams[teams_normal > 2.5]

<p style="text-align:left;font-family:Arial;color:#173267;font-size:14pt;">
    Normalización lineal
    $$X \in \left[X_\text{mín}, X_\text{máx}\right] \Rightarrow Y \in [0, 1]$$
    <br>
    $$Y = \frac{X - X_\text{mín}}{X_\text{máx} - X_\text{mín}}$$
    <br><br>
    $$X \in \left[X_\text{mín}, X_\text{máx}\right] \Rightarrow Y \in [a, b]$$
    <br>
    $$Y = a + \frac{(X - X_\text{mín})(b - a)}{X_\text{máx} - X_\text{mín}}$$
</p>

In [None]:
# Min-max normalization of 'Attack' onto [0, 1]: shift by the minimum, then
# divide by the range (max - min).
teams_0_to_1 = (
    teams['Attack']
    .sub(teams['Attack'].min())
    .div(teams['Attack'].max() - teams['Attack'].min())
)

# Normalized histogram of the rescaled values.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(teams_0_to_1, bins=25, density=True)
plt.show()

In [None]:
def teams_a_to_b(a, b):
    """Linearly rescale ``teams['Attack']`` from its own [min, max] onto [a, b].

    Parameters
    ----------
    a, b : float
        Lower and upper ends of the target interval.

    Returns
    -------
    pandas.Series
        ``a + (x - min) * (b - a) / (max - min)`` for every 'Attack' value x,
        so the smallest value maps to `a` and the largest to `b`.
    """
    attack = teams['Attack']
    attack_min = attack.min()
    attack_max = attack.max()
    return a + ((attack - attack_min) * (b - a)) / (attack_max - attack_min)


# Histogram of 'Attack' rescaled to the narrow interval [0.5, 0.51].
plt.figure(figsize=(10, 5))
plt.hist(teams_a_to_b(0.5, 0.51), bins=25, density=True)
plt.show()