# Workflows de Análise de Dados e Geoestatística

## Módulo 2 - Análise Exploratória e Visualização de Dados com Python

O objetivo deste módulo é apresentar conceitos e funções iniciais da análise exploratória de dados (ou **EDA** - *exploratory data analysis*), utilizando a linguagem Python.

A ideia é apresentar o básico de bibliotecas voltadas a dados e utilizá-las para conhecermos os dados com que estamos trabalhando.

### Imports

In [5]:
# paths
import os

# pandas
import pandas as pd

# data viz
import plotly
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [12]:
# Global seaborn look: "white" base style with the axes grid switched off.
sns.set_style(style="white", rc={"axes.grid": False})

### Helper Functions

In [7]:
# directories
def get_directories(base_dir=None):
    """Return the directories this notebook works with.

    Parameters
    ----------
    base_dir : str or path-like, optional
        Directory to treat as the current one. When omitted, the process
        working directory (``os.getcwd()``) is used.

    Returns
    -------
    dict
        ``"CURRENT_DIR"``: ``base_dir`` (or the working directory) with
        symlinks resolved; ``"PARENT_DIR"``: its parent directory;
        ``"DATA_DIR"``: the ``data`` folder inside the parent directory.
    """
    # The previous version called os.path.realpath('__file__') with the name
    # quoted, i.e. it resolved a literal file name against the working
    # directory and then took its dirname. Asking for the working directory
    # directly gives the same result without the misleading detour.
    current_dir = os.path.realpath(os.getcwd() if base_dir is None else base_dir)
    parent_dir = os.path.dirname(current_dir)
    data_dir = os.path.join(parent_dir, 'data')

    return {
        "CURRENT_DIR": current_dir,
        "PARENT_DIR": parent_dir,
        "DATA_DIR": data_dir,
    }


# metadados sobre dataframe
def meta_df(df):
    """Build a per-column summary table for ``df``.

    The result is indexed by ``df.columns`` and has these columns:

    - ``dtypes``: the column dtype (``df.dtypes``)
    - ``nmissing``: count of missing entries (``df.isna().sum()``)
    - ``missing%``: ``nmissing`` as a percentage of the row count, 2 decimals
    - ``nunique``: distinct-value count as reported by ``df.nunique()``
    - ``nunique%``: ``nunique`` as a percentage of the row count, 2 decimals
    - ``size``: the number of rows of ``df``, repeated for every column
    """
    n_rows = df.shape[0]
    missing_counts = df.isna().sum()
    unique_counts = df.nunique()

    summary = {
        'dtypes': df.dtypes,
        'nmissing': missing_counts,
        'missing%': (missing_counts / n_rows * 100).round(2),
        'nunique': unique_counts,
        'nunique%': (unique_counts / n_rows * 100).round(2),
        'size': n_rows,
    }
    return pd.DataFrame(summary, index=df.columns)

In [8]:
# Display the resolved directory mapping (CURRENT_DIR, PARENT_DIR, DATA_DIR).
get_directories()

{'CURRENT_DIR': 'C:\\Users\\lucas\\Documents\\code\\workflows-geoestatistica-data\\modulo_2',
 'PARENT_DIR': 'C:\\Users\\lucas\\Documents\\code\\workflows-geoestatistica-data',
 'DATA_DIR': 'C:\\Users\\lucas\\Documents\\code\\workflows-geoestatistica-data\\data'}

In [14]:
# Load the Jura dataset from DATA_DIR and preview its first 10 rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the
# CSV was written together with its index. Consider `index_col=0` after
# confirming nothing downstream relies on that column.
data_dirs = get_directories()
jura_csv_path = os.path.join(data_dirs["DATA_DIR"], "silver_jura.csv")

df_jura = pd.read_csv(jura_csv_path)
df_jura.head(10)

Unnamed: 0,vlLocX,vlLocY,descLanduse,descUnit,vlCadmium,vlCobalt,vlChromium,vlNickel,vlLead,vlZinc,status,rankCadmium,rankCobalt,rankChromium,rankNickel,rankPb,rankZn,descAnomalyZn
0,1.932,1.004,campo,argoviano,0.135,4.52,15.08,5.24,37.0,32.56,,0.0,0.14,0.02,0.03,0.28,0.04,normal
1,1.824,0.999,campo,argoviano,0.165,3.752,18.6,7.08,56.4,44.0,,0.0,0.07,0.06,0.07,0.68,0.12,normal
2,1.883,1.176,campo,argoviano,0.195,3.92,21.8,7.52,49.6,43.2,,0.01,0.1,0.12,0.1,0.56,0.1,normal
3,1.837,1.037,campo,argoviano,0.215,3.88,23.0,8.16,46.8,44.0,,0.01,0.09,0.14,0.11,0.51,0.12,normal
4,3.768,2.775,floresta,kimmeridgiano,0.22,10.8,26.32,17.52,31.88,45.2,,0.02,0.6,0.2,0.37,0.11,0.14,normal
5,1.839,0.995,campo,argoviano,0.225,4.24,24.88,10.12,55.6,56.0,,0.02,0.12,0.17,0.16,0.68,0.26,normal
6,2.537,3.768,campo,argoviano,0.23,3.792,18.2,5.68,26.84,32.08,,0.02,0.08,0.04,0.05,0.04,0.03,normal
7,2.502,3.988,campo,argoviano,0.24,3.76,18.92,5.2,22.36,32.24,,0.03,0.08,0.06,0.03,0.02,0.04,normal
8,1.842,0.989,campo,argoviano,0.24,4.52,27.96,11.32,52.4,56.4,,0.03,0.14,0.26,0.2,0.62,0.26,normal
9,1.897,1.22,campo,argoviano,0.26,3.48,16.24,4.76,27.0,27.2,,0.04,0.04,0.03,0.01,0.05,0.01,normal


In [15]:
# Per-column summary of df_jura: dtype, missing count/percentage,
# distinct-value count/percentage and total row count.
meta_df(df_jura)

Unnamed: 0,dtypes,nmissing,missing%,nunique,nunique%,size
vlLocX,float64,0,0.0,238,94.82,251
vlLocY,float64,0,0.0,241,96.02,251
descLanduse,object,0,0.0,4,1.59,251
descUnit,object,0,0.0,5,1.99,251
vlCadmium,float64,0,0.0,209,83.27,251
vlCobalt,float64,0,0.0,172,68.53,251
vlChromium,float64,0,0.0,198,78.88,251
vlNickel,float64,0,0.0,213,84.86,251
vlLead,float64,0,0.0,192,76.49,251
vlZinc,float64,0,0.0,183,72.91,251


### Visualização de Dados (*Data Visualization - dataviz*)

In [13]:
# Scatter plot of vlCadmium against vlZinc, coloured by descUnit.
# The DataFrame and column names are passed (rather than bare Series) so that
# plotly names the legend and hover fields after the columns.
fig = px.scatter(
    df_jura,
    x="vlCadmium",
    y="vlZinc",
    color="descUnit",
    title="Scatter Cd X Zn",
)

fig.update_layout(
    width=800,
    height=500,
    title_x=0.5,
    template="simple_white",
    font_family="Calibri",
    margin=dict(r=20, b=20),
)

# Both axes share the same line/tick styling; only the title differs.
# The titles previously read 'x' and 'y', which did not say what was plotted.
axis_style = dict(
    showline=True,
    linewidth=0.5,
    linecolor='black',
    ticks="inside",
    tickwidth=0.5,
)
fig.update_xaxes(title_text="Cd (vlCadmium)", **axis_style)
fig.update_yaxes(title_text="Zn (vlZinc)", **axis_style)
fig.show()