# Workflows de Análise de Dados e Geoestatística

## Módulo 4 - Construindo um Modelo de Árvore

O objetivo deste módulo é criar um modelo de árvore de decisão (classificação binária).

### Imports

In [None]:
# paths
import os

# pandas
import pandas as pd
import numpy as np

# data viz
import seaborn as sns
import matplotlib.pyplot as plt

# model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# metrics
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve


In [None]:
# global seaborn look for every plot below: "ticks" style with the axes grid
# switched off, and the "talk" context
sns.set_style("ticks", rc={'axes.grid': False})
sns.set_context("talk")

### Helper functions

In [None]:
# directories
def get_directories():
    """Return the working directories used by this notebook.

    Returns:
        dict: Maps "CURRENT_DIR", "PARENT_DIR" and "DATA_DIR" to path strings.
    """
    # The quoted '__file__' is a plain relative path, so realpath() resolves
    # it against the working directory and dirname() yields that directory.
    here = os.path.dirname(os.path.realpath('__file__'))
    parent = os.path.dirname(here)

    return {
        "CURRENT_DIR": here,
        "PARENT_DIR": parent,
        "DATA_DIR": os.path.join(parent, 'data'),
    }


# metadados sobre dataframe
def meta_df(df):
    """Summarise every column of ``df``: dtype, missing values and cardinality.

    Args:
        df: DataFrame to describe.

    Returns:
        DataFrame indexed by the columns of ``df`` with the columns
        ``dtypes``, ``nmissing``, ``missing%``, ``nunique``, ``nunique%``
        and ``size`` (the row count, repeated on every row).
    """
    n_rows = df.shape[0]
    missing = df.isna().sum()
    distinct = df.nunique()

    summary = {
        'dtypes': df.dtypes,
        'nmissing': missing,
        'missing%': (missing / n_rows * 100).round(2),
        'nunique': distinct,
        'nunique%': (distinct / n_rows * 100).round(2),
        'size': n_rows,
    }
    return pd.DataFrame(summary, index=df.columns)

In [None]:
# evaluate the directory mapping as the cell's last expression to sanity-check the paths
get_directories()

In [None]:
# load the dataset from <DATA_DIR>/silver_jura.csv and preview the first rows
data_dirs = get_directories()
jura_csv_path = os.path.join(data_dirs["DATA_DIR"], "silver_jura.csv")
df_jura = pd.read_csv(jura_csv_path)
df_jura.head(10)

In [None]:
# per-column summary (dtype, missing values, distinct values) of the loaded data
meta_df(df_jura)

In [None]:
# columns used as model inputs
grade_features = [
    "vlCadmium",
    "vlCobalt",
    "vlChromium",
    "vlNickel",
    "vlLead",
    "vlZinc",
]

In [None]:
# bar plot of vlCobalt for each descUnit category
plt.figure(figsize=(10, 8))
sns.barplot(data=df_jura, x='descUnit', y='vlCobalt', color='darkslategray')
sns.despine()
plt.show()

#### Tree model

In [None]:
# hold out 27% of the samples for testing; the seed keeps the split reproducible
features = df_jura[grade_features]
unit_labels = df_jura['descUnit']
X_train, X_test, y_train, y_test = train_test_split(
    features, unit_labels, test_size=.27, random_state=1
)


def _argoviano_flag(unit):
    """Return 1 for the 'argoviano' unit and 0 for anything else."""
    return 1 if unit == 'argoviano' else 0


# binary target: 'argoviano' vs. every other unit
y_train = y_train.map(_argoviano_flag)
y_test = y_test.map(_argoviano_flag)

# shapes of the resulting train and test partitions
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(X_part.shape, y_part.shape)

In [None]:
# Fix the seed so the fitted tree (and every metric derived from it) is
# reproducible between runs: scikit-learn permutes the features at each split,
# so ties between equally good splits are otherwise broken at random.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X=X_train, y=y_train)

# hard 0/1 class predictions for the held-out samples
y_pred = clf.predict(X=X_test)
y_pred

In [None]:
# Printed in this order:
#   accuracy  - share of all test samples classified correctly
#   precision - tp / (tp + fp): of the samples predicted positive, how many are truly positive
#   recall    - tp / (tp + fn): of the truly positive samples, how many were predicted positive
#   f1        - harmonic mean of precision and recall
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

In [None]:
# A ROC curve needs a continuous score per sample; with hard 0/1 predictions
# it collapses to a single operating point. Use the predicted probability of
# the positive class (label 1) instead.
positive_col = list(clf.classes_).index(1)
y_score = clf.predict_proba(X_test)[:, positive_col]
fpr, tpr, tr = roc_curve(y_test, y_score)

In [None]:
# Several ROC points can share the same fpr (vertical segments). By default
# lineplot sorts by x and averages y over repeated x values, which would
# distort the curve, so draw the points exactly as given.
sns.lineplot(x=fpr, y=tpr, estimator=None, sort=False)

In [None]:
# predicted class labels for the test samples
y_pred

In [None]:
# true 0/1 labels of the test samples, for comparison with the predictions
y_test