# Imports & global vars

In [10]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets #Retirar após o uso do dataset do Hackday
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [11]:
# Module-level frames: both start out empty and are reassigned later through
# `global` statements (rawData by loadData, outliersData by checkOutliers).
rawData, outliersData = pd.DataFrame(), pd.DataFrame()

# Functions

In [12]:
def loadData(x: str) -> None:
    """Load a CSV file into the module-level ``rawData`` DataFrame.

    Args:
        x: Path of the CSV file to read.

    Returns:
        None in every case. On success the parsed frame is stored in the
        global ``rawData``; when ``x`` is not an existing regular file a
        message is printed and ``rawData`` is left untouched.
    """
    global rawData
    # isfile (rather than exists) so that a directory path is reported as
    # missing instead of being handed to read_csv, which would raise.
    if not os.path.isfile(x):
        print("Arquivo não encontrado.")
        return None
    # read_csv already returns a DataFrame; no extra wrapping is needed.
    rawData = pd.read_csv(x)
    return

def checkData() -> None:
    """Print a textual overview of the module-level ``rawData`` frame.

    Four sections are printed in order: the ``info()`` summary, the
    ``describe()`` statistics, the per-column null counts and the column
    dtypes. Nothing is returned.
    """
    import io  # local import: only needed to capture DataFrame.info() output

    # DataFrame.info() writes its report to a buffer (stdout by default) and
    # returns None, so interpolating its return value prints "None" under the
    # header. Capture the report explicitly and print it in place instead.
    info_buffer = io.StringIO()
    rawData.info(buf=info_buffer)
    print(f"Info dos valores:\n{info_buffer.getvalue()}\n")
    print(f"Descrição dos dados:\n{rawData.describe()}\n")
    print(f"Número de nulos por coluna:\n{rawData.isnull().sum()}\n")
    print(f"Tipos das colunas:\n{rawData.dtypes}\n")
    return

def checkCorrelation():
    """Draw an annotated heatmap of the correlations between the numeric
    columns of the module-level ``rawData`` frame and display it."""
    plt.figure(figsize=(10, 8))
    # Only numeric columns take part in the correlation matrix.
    numeric_part = rawData.select_dtypes(include=[np.number])
    correlation_matrix = numeric_part.corr()
    sns.heatmap(correlation_matrix, annot=True)
    plt.show()
    return

def checkOutliers() -> None:
    """Collect the rows of ``rawData`` flagged as outliers by the 1.5 * IQR rule.

    For every numeric column, rows whose value lies below ``Q1 - 1.5 * IQR``
    or above ``Q3 + 1.5 * IQR`` are selected. The selections of all columns
    are concatenated, duplicate rows are dropped, the index is reset and the
    result is stored in the module-level ``outliersData``. A random sample of
    at most five of those rows is printed. Nothing is returned.
    """
    global outliersData
    flagged_frames = []
    # Iterate over the numeric columns only
    for column in rawData.select_dtypes(include=[np.number]).columns:
        values = rawData[column]
        # First and third quartiles and the interquartile range
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        # Lower and upper limits beyond which a value counts as an outlier
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Keep the full rows whose value in this column is out of bounds
        flagged_frames.append(rawData[(values < lower_bound) | (values > upper_bound)])
    # Concatenate once instead of growing a frame inside the loop; with no
    # numeric columns there is nothing to concatenate, so fall back to an
    # empty frame.
    outliers_df = pd.concat(flagged_frames) if flagged_frames else pd.DataFrame()
    outliersData = outliers_df.drop_duplicates().reset_index(drop=True)
    # sample() raises when asked for more rows than exist, so cap the size.
    sample_size = min(5, len(outliersData))
    print(f"Sample de outliers:\n{outliersData.sample(sample_size)}")
    return

# Loading data

In [13]:
# Fill the global rawData frame from dataset/concatData.csv (path relative to
# the working directory). loadData prints a message instead of raising when
# the path does not exist, so rawData may still be empty after this cell.
loadData("dataset/concatData.csv")

  rawData = pd.DataFrame(pd.read_csv(x))


# Descrição

In [14]:
# Preview ten randomly chosen rows with the author_id column left out.
df = rawData.drop(labels=["author_id"], axis="columns")
df.sample(n=10)

Unnamed: 0,book_id,title_x,first_name,last_name,birthday,country_residence,hrs_writing_day,title_y,award_name,year_won,...,country,year_established,marketing_spend,pub_id_y,series_name,sale_date,discount,item_id,order_id,mean_rating
50511,HP265,Heliotrope Pajamas,Malin,Wolff,31/1/2141,Hong Kong,6.0,Heliotrope Pajamas,Newberry Medal,2182.0,...,USA,1906.0,72000.0,CHP,,15/12/2193,,107367-4-5665,107367-46893,4.543651
118446,ST862,She Also Tottered,Robert,Plimpton,11/11/2160,Canada,13.0,,,,...,USA,1889.0,2320000.0,ESP,,13/7/2193,,107212-74-5337,107212-21645,4.088567
39531,HP265,Heliotrope Pajamas,Malin,Wolff,31/1/2141,Hong Kong,6.0,Heliotrope Pajamas,Newberry Medal,2182.0,...,USA,1906.0,72000.0,CHP,,7/12/2193,,107359-31-7340,107359-45397,4.543651
255511,TM925,The Mallemaroking,Bianca,Thompson,28/1/2150,United States,5.0,The Mallemaroking,Nebula Award,2179.0,...,USA,1889.0,2320000.0,ESP,The Mallemaroking Saga,18/7/2193,,107217-54-3410,107217-22764,4.659653
176197,TM925,The Mallemaroking,Bianca,Thompson,28/1/2150,United States,5.0,The Mallemaroking,Hugo Award,2179.0,...,USA,1889.0,2320000.0,ESP,The Mallemaroking Saga,20/7/2193,,107219-2-7053,107219-23287,4.659653
193625,TM925,The Mallemaroking,Bianca,Thompson,28/1/2150,United States,5.0,The Mallemaroking,Hugo Award,2179.0,...,USA,1889.0,2320000.0,ESP,The Mallemaroking Saga,30/5/2193,,107168-55-7536,107168-13829,4.659653
268331,TM925,The Mallemaroking,Bianca,Thompson,28/1/2150,United States,5.0,The Mallemaroking,Nebula Award,2179.0,...,USA,1889.0,2320000.0,ESP,The Mallemaroking Saga,2/9/2193,,107263-97-5886,107263-34075,4.659653
22911,CP573,Concerning Prophecy,Grace,Harrison,8/4/2161,United States,6.0,,,,...,USA,1889.0,2320000.0,ESP,,28/8/2193,,107258-16-2624,107258-32729,3.72619
75759,NR695,9803 North Millworks Road,Carolyn,Segal,24/9/2133,United States,7.0,,,,...,USA,1889.0,2320000.0,ESP,Inspector Ryeslton,26/9/2193,,107287-51-1245,107287-37509,4.089084
27980,HP265,Heliotrope Pajamas,Malin,Wolff,31/1/2141,Hong Kong,6.0,Heliotrope Pajamas,Newberry Medal,2182.0,...,USA,1906.0,72000.0,CHP,,14/6/2193,,107183-59-9959,107183-15980,4.543651


# Limpeza

In [15]:
# Start the cleaned frame from the raw data without the author_id column.
cleanData = rawData.drop(labels=["author_id"], axis="columns")