In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [2]:
# Load a CSV file into a pandas DataFrame
def read_dataset(path):
    """Read the CSV located at ``path`` and return its contents.

    Args:
        path: Location of the CSV; forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    return pd.read_csv(path)

# Print the mean of every numeric column
def mean_for_each_column(dataset):
    """Print a header followed by the per-column mean of ``dataset``.

    Only numeric columns are considered (``numeric_only=True``). The result
    is written to stdout and followed by one blank line; nothing is returned.

    Args:
        dataset: DataFrame whose column means are printed.
    """
    print("..........Mean for each column..........")
    column_means = dataset.mean(numeric_only=True)
    # print() stringifies the Series; end="\n\n" leaves one blank line after it.
    print(column_means, end="\n\n")

# Print the median of every numeric column
def median_for_each_column(dataset):
    """Print a header followed by the per-column median of ``dataset``.

    Only numeric columns are considered (``numeric_only=True``). The result
    is written to stdout and followed by one blank line; nothing is returned.

    Args:
        dataset: DataFrame whose column medians are printed.
    """
    print("..........Median for each column..........")
    column_medians = dataset.median(numeric_only=True)
    # print() stringifies the Series; end="\n\n" leaves one blank line after it.
    print(column_medians, end="\n\n")

# Print the mode (most frequent value) of every column
def mode_for_each_column(dataset):
    """Print a header followed by the first mode of each column.

    ``DataFrame.mode()`` can return several rows when a column has ties;
    only the first row is printed. When the dataset has no rows (or only
    missing values) there is no mode row at all, so a short notice is
    printed instead of raising ``IndexError``.

    Args:
        dataset: DataFrame whose column modes are printed.
    """
    print("..........Mode for each column..........")
    modes = dataset.mode()
    if modes.empty:
        # No first row to select: .iloc[0] would raise IndexError here.
        print("No mode available: the dataset has no non-missing values.\n")
        return
    print(str(modes.iloc[0]) + "\n")

# Print the percentage of missing values per column
def missing_value_percentage_for_each_column(dataset):
    """Print, for every column, the share of missing values as a percentage.

    The percentage is the count of null cells in the column divided by the
    number of rows, times 100. Output goes to stdout; nothing is returned.

    Args:
        dataset: DataFrame to inspect.
    """
    row_count = len(dataset)                 # number of rows
    null_counts = dataset.isna().sum()       # missing cells per column
    null_share = null_counts.div(row_count).mul(100)
    print("..........% of missing values..........")
    print(null_share)

# Draw one box plot figure covering all numeric columns
def box_plot_for_each_column(dataset):
    """Show a box plot of the numeric columns of ``dataset``.

    When the numeric selection is empty a message is printed and no figure
    is drawn. Nothing is returned.

    Args:
        dataset: DataFrame whose numeric columns are plotted.
    """
    numeric_part = dataset.select_dtypes(include='number')

    # Guard clause: nothing to draw. Note that ``.empty`` is also true when
    # the frame has numeric columns but zero rows.
    if numeric_part.empty:
        print("No numeric columns found in the dataset.")
        return

    numeric_part.boxplot(figsize=(10, 6))
    plt.title("Boxplot for all numeric columns")
    plt.xticks(rotation=45)  # tilt the x tick labels
    plt.show()

# Draw one bar chart per non-numeric column
def bar_chart_for_each_column(dataset):
    """Show a bar chart of value counts for every non-numeric column.

    A message is printed (and nothing is drawn) when the non-numeric
    selection is empty. Columns that contain only missing values have no
    counts to plot and are skipped with a notice instead of opening a
    figure and failing in the plotting call.

    Args:
        dataset: DataFrame whose non-numeric columns are plotted.
    """
    non_numeric_columns = dataset.select_dtypes(exclude='number')
    if non_numeric_columns.empty:
        print("No non-numeric columns found in the dataset.")
        return

    for column in non_numeric_columns.columns:
        value_counts = non_numeric_columns[column].value_counts()
        if value_counts.empty:
            # value_counts() drops missing values, so an all-missing column yields nothing.
            print(f"Column '{column}' has no non-missing values; skipping its bar chart.")
            continue

        plt.figure(figsize=(10, 6))
        value_counts.plot(kind='bar')
        plt.title(f"Bar chart for '{column}'")
        plt.xlabel(column)
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.tight_layout()  # adjust layout to prevent label overlap
        plt.show()

# Draw one pie chart per non-numeric column
def pie_chart_for_each_column(dataset):
    """Show a pie chart of category shares for every non-numeric column.

    A message is printed (and nothing is drawn) when the non-numeric
    selection is empty. Columns that contain only missing values have no
    categories to plot and are skipped with a notice.

    Args:
        dataset: DataFrame whose non-numeric columns are plotted.
    """
    non_numeric_columns = dataset.select_dtypes(exclude='number')

    if non_numeric_columns.empty:
        print("No non-numeric columns found in the dataset.")
        return

    for column in non_numeric_columns.columns:
        # Count the elements of each category (missing values are not counted).
        category_counts = non_numeric_columns[column].value_counts()
        if category_counts.empty:
            print(f"Column '{column}' has no non-missing values; skipping its pie chart.")
            continue

        plt.figure(figsize=(6, 6))
        category_counts.plot.pie(autopct='%1.1f%%', startangle=140)
        plt.title(f'Distribution of {column}')
        plt.ylabel('')  # remove the y-axis label
        plt.show()

# Pearson correlation heatmap of the numeric columns
def pearson_correlation(dataset):
    """Show a heatmap of the pairwise Pearson correlation of numeric columns.

    When the numeric selection is empty a message is printed and no figure
    is drawn. Nothing is returned.

    Args:
        dataset: DataFrame whose numeric columns are correlated.
    """
    numeric_part = dataset.select_dtypes(include='number')

    # Guard clause: nothing to correlate.
    if numeric_part.empty:
        print("Nenhuma coluna numérica encontrada no dataset.")
        return

    pairwise_corr = numeric_part.corr(method='pearson')

    # Fixed colour scale so -1 and +1 always map to the ends of the palette.
    plt.figure(figsize=(8, 6))
    sns.heatmap(pairwise_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Pearson-correlation')
    plt.show()

# Normalize numeric values to the [min, max] range of each column
def normalize_min_max(dataset, inplace=True):
    """Rescale every numeric column to the 0..1 range (min-max scaling).

    Each numeric column becomes ``(x - min) / (max - min)``. A constant
    column has ``max == min``; its divisor is replaced by 1 so the column
    maps to 0.0 instead of turning into NaN (0 / 0).

    Args:
        dataset: DataFrame to normalize.
        inplace: When True (the default, matching the historical behaviour)
            the numeric columns of ``dataset`` itself are overwritten. When
            False, ``dataset`` is left untouched and a normalized copy is
            returned.

    Returns:
        The normalized DataFrame: ``dataset`` itself when ``inplace`` is
        True, otherwise a copy. If the numeric selection is empty a message
        is printed and the data is returned without changes.
    """
    target = dataset if inplace else dataset.copy()

    # Select the numeric columns
    numeric_columns = dataset.select_dtypes(include='number')

    # Nothing to normalize: return the data unmodified
    if numeric_columns.empty:
        print("Nenhuma coluna numérica encontrada no dataset.")
        return target

    lowest = numeric_columns.min()
    value_range = numeric_columns.max() - lowest
    # Zero range means a constant column; divide by 1 so it becomes 0.0, not NaN.
    value_range = value_range.replace(0, 1)

    target[numeric_columns.columns] = (numeric_columns - lowest) / value_range

    return target

# Normalize numeric values to mean 0 and standard deviation 1
def normalize_mean_std(dataset, inplace=True):
    """Standardize every numeric column (z-score).

    Each numeric column becomes ``(x - mean) / std`` using pandas' default
    ``std()``. A constant column has a standard deviation of 0; its divisor
    is replaced by 1 so the column maps to 0.0 instead of NaN (0 / 0).

    Args:
        dataset: DataFrame to normalize.
        inplace: When True (the default, matching the historical behaviour)
            the numeric columns of ``dataset`` itself are overwritten. When
            False, ``dataset`` is left untouched and a normalized copy is
            returned.

    Returns:
        The normalized DataFrame: ``dataset`` itself when ``inplace`` is
        True, otherwise a copy. If the numeric selection is empty a message
        is printed and the data is returned without changes.
    """
    target = dataset if inplace else dataset.copy()

    # Select the numeric columns
    numeric_columns = dataset.select_dtypes(include='number')

    # Nothing to normalize: return the data unmodified
    if numeric_columns.empty:
        print("Nenhuma coluna numérica encontrada no dataset.")
        return target

    deviation = numeric_columns.std()
    # Zero deviation means a constant column; divide by 1 so it becomes 0.0, not NaN.
    deviation = deviation.replace(0, 1)

    # Apply the z-score standardization to the numeric columns
    target[numeric_columns.columns] = (numeric_columns - numeric_columns.mean()) / deviation

    return target

# Save the modified dataset into a separate folder
def save_cleaned_dataset(dataset, original_filename, new_folder_name='datasetsTratados'):
    """Write ``dataset`` as CSV into ``new_folder_name``, keeping the file name.

    Only the base name of ``original_filename`` is used, so the output is
    ``<new_folder_name>/<basename>``. The row index is not written. The
    destination path is printed once the file has been saved.

    Args:
        dataset: DataFrame to save.
        original_filename: Path or name of the source file; its base name
            becomes the name of the saved file.
        new_folder_name: Destination folder, created if it does not exist.
    """
    # exist_ok=True makes folder creation idempotent and avoids the
    # check-then-create race of testing os.path.exists() first.
    os.makedirs(new_folder_name, exist_ok=True)

    # Destination path for the saved dataset
    base_filename = os.path.basename(original_filename)
    new_file_path = os.path.join(new_folder_name, base_filename)

    dataset.to_csv(new_file_path, index=False)
    print(f"Dataset guardado em: {new_file_path}")


In [None]:
# Set the path to the dataset: <current working directory>/datasets/awards_players.csv
# (alternatively, a separate cell could load the file directly via upload in Jupyter)
current_dir = os.getcwd()
datasets_dir = os.path.join(current_dir, 'datasets')
awards_players_path = os.path.join(datasets_dir, 'awards_players.csv')

# Read the dataset and print it.
# NOTE(review): a later cell redefines read_dataset to return a list of dicts,
# so what gets printed here depends on which definition is live in the kernel.
try:
    file = read_dataset(awards_players_path)
    print("..........Dataset..........")
    print(file) 

except FileNotFoundError:
    print("File not found.")
except Exception as e:
    # Any other failure is reported as a one-line message (no traceback).
    print(f"Error: {e}")

In [None]:
# Set the path to the dataset: <current working directory>/datasets/awards_players.csv
# (alternatively, a separate cell could load the file directly via upload in Jupyter)
current_dir = os.getcwd()
datasets_dir = os.path.join(current_dir, 'datasets')
awards_players_path = os.path.join(datasets_dir, 'awards_players.csv')

# Read the dataset.
# NOTE(review): the helpers below call DataFrame methods, so this cell needs the
# pandas-based read_dataset; a later cell redefines read_dataset to return a list.
try:
    file = read_dataset(awards_players_path)
    
    # Compute and print the mean, median, mode and % of missing values
    mean_for_each_column(file)
    median_for_each_column(file)
    mode_for_each_column(file)
    missing_value_percentage_for_each_column(file)

except FileNotFoundError:
    print("File not found.")
except Exception as e:
    # Any other failure is reported as a one-line message (no traceback).
    print(f"Error: {e}")


In [None]:
# Set the path to the dataset: <current working directory>/datasets/awards_players.csv
# (alternatively, a separate cell could load the file directly via upload in Jupyter)
current_dir = os.getcwd()
datasets_dir = os.path.join(current_dir, 'datasets')
awards_players_path = os.path.join(datasets_dir, 'awards_players.csv')

# Read the dataset.
# NOTE(review): the plotting helpers call DataFrame methods, so this cell needs the
# pandas-based read_dataset; a later cell redefines read_dataset to return a list.
try:
    file = read_dataset(awards_players_path)

    # Draw the box plot, the bar charts and the pie charts
    box_plot_for_each_column(file)
    bar_chart_for_each_column(file)
    pie_chart_for_each_column(file)

except FileNotFoundError:
    print("File not found.")
except Exception as e:
    # Any other failure is reported as a one-line message (no traceback).
    print(f"Error: {e}")

In [None]:
# Set the path to the dataset: <current working directory>/datasets/awards_players.csv
# (alternatively, a separate cell could load the file directly via upload in Jupyter)
current_dir = os.getcwd()
datasets_dir = os.path.join(current_dir, 'datasets')
awards_players_path = os.path.join(datasets_dir, 'awards_players.csv')

# Read the dataset.
# NOTE(review): pearson_correlation calls DataFrame methods, so this cell needs the
# pandas-based read_dataset; a later cell redefines read_dataset to return a list.
try:
    file = read_dataset(awards_players_path)

    # Correlation heatmap of the numeric columns
    pearson_correlation(file)

except FileNotFoundError:
    print("File not found.")
except Exception as e:
    # Any other failure is reported as a one-line message (no traceback).
    print(f"Error: {e}")

In [None]:
# Set the path to the dataset: <current working directory>/datasets/awards_players.csv
# (alternatively, a separate cell could load the file directly via upload in Jupyter)
current_dir = os.getcwd()
datasets_dir = os.path.join(current_dir, 'datasets')
awards_players_path = os.path.join(datasets_dir, 'awards_players.csv')

# Read the dataset and show both normalizations of the raw data
try:
    file = read_dataset(awards_players_path)

    # The normalizers write into the frame they receive, so each one gets its
    # own copy: both start from the raw values and `file` stays unmodified.
    print("..........Normalizacao com base min-max..........")
    print(normalize_min_max(file.copy()))
    # Label fixed: this normalization uses the mean ("media"), not the mode ("moda").
    print("..........Normalizacao com base media e desvio padrao..........")
    print(normalize_mean_std(file.copy()))

except FileNotFoundError:
    print("File not found.")
except Exception as e:
    # Any other failure is reported as a one-line message (no traceback).
    print(f"Error: {e}")


In [15]:
import os
import csv

# Read a CSV dataset as a list of dictionaries
def read_dataset(filepath):
    """Read the CSV at ``filepath`` and return one dict per data row.

    The first line of the file supplies the keys (``csv.DictReader``); all
    values are returned as strings.

    NOTE: this definition reuses the name of the pandas-based
    ``read_dataset`` defined earlier in the notebook. Once this cell has
    run, ``read_dataset`` returns a list of dicts instead of a DataFrame.

    Args:
        filepath: Path of the UTF-8 encoded CSV file.

    Returns:
        A list with one ``dict`` (column name -> string value) per row.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    # newline='' is what the csv module requires of file objects: without it,
    # line breaks inside quoted fields are altered by newline translation.
    with open(filepath, mode='r', encoding='utf-8', newline='') as csv_file:
        return list(csv.DictReader(csv_file))

# Join team rows with their matching coach row
def join_datasets(teams_file, coaches_file):
    """Left-join team rows with coach rows on ``(year, tmID)``.

    Every dict in ``teams_file`` yields one output row. If a coach row has
    the same ``year`` and ``tmID``, the FIRST such coach row (in
    ``coaches_file`` order) is merged in with every key prefixed by
    ``coach_`` to avoid clashing with team keys. Team rows without a match
    are appended unchanged. Team entries that are not dicts are reported on
    stdout and skipped.

    The coaches are indexed once up front, so the join is a single pass over
    each input rather than a scan of all coaches per team row, and
    ``coaches_file`` may be any iterable (including a one-shot iterator).

    Args:
        teams_file: Iterable of dicts, each with ``'year'`` and ``'tmID'`` keys.
        coaches_file: Iterable of dicts, each with ``'year'`` and ``'tmID'`` keys.

    Returns:
        A list of dicts, in ``teams_file`` order.

    Raises:
        KeyError: If a row lacks the ``'year'`` or ``'tmID'`` key.
    """
    # Index coaches by join key; setdefault keeps the first row seen per key.
    first_coach_by_key = {}
    for coach_row in coaches_file:
        first_coach_by_key.setdefault((coach_row['year'], coach_row['tmID']), coach_row)

    combined_dataset = []

    for team_row in teams_file:
        # Make sure the team row is a dictionary
        if not isinstance(team_row, dict):
            print(f"Erro: Esperado dicionário, mas obtido {type(team_row)}")
            continue

        matching_coach_row = first_coach_by_key.get((team_row['year'], team_row['tmID']))

        if matching_coach_row:
            # Prefix the coach keys so they cannot overwrite team keys
            renamed_coach_row = {f"coach_{key}": value for key, value in matching_coach_row.items()}
            combined_dataset.append({**team_row, **renamed_coach_row})
        else:
            # No matching coach: keep the team row as it is
            combined_dataset.append(team_row)

    return combined_dataset

# Determine the directories of the input files: <current working directory>/datasets
current_dir = os.getcwd()
datasets_dir = os.path.join(current_dir, 'datasets')

teams_path = os.path.join(datasets_dir, 'teams.csv')
coaches_path = os.path.join(datasets_dir, 'coaches.csv')

# Read both datasets, join them and print the result
try:
    teams_file = read_dataset(teams_path)
    coaches_file = read_dataset(coaches_path)

    # Check that each read produced a non-empty list of dictionaries.
    # The ValueError raised here is caught by the generic handler below
    # and reported as "Error: ...".
    if not teams_file or not isinstance(teams_file[0], dict):
        raise ValueError("O arquivo teams.csv não foi lido corretamente.")
    if not coaches_file or not isinstance(coaches_file[0], dict):
        raise ValueError("O arquivo coaches.csv não foi lido corretamente.")

    # Join the datasets
    combined_dataset = join_datasets(teams_file, coaches_file)

    # Display the result, one joined row per line
    for row in combined_dataset:
        print(row)

except FileNotFoundError:
    print("File not found.")
except Exception as e:
    # Any other failure is reported as a one-line message (no traceback).
    print(f"Error: {e}")


{'year': '9', 'lgID': 'WNBA', 'tmID': 'ATL', 'franchID': 'ATL', 'confID': 'EA', 'divID': '', 'rank': '7', 'playoff': 'N', 'seeded': '0', 'firstRound': '', 'semis': '', 'finals': '', 'name': 'Atlanta Dream', 'o_fgm': '895', 'o_fga': '2258', 'o_ftm': '542', 'o_fta': '725', 'o_3pm': '202', 'o_3pa': '598', 'o_oreb': '340', 'o_dreb': '737', 'o_reb': '1077', 'o_asts': '492', 'o_pf': '796', 'o_stl': '285', 'o_to': '593', 'o_blk': '142', 'o_pts': '2534', 'd_fgm': '1014', 'd_fga': '2254', 'd_ftm': '679', 'd_fta': '918', 'd_3pm': '172', 'd_3pa': '502', 'd_oreb': '401', 'd_dreb': '864', 'd_reb': '1265', 'd_asts': '684', 'd_pf': '726', 'd_stl': '310', 'd_to': '561', 'd_blk': '134', 'd_pts': '2879', 'tmORB': '0', 'tmDRB': '0', 'tmTRB': '0', 'opptmORB': '0', 'opptmDRB': '0', 'opptmTRB': '0', 'won': '4', 'lost': '30', 'GP': '34', 'homeW': '1', 'homeL': '16', 'awayW': '3', 'awayL': '14', 'confW': '2', 'confL': '18', 'min': '6825', 'attend': '141379', 'arena': 'Philips Arena', 'coach_coachID': 'meadoma