In [214]:
import os
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df1 = pd.read_csv('datasets/players.csv')
df2 = pd.read_csv('datasets/players_teams.csv')
df3 = pd.read_csv('datasets/awards_players.csv')
df4 = pd.read_csv('datasets/teams.csv')
df5 = pd.read_csv('datasets/teams_post.csv')
df6 = pd.read_csv('datasets/coaches.csv')
df7 = pd.read_csv('datasets/series_post.csv')

def corrige_vencedor(teams, series_post):
    # Itera sobre cada rodada ('F', 'CF', 'FR') para ajustar cada fase dos playoffs
    for round_type in ['FR', 'CF', 'F']:
        # Filtra a série específica da rodada
        series_round = series_post[series_post['round'] == round_type]
        
        # Atualiza cada série individualmente
        for _, row in series_round.iterrows():
            year = row['year']
            winner_id = row['tmIDWinner']
            loser_id = row['tmIDLoser']
            
            # Define as colunas que correspondem às rodadas
            if round_type == 'FR':
                round_column = 'firstRound'
            elif round_type == 'CF':
                round_column = 'semis'
            elif round_type == 'F':
                round_column = 'finals'
            
            # Marca o time vencedor como "W" na rodada correspondente
            teams.loc[(teams['year'] == year) & (teams['tmID'] == winner_id), round_column] = 'W'
            
            # Marca o time perdedor como "L" na rodada correspondente
            teams.loc[(teams['year'] == year) & (teams['tmID'] == loser_id), round_column] = 'L'
    
    return teams

teams_file = corrige_vencedor(df4, df7)

players_teams_file = df2.drop(columns=['lgID'])
players_file = df1[df1['pos'].notna() & (df1['pos'] != '')]
players_file = players_file.drop(columns=['firstseason', 'lastseason', 'deathDate', 'collegeOther'])
players_file['college'] = players_file['college'].apply(lambda x: 1 if pd.notnull(x) else 0)
merged_df = pd.merge(players_teams_file, players_file, left_on='playerID', right_on='bioID', how='left')
merged_df = merged_df.drop(columns=['bioID'])
awards_players_file = df3.drop(columns=['lgID'])
teams_file = df4.drop(columns=['lgID', 'divID', 'tmORB','tmDRB','tmTRB','opptmORB','opptmDRB','opptmTRB','seeded'])
teams_file['playoff'] = teams_file['playoff'].apply(lambda x: 1 if x=='Y' else 0)

team_post_file = df5.drop(columns=['lgID'])
series_post_file = df7.drop(columns=['lgIDWinner', 'lgIDLoser'])
coaches_file = df6.drop(columns=['lgID'])


awards_grouped = awards_players_file.groupby(['playerID', 'year'])['award'].apply(list).reset_index()
awards_grouped['award'] = awards_grouped['award'].apply(lambda x: x if isinstance(x, list) else [])

merged_df = pd.merge(merged_df, awards_grouped, on=['playerID', 'year'], how='left')
merged_df['award'] = merged_df['award'].apply(lambda x: x if isinstance(x, list) else [])
merged_df = pd.merge(merged_df, teams_file, on=['tmID','year'], how = 'left')

merged_df = merged_df.drop(columns=['franchID', 'name'])

merged_df = pd.merge(merged_df, team_post_file, on=['tmID','year'], how = 'left')

In [215]:
player_count_per_team_year = merged_df.groupby(['year', 'tmID'])['playerID'].nunique().reset_index()
player_count_per_team_year.columns = ['Year', 'Team', 'PlayerCount']

# print(player_count_per_team_year)

# Calculate min, max, and average player count for each year
summary_stats = player_count_per_team_year.groupby('Year')['PlayerCount'].agg(['min', 'max', 'mean']).reset_index()
summary_stats.columns = ['Year', 'MinPlayerCount', 'MaxPlayerCount', 'AvgPlayerCount']

In [216]:
awards_coaches_file = df3.rename(columns={'playerID': 'coachID'})
coach_awards = awards_coaches_file[awards_coaches_file['award'] == 'Coach of the Year']
coach_awards_grouped = coach_awards.groupby(['coachID', 'year'])['award'].apply(list).reset_index()
coaches_file = pd.merge(coaches_file, coach_awards_grouped, on=['coachID', 'year'], how='left')

In [217]:
avg_oRebounds_by_pos = merged_df.groupby('pos')['oRebounds'].mean().reset_index()
avg_dRebounds_by_pos = merged_df.groupby('pos')['dRebounds'].mean().reset_index()

In [218]:
merged_df = merged_df.drop(columns=['rebounds', 'PostRebounds'])
merged_df = merged_df.rename(columns={'GP_x': 'GP_player', 'GP_y': 'GP_team'})

In [219]:
grouped = merged_df.groupby('year').agg({
    'o_pts': 'sum',
    'o_fga': 'sum',
    'o_oreb': 'sum',
    'o_to': 'sum',
    'o_fta': 'sum',
    'o_asts': 'sum',
    'o_fgm' : 'sum',
    'o_ftm': 'sum',
    'o_dreb':'sum',
}).reset_index()

grouped['VOP'] = grouped['o_pts'] / (grouped['o_fga'] - grouped['o_oreb'] + grouped['o_to'] + 0.44 * grouped['o_fta'])
grouped['factor'] = (2 / 3) - (0.5 * (grouped['o_asts'] / grouped['o_fgm'])) / (2 * (grouped['o_fgm'] / grouped['o_ftm']))
grouped['DRB%'] = (grouped['o_dreb'] - grouped['o_oreb']) / grouped['o_dreb']

uPER_df = merged_df.groupby(['playerID', 'year']).agg({
    'minutes': 'sum',     
    'threeMade': 'sum',   
    'assists': 'sum',     
    'fgMade': 'sum',      
    'ftMade': 'sum',      
    'turnovers': 'sum',   
    'fgAttempted': 'sum', 
    'ftAttempted': 'sum', 
    'dRebounds': 'sum',   
    'oRebounds': 'sum',   
    'steals': 'sum',      
    'blocks': 'sum',      
    'PF': 'sum'           
}).reset_index()

uPER_df = uPER_df.merge(grouped[['year', 'VOP', 'factor', 'DRB%']], on='year')

uPER_df['TRB'] = uPER_df['dRebounds'] + uPER_df['oRebounds']

uPER_df['uPER'] = (1 / uPER_df['minutes']) * (
    uPER_df['threeMade'] +
    (2/3) * uPER_df['assists'] +
    (2 - uPER_df['factor'] * (uPER_df['assists'] / uPER_df['fgMade'])) * uPER_df['fgMade'] +
    (uPER_df['ftMade'] * 0.5 * (1 + (1 - (uPER_df['assists'] / uPER_df['fgMade'])) + (2/3) * (uPER_df['assists'] / uPER_df['fgMade']))) -
    uPER_df['VOP'] * uPER_df['turnovers'] -
    uPER_df['VOP'] * uPER_df['DRB%'] * (uPER_df['fgAttempted'] - uPER_df['fgMade']) -
    uPER_df['VOP'] * 0.44 * (0.44 + (0.56 * uPER_df['DRB%'])) * (uPER_df['ftAttempted'] - uPER_df['ftMade']) +
    uPER_df['VOP'] * (1 - uPER_df['DRB%']) * uPER_df['TRB'] +
    uPER_df['VOP'] * uPER_df['DRB%'] * uPER_df['oRebounds'] +
    uPER_df['VOP'] * uPER_df['steals'] +
    uPER_df['VOP'] * uPER_df['DRB%'] * uPER_df['blocks'] -
    uPER_df['PF'] * ((grouped['o_ftm'].mean() / grouped['o_pts'].mean()) - 0.44 * (grouped['o_fta'].mean() / grouped['o_pts'].mean()) * uPER_df['VOP'])
)

lg_uPER = uPER_df.groupby('year')['uPER'].mean().reset_index()
lg_uPER.rename(columns={'uPER': 'lg_uPER'}, inplace=True)

uPER_df = uPER_df.merge(lg_uPER, on='year')

uPER_df['PER'] = uPER_df['uPER'] * (15 / uPER_df['lg_uPER'])


In [220]:
per_to_merge = uPER_df[['playerID', 'year', 'PER']]
merged_df = merged_df.merge(per_to_merge, on=['playerID', 'year'], how='left')

merged_df['TS%'] = (merged_df['points'] / (2 * (merged_df['fgAttempted'] + 0.44 * merged_df['ftAttempted'])))*100
merged_df['eFG%'] = ((merged_df['fgMade'] + 0.5 * merged_df['threeMade']) / merged_df['fgAttempted'])*100
merged_df['stocks'] = (merged_df['steals'] + merged_df['blocks'])

merged_df['PER'] = merged_df['PER'].fillna(0)
merged_df['TS%'] = merged_df['TS%'].fillna(0)
merged_df['eFG%'] = merged_df['eFG%'].fillna(0)
merged_df['stocks'] = merged_df['stocks'].fillna(0)

#Equipas que não foram aos playoffs
merged_df['W'] = merged_df['W'].fillna(0)
merged_df['L'] = merged_df['L'].fillna(0)

In [221]:
def box_plot_for_each_column(dataset):
    numeric_columns = dataset.select_dtypes(include='number')
    if numeric_columns.empty:
        print("No numeric columns found in the dataset.")
    else:
        numeric_columns.boxplot(figsize=(10, 6))
        plt.title("Boxplot for all numeric columns")
        plt.xticks(rotation=45)  # Rotation in x, if necessary
        plt.show()

def pearson_correlation(dataset, size_x, size_y):
    numeric_columns = dataset.select_dtypes(include='number')
    
    if numeric_columns.empty:
        print("Nenhuma coluna numérica encontrada no dataset.")
    else:
        # Correlation matrix
        correlation_matrix = numeric_columns.corr()

        # View
        plt.figure(figsize=(size_x, size_y))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
        plt.title('Pearson-correlation')
        plt.show()

def bar_chart_for_each_column(dataset):
    non_numeric_columns = dataset.select_dtypes(exclude='number')
    if non_numeric_columns.empty:
        print("Any non-numeric columns found in the dataset.")
    else:
        for column in non_numeric_columns.columns:
            value_counts = non_numeric_columns[column].value_counts()
            plt.figure(figsize=(10, 6))
            value_counts.plot(kind='bar')
            plt.title(f"Bar chart for '{column}'")
            plt.xlabel(column)
            plt.ylabel("Count")
            plt.xticks(rotation=45)
            plt.tight_layout()  # Adjust layout to prevent overlap
            plt.show()

# Pie-chart for each column
def pie_chart_for_each_column(dataset):
    non_numeric_columns = dataset.select_dtypes(exclude='number')
    
    if non_numeric_columns.empty:
        print("Any non-numeric columns found in the dataset.")
    else:
        for column in non_numeric_columns.columns:
            # Count elements from different categories
            category_counts = dataset[column].value_counts()
            
            # Pie-chart
            plt.figure(figsize=(6, 6))
            category_counts.plot.pie(autopct='%1.1f%%', startangle=140)
            plt.title(f'Distribution of {column}')
            plt.ylabel('')  # Remove o rótulo do eixo Y
            plt.show()


#box_plot_for_each_column(merged_df)
#box_plot_for_each_column(coaches_file)
#box_plot_for_each_column(series_post_file)



In [222]:
#pearson_correlation(merged_df, 100, 80)
#pearson_correlation(coaches_file, 8, 6)
#pearson_correlation(series_post_file, 8, 6)

In [223]:
#bar_chart_for_each_column(merged_df)
#bar_chart_for_each_column(coaches_file)
#bar_chart_for_each_column(series_post_file)

In [224]:
#pie_chart_for_each_column(merged_df)
#pie_chart_for_each_column(coaches_file)
#pie_chart_for_each_column(series_post_file)

In [225]:
def replaceGameResults(column):
    return column.apply(lambda value: '100' if value == 'W' else '010' if value == 'L' else '001')

# Aplicar a função para cada coluna específica
merged_df['firstRound'] = replaceGameResults(merged_df['firstRound'])
merged_df['semis'] = replaceGameResults(merged_df['semis'])
merged_df['finals'] = replaceGameResults(merged_df['finals'])


In [226]:
if not os.path.exists('cleanDatasets'):
    os.makedirs('cleanDatasets')

merged_df.to_csv('cleanDatasets/players_and_teams.csv', index=False)
coaches_file.to_csv('cleanDatasets/coaches_and_awards.csv', index=False)
series_post_file.to_csv('cleanDatasets/series_post.csv', index=False)

In [227]:
merged_df2 = merged_df.drop(columns=['minutes','points','threeMade','assists','fgMade','turnovers','fgAttempted','ftAttempted','oRebounds','steals','blocks','PF','o_ftm','o_pts','o_fta','o_pts','o_fga','o_oreb','o_to','o_asts','o_fgm','o_dreb'])
merged_df2 = merged_df2.drop(columns=['GP_player','GS','ftMade','threeAttempted','GP_team'])
merged_df2 = merged_df2.drop(columns=['o_3pm','o_3pa','o_reb','o_pf','o_stl','o_blk','d_fgm','d_fga','d_ftm','d_fta','d_3pm','d_3pa','d_oreb','d_dreb','d_reb','d_asts','d_pf','d_stl','d_to','d_blk','d_pts'])
merged_df2 = merged_df2.drop(columns=['PostGP','PostGS','PostMinutes','PostPoints','PostoRebounds','PostdRebounds','PostAssists','PostSteals','PostBlocks','PostTurnovers','PostPF','PostfgAttempted','PostfgMade','PostftAttempted','PostftMade','PostthreeAttempted','PostthreeMade','PostDQ'])
merged_df2 = merged_df2.drop(columns=['arena'])

merged_df2['birthDate'] = pd.to_datetime(merged_df['birthDate'], errors='coerce').dt.year
merged_df2 = merged_df2.rename(columns={'birthDate': 'birthYear'})

if not os.path.exists('cleanDatasets'):
    os.makedirs('cleanDatasets')

merged_df2.to_csv('cleanDatasets/advancedstatistics.csv', index=False)

In [228]:
players_stats_prevYear = merged_df2[['playerID','year','PER', 'eFG%', 'TS%','stocks','dRebounds']].drop_duplicates().copy() #TODO Acrescentar aqui mais variaveis

players_stats_prevYear['year'] = players_stats_prevYear['year'] + 1


players_stats_prevYear = players_stats_prevYear.merge(
    merged_df2[['playerID', 'year', 'tmID', 'playoff']], 
    on=['playerID', 'year'], 
    how='left')

players_stats_prevYear.to_csv('cleanDatasets/players_stats_prevYear.csv', index=False)


In [229]:
#.............................Fazer a media por equipa dos valores mas pegando apenas nos 7 melhores jogadores...............

# Ordenar os jogadores dentro de cada equipe e ano com base no PER (ou outra métrica)
players_stats_prevYear_sorted = players_stats_prevYear.sort_values(by=['tmID', 'year', 'PER'], ascending=[True, True, False])

# Selecionar os 7 melhores jogadores de cada equipe e ano
top_7_players = players_stats_prevYear_sorted.groupby(['tmID', 'year']).head(5)

# Agora, calcular a média de PER, TS%, e eFG% apenas para os 7 melhores jogadores
team_year_stats = top_7_players.groupby(['tmID', 'year', 'playoff'])[['PER', 'TS%', 'eFG%','stocks','dRebounds']].mean().reset_index() #TODO Acrescentar aqui mais variavies


# Salva o novo dataset em um arquivo CSV
team_year_stats.to_csv('cleanDatasets/team_year_stats.csv', index=False)

In [230]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Selecionar as colunas de interesse
features = team_year_stats[['PER', 'TS%', 'eFG%','stocks','dRebounds']] #TODO Acrescentar aqui tambem

# Normalizar os dados usando MinMaxScaler
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)

# Aplicar o PCA
pca = PCA(n_components=3) #TODO mudar aqui o numero de colunas a selecionar
pca.fit(features_scaled)

# Verificar as cargas (coeficientes) dos componentes principais
components = pca.components_

# Baseado nas cargas, você pode decidir as variáveis mais importantes
# Vamos mostrar a importância de cada variável nas componentes principais

# Calcular a soma das cargas absolutas para cada variável
importance = pd.DataFrame(abs(components), columns=['PER', 'TS%', 'eFG%','stocks','dRebounds'], index=['PC1', 'PC2','PC3']) #TODO mudar aqui as variaveis
#TODO meter tantos PC quanto variaves a selecionar
importance_sum = importance.sum(axis=0)

# Selecionar as duas variáveis mais importantes
most_important_features = importance_sum.sort_values(ascending=False).head(3) #TODO mudar aqui o numero de variaveis

# Exibir apenas os nomes das variáveis mais importantes
important_variable_names = most_important_features.index.tolist()

# Inicializar a lista de componentes a serem removidos
components_to_drop = ['PER', 'TS%', 'eFG%','stocks','dRebounds'] #TODO mudar aqui as variavies

# Remover as variáveis mais importantes da lista de componentes a serem removidos
components_to_drop = [col for col in components_to_drop if col not in important_variable_names]

team_year_stats=team_year_stats.drop(columns=components_to_drop)

# Exibir o resultado
print(team_year_stats)


    tmID  year  playoff        PER       eFG%  stocks
0    ATL     9      0.0  19.894077  50.207877    26.6
1    ATL    10      1.0  21.751629  51.579845    49.6
2    CHA     2      1.0  20.571617  47.843485    29.0
3    CHA     3      1.0  19.356376  47.204217    36.4
4    CHA     4      1.0  20.989740  46.980211    37.6
..   ...   ...      ...        ...        ...     ...
121  WAS     6      0.0  17.387332  46.823234    46.8
122  WAS     7      1.0  20.574946  51.330317    39.4
123  WAS     8      0.0  21.686321  55.716693    47.8
124  WAS     9      0.0  21.568455  45.065012    47.0
125  WAS    10      1.0  19.989199  51.201160    32.2

[126 rows x 6 columns]


In [231]:
#Dividir o dataset para treino, validacao e teste 

dataset_treino = team_year_stats[(team_year_stats['year'] >= 7) & (team_year_stats['year'] <= 9)]
dataset_teste = team_year_stats[team_year_stats['year'] == 10]

In [232]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import ElasticNet
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Features (X) e alvo (y)
X_treino = dataset_treino[important_variable_names] #Antigamente 'PER', 'TS%', 'eFG%'
y_treino = dataset_treino['playoff']

X_teste = dataset_teste[important_variable_names]
y_teste = dataset_teste['playoff']

In [233]:
# Inicializar o modelo de regressão
modelo = RandomForestRegressor(random_state=42)

# Treinar com os dados de treino
modelo.fit(X_treino, y_treino)

y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

print("Mean Squared Error (MSE) na validação:", mse)
print("R² score na validação:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head(15)

Mean Squared Error (MSE) na validação: 0.17463076923076923
R² score na validação: 0.2621850000000001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto
1,ATL,10,1.0,21.751629,51.579845,49.6,0.85
11,CHI,10,0.0,20.025257,46.367767,55.0,0.73
21,CON,10,0.0,22.530104,49.441427,35.0,0.23
30,DET,10,1.0,22.996364,44.634029,32.0,0.71
47,IND,10,1.0,20.581403,48.096441,56.8,0.6
56,LAS,10,1.0,24.575351,47.997875,78.4,0.89
67,MIN,10,0.0,23.213399,46.49535,64.0,0.77
76,NYL,10,0.0,21.074549,51.153114,38.0,0.52
87,PHO,10,1.0,23.627958,51.400423,45.2,0.91
98,SAC,10,0.0,20.71758,49.564481,37.8,0.31


In [235]:
# Inicializar o modelo de regressão linear
modelo = LinearRegression()

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head(15)

Mean Squared Error (MSE) no teste: 0.2991158585788188
R² score no teste: -0.2637645024955091


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto
1,ATL,10,1.0,21.751629,51.579845,49.6,0.724031
11,CHI,10,0.0,20.025257,46.367767,55.0,0.817805
21,CON,10,0.0,22.530104,49.441427,35.0,0.471213
30,DET,10,1.0,22.996364,44.634029,32.0,0.446601
47,IND,10,1.0,20.581403,48.096441,56.8,0.85495
56,LAS,10,1.0,24.575351,47.997875,78.4,1.348043
67,MIN,10,0.0,23.213399,46.49535,64.0,1.053017
76,NYL,10,0.0,21.074549,51.153114,38.0,0.490997
87,PHO,10,1.0,23.627958,51.400423,45.2,0.67852
98,SAC,10,0.0,20.71758,49.564481,37.8,0.487723


In [236]:
#::::::::::::::::::::Este modelo é uma variante da regressão linear que usa regularização L2 para reduzir overfitting.::::::::::::::::

# Inicializar o modelo de Ridge Regression
modelo = Ridge(alpha=1.0)  # alpha controla o nível de regularização

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head(15)

Mean Squared Error (MSE) no teste: 0.2990071894980566
R² score no teste: -0.2633053756292889


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto
1,ATL,10,1.0,21.751629,51.579845,49.6,0.724131
11,CHI,10,0.0,20.025257,46.367767,55.0,0.817828
21,CON,10,0.0,22.530104,49.441427,35.0,0.470975
30,DET,10,1.0,22.996364,44.634029,32.0,0.445919
47,IND,10,1.0,20.581403,48.096441,56.8,0.855013
56,LAS,10,1.0,24.575351,47.997875,78.4,1.347499
67,MIN,10,0.0,23.213399,46.49535,64.0,1.052543
76,NYL,10,0.0,21.074549,51.153114,38.0,0.49114
87,PHO,10,1.0,23.627958,51.400423,45.2,0.678276
98,SAC,10,0.0,20.71758,49.564481,37.8,0.487808


In [237]:
# Similar ao Ridge, mas utiliza regularização L1. Tende a eliminar variáveis menos importantes, útil para seleção de features.
# Inicializar o modelo de Lasso Regression
modelo = Lasso(alpha=0.1, random_state=42)  # alpha controla a força da regularização

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head(15)


Mean Squared Error (MSE) no teste: 0.2894247633941618
R² score no teste: -0.22281962534033317


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto
1,ATL,10,1.0,21.751629,51.579845,49.6,0.719801
11,CHI,10,0.0,20.025257,46.367767,55.0,0.825197
21,CON,10,0.0,22.530104,49.441427,35.0,0.43484
30,DET,10,1.0,22.996364,44.634029,32.0,0.376287
47,IND,10,1.0,20.581403,48.096441,56.8,0.860329
56,LAS,10,1.0,24.575351,47.997875,78.4,1.281915
67,MIN,10,0.0,23.213399,46.49535,64.0,1.000858
76,NYL,10,0.0,21.074549,51.153114,38.0,0.493394
87,PHO,10,1.0,23.627958,51.400423,45.2,0.633922
98,SAC,10,0.0,20.71758,49.564481,37.8,0.48949


In [238]:
# Combina as regularizações L1 (Lasso) e L2 (Ridge).
# Inicializar o modelo de Elastic Net
modelo = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
# `alpha` controla a força total da regularização.
# `l1_ratio` controla a proporção de regularização L1 (Lasso) em relação à L2 (Ridge).

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head(15)

Mean Squared Error (MSE) no teste: 0.2918713010270366
R² score no teste: -0.23315624683922942


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto
1,ATL,10,1.0,21.751629,51.579845,49.6,0.726309
11,CHI,10,0.0,20.025257,46.367767,55.0,0.824026
21,CON,10,0.0,22.530104,49.441427,35.0,0.444251
30,DET,10,1.0,22.996364,44.634029,32.0,0.3877
47,IND,10,1.0,20.581403,48.096441,56.8,0.861795
56,LAS,10,1.0,24.575351,47.997875,78.4,1.302718
67,MIN,10,0.0,23.213399,46.49535,64.0,1.014743
76,NYL,10,0.0,21.074549,51.153114,38.0,0.496258
87,PHO,10,1.0,23.627958,51.400423,45.2,0.648845
98,SAC,10,0.0,20.71758,49.564481,37.8,0.490706


In [239]:
# Inicializar o modelo de MLP
modelo = MLPRegressor(hidden_layer_sizes=(100, 50),  # Camadas ocultas com 100 e 50 neurônios
                      activation='relu',            # Função de ativação
                      solver='adam',                # Otimizador
                      max_iter=500,                 # Número máximo de iterações
                      random_state=42)

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head(15)


Mean Squared Error (MSE) no teste: 0.29491115884761465
R² score no teste: -0.24599964613117153


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto
1,ATL,10,1.0,21.751629,51.579845,49.6,0.486096
11,CHI,10,0.0,20.025257,46.367767,55.0,0.564844
21,CON,10,0.0,22.530104,49.441427,35.0,0.492281
30,DET,10,1.0,22.996364,44.634029,32.0,0.604798
47,IND,10,1.0,20.581403,48.096441,56.8,0.586536
56,LAS,10,1.0,24.575351,47.997875,78.4,0.937301
67,MIN,10,0.0,23.213399,46.49535,64.0,0.719645
76,NYL,10,0.0,21.074549,51.153114,38.0,0.364988
87,PHO,10,1.0,23.627958,51.400423,45.2,0.532346
98,SAC,10,0.0,20.71758,49.564481,37.8,0.384746


In [240]:
import numpy as np

# Inicializar o modelo sequencial
modelo = Sequential()

# Adicionar camadas
modelo.add(Dense(64, activation='relu', input_shape=(X_treino.shape[1],)))  # Primeira camada oculta
modelo.add(Dense(32, activation='relu'))  # Segunda camada oculta
modelo.add(Dense(1))  # Camada de saída (regressão)

# Compilar o modelo
modelo.compile(optimizer='adam', loss='mse')

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino, epochs=50, batch_size=32, verbose=1, validation_split=0.2)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste).flatten()  # Flatten para transformar em 1D

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head(15)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 908ms/step - loss: 36.5075 - val_loss: 18.7138
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 17.6412 - val_loss: 6.5827
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 5.9521 - val_loss: 1.1697
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.9093 - val_loss: 1.0204
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 1.0321 - val_loss: 3.7278
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 3.9182 - val_loss: 6.6547
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 6.9341 - val_loss: 8.1707
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 8.4709 - val_loss: 7.9440
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto
1,ATL,10,1.0,21.751629,51.579845,49.6,0.369827
11,CHI,10,0.0,20.025257,46.367767,55.0,0.347885
21,CON,10,0.0,22.530104,49.441427,35.0,0.066324
30,DET,10,1.0,22.996364,44.634029,32.0,-0.104109
47,IND,10,1.0,20.581403,48.096441,56.8,0.373462
56,LAS,10,1.0,24.575351,47.997875,78.4,0.63812
67,MIN,10,0.0,23.213399,46.49535,64.0,0.237579
76,NYL,10,0.0,21.074549,51.153114,38.0,0.262559
87,PHO,10,1.0,23.627958,51.400423,45.2,0.127816
98,SAC,10,0.0,20.71758,49.564481,37.8,0.234937


In [241]:
# Inicializar o modelo de regressão com Extra Trees
modelo = ExtraTreesRegressor(random_state=42)

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head(15)

Mean Squared Error (MSE) no teste: 0.22907692307692307
R² score no teste: 0.032150000000000234


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto
1,ATL,10,1.0,21.751629,51.579845,49.6,0.91
11,CHI,10,0.0,20.025257,46.367767,55.0,0.69
21,CON,10,0.0,22.530104,49.441427,35.0,0.22
30,DET,10,1.0,22.996364,44.634029,32.0,0.86
47,IND,10,1.0,20.581403,48.096441,56.8,0.45
56,LAS,10,1.0,24.575351,47.997875,78.4,0.94
67,MIN,10,0.0,23.213399,46.49535,64.0,0.89
76,NYL,10,0.0,21.074549,51.153114,38.0,0.77
87,PHO,10,1.0,23.627958,51.400423,45.2,0.86
98,SAC,10,0.0,20.71758,49.564481,37.8,0.45


In [242]:
# Inicializar o modelo ElasticNet
modelo = ElasticNet(random_state=42)

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
print(dataset_teste.head(15))


Mean Squared Error (MSE) no teste: 0.2648523573353825
R² score no teste: -0.1190012097419908
    tmID  year  playoff        PER       eFG%  stocks  playoff_previsto
1    ATL    10      1.0  21.751629  51.579845    49.6          0.689283
11   CHI    10      0.0  20.025257  46.367767    55.0          0.770754
21   CON    10      0.0  22.530104  49.441427    35.0          0.469011
30   DET    10      1.0  22.996364  44.634029    32.0          0.423749
47   IND    10      1.0  20.581403  48.096441    56.8          0.797911
56   LAS    10      1.0  24.575351  47.997875    78.4          1.123794
67   MIN    10      0.0  23.213399  46.495350    64.0          0.906539
76   NYL    10      0.0  21.074549  51.153114    38.0          0.514272
87   PHO    10      1.0  23.627958  51.400423    45.2          0.622900
98   SAC    10      0.0  20.717580  49.564481    37.8          0.511255
105  SAS    10      1.0  19.340974  51.177129    47.4          0.656092
114  SEA    10      1.0  19.478406  47.9130

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


In [243]:
# Inicializar o modelo XGBoost Regressor
modelo = xgb.XGBRegressor(random_state=42)

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head(15)


Mean Squared Error (MSE) no teste: 0.17809230151985936
R² score no teste: 0.2475600260785944


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto
1,ATL,10,1.0,21.751629,51.579845,49.6,1.003988
11,CHI,10,0.0,20.025257,46.367767,55.0,0.549502
21,CON,10,0.0,22.530104,49.441427,35.0,0.218301
30,DET,10,1.0,22.996364,44.634029,32.0,0.832316
47,IND,10,1.0,20.581403,48.096441,56.8,0.557558
56,LAS,10,1.0,24.575351,47.997875,78.4,0.980005
67,MIN,10,0.0,23.213399,46.49535,64.0,0.793431
76,NYL,10,0.0,21.074549,51.153114,38.0,0.416923
87,PHO,10,1.0,23.627958,51.400423,45.2,0.985964
98,SAC,10,0.0,20.71758,49.564481,37.8,0.277197


In [244]:
# Inicializar o modelo LightGBM Regressor
modelo = lgb.LGBMRegressor(random_state=42)

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head(15)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 2
[LightGBM] [Info] Start training from score 0.585366
Mean Squared Error (MSE) no teste: 0.24088886985283134
R² score no teste: -0.017755475128212073


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto
1,ATL,10,1.0,21.751629,51.579845,49.6,0.676715
11,CHI,10,0.0,20.025257,46.367767,55.0,0.498367
21,CON,10,0.0,22.530104,49.441427,35.0,0.676715
30,DET,10,1.0,22.996364,44.634029,32.0,0.587541
47,IND,10,1.0,20.581403,48.096441,56.8,0.587541
56,LAS,10,1.0,24.575351,47.997875,78.4,0.676715
67,MIN,10,0.0,23.213399,46.49535,64.0,0.587541
76,NYL,10,0.0,21.074549,51.153114,38.0,0.676715
87,PHO,10,1.0,23.627958,51.400423,45.2,0.676715
98,SAC,10,0.0,20.71758,49.564481,37.8,0.676715


In [252]:
# Inicializar o modelo de regressão com Gradient Boosting
modelo = GradientBoostingRegressor(random_state=42) #TODO porque foi o melhor modelo até agora

# Treinar o modelo com os dados de treino
modelo.fit(X_treino, y_treino)

# Fazer previsões no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_teste, y_pred_teste)
r2 = r2_score(y_teste, y_pred_teste)

# Exibir as métricas
print("Mean Squared Error (MSE) no teste:", mse)
print("R² score no teste:", r2)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Critério de correção
# 1. Quando 'playoff' é 1, 'playoff_previsto' deve ser > 0.5
# 2. Quando 'playoff' é 0, 'playoff_previsto' deve ser <= 0.5

# Criar uma coluna indicando se a previsão está correta
dataset_teste['correto'] = ((dataset_teste['playoff'] == 1) & (dataset_teste['playoff_previsto'] > 0.5)) | \
                           ((dataset_teste['playoff'] == 0) & (dataset_teste['playoff_previsto'] <= 0.5))

# Contar o número de previsões corretas
corretos = dataset_teste['correto'].sum()

# Número total de exemplos
total = len(dataset_teste)

# Exibir o resultado
print(f"Previsões corretas: {corretos} de {total} ({(corretos / total) * 100:.2f}%)")


# Exibir as primeiras linhas para verificar
dataset_teste.head(15)


Mean Squared Error (MSE) no teste: 0.16150240458566234
R² score no teste: 0.3176523406255768
Previsões corretas: 11 de 13 (84.62%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['correto'] = ((dataset_teste['playoff'] == 1) & (dataset_teste['playoff_previsto'] > 0.5)) | \


Unnamed: 0,tmID,year,playoff,PER,eFG%,stocks,playoff_previsto,correto
1,ATL,10,1.0,21.751629,51.579845,49.6,0.968918,True
11,CHI,10,0.0,20.025257,46.367767,55.0,0.823756,False
21,CON,10,0.0,22.530104,49.441427,35.0,0.29249,True
30,DET,10,1.0,22.996364,44.634029,32.0,1.051501,True
47,IND,10,1.0,20.581403,48.096441,56.8,0.626726,True
56,LAS,10,1.0,24.575351,47.997875,78.4,0.895035,True
67,MIN,10,0.0,23.213399,46.49535,64.0,0.775672,False
76,NYL,10,0.0,21.074549,51.153114,38.0,0.457772,True
87,PHO,10,1.0,23.627958,51.400423,45.2,0.949047,True
98,SAC,10,0.0,20.71758,49.564481,37.8,0.424344,True
