In [99]:
import os
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df1 = pd.read_csv('datasets/players.csv')
df2 = pd.read_csv('datasets/players_teams.csv')
df3 = pd.read_csv('datasets/awards_players.csv')
df4 = pd.read_csv('datasets/teams.csv')
df5 = pd.read_csv('datasets/teams_post.csv')
df6 = pd.read_csv('datasets/coaches.csv')
df7 = pd.read_csv('datasets/series_post.csv')

def corrige_vencedor(teams, series_post):
    # Itera sobre cada rodada ('F', 'CF', 'FR') para ajustar cada fase dos playoffs
    for round_type in ['FR', 'CF', 'F']:
        # Filtra a série específica da rodada
        series_round = series_post[series_post['round'] == round_type]
        
        # Atualiza cada série individualmente
        for _, row in series_round.iterrows():
            year = row['year']
            winner_id = row['tmIDWinner']
            loser_id = row['tmIDLoser']
            
            # Define as colunas que correspondem às rodadas
            if round_type == 'FR':
                round_column = 'firstRound'
            elif round_type == 'CF':
                round_column = 'semis'
            elif round_type == 'F':
                round_column = 'finals'
            
            # Marca o time vencedor como "W" na rodada correspondente
            teams.loc[(teams['year'] == year) & (teams['tmID'] == winner_id), round_column] = 'W'
            
            # Marca o time perdedor como "L" na rodada correspondente
            teams.loc[(teams['year'] == year) & (teams['tmID'] == loser_id), round_column] = 'L'
    
    return teams

teams_file = corrige_vencedor(df4, df7)

players_teams_file = df2.drop(columns=['lgID'])
players_file = df1[df1['pos'].notna() & (df1['pos'] != '')]
players_file = players_file.drop(columns=['firstseason', 'lastseason', 'deathDate', 'collegeOther'])
players_file['college'] = players_file['college'].apply(lambda x: 1 if pd.notnull(x) else 0)
merged_df = pd.merge(players_teams_file, players_file, left_on='playerID', right_on='bioID', how='left')
merged_df = merged_df.drop(columns=['bioID'])
awards_players_file = df3.drop(columns=['lgID'])
teams_file = df4.drop(columns=['lgID', 'divID', 'tmORB','tmDRB','tmTRB','opptmORB','opptmDRB','opptmTRB','seeded'])
teams_file['playoff'] = teams_file['playoff'].apply(lambda x: 1 if x=='Y' else 0)

team_post_file = df5.drop(columns=['lgID'])
series_post_file = df7.drop(columns=['lgIDWinner', 'lgIDLoser'])
coaches_file = df6.drop(columns=['lgID'])


awards_grouped = awards_players_file.groupby(['playerID', 'year'])['award'].apply(list).reset_index()
awards_grouped['award'] = awards_grouped['award'].apply(lambda x: x if isinstance(x, list) else [])

merged_df = pd.merge(merged_df, awards_grouped, on=['playerID', 'year'], how='left')
merged_df['award'] = merged_df['award'].apply(lambda x: x if isinstance(x, list) else [])
merged_df = pd.merge(merged_df, teams_file, on=['tmID','year'], how = 'left')

merged_df = merged_df.drop(columns=['franchID', 'name'])

merged_df = pd.merge(merged_df, team_post_file, on=['tmID','year'], how = 'left')

print(merged_df)
merged_df.head()

        playerID  year  stint tmID  GP_x  GS  minutes  points  oRebounds  \
0     abrossv01w     2      0  MIN    26  23      846     343         43   
1     abrossv01w     3      0  MIN    27  27      805     314         45   
2     abrossv01w     4      0  MIN    30  25      792     318         44   
3     abrossv01w     5      0  MIN    22  11      462     146         17   
4     abrossv01w     6      0  MIN    31  31      777     304         29   
...          ...   ...    ...  ...   ...  ..      ...     ...        ...   
1871  zakalok01w     3      2  PHO     5   0       37       6          0   
1872   zarafr01w     6      0  SEA    34   4      413      90         11   
1873  zellosh01w    10      0  DET    34   4      802     406         25   
1874  zirkozu01w     4      0  WAS     6   0       30      11          0   
1875   zollsh01w     9      0  MIN     6   0       30      10          1   

      dRebounds  ...  homeL  awayW  awayL  confW  confL   min  attend  \
0           13

Unnamed: 0,playerID,year,stint,tmID,GP_x,GS,minutes,points,oRebounds,dRebounds,...,homeL,awayW,awayL,confW,confL,min,attend,arena,W,L
0,abrossv01w,2,0,MIN,26,23,846,343,43,131,...,10,6,10,9,12,6475,120607,Target Center,,
1,abrossv01w,3,0,MIN,27,27,805,314,45,101,...,9,3,13,6,15,6475,139874,Target Center,,
2,abrossv01w,4,0,MIN,30,25,792,318,44,97,...,6,7,10,14,10,6850,120253,Target Center,1.0,2.0
3,abrossv01w,5,0,MIN,22,11,462,146,17,57,...,6,7,10,12,10,6850,125097,Target Center,0.0,2.0
4,abrossv01w,6,0,MIN,31,31,777,304,29,78,...,6,3,14,9,13,6850,113447,Target Center,,


In [100]:
player_count_per_team_year = merged_df.groupby(['year', 'tmID'])['playerID'].nunique().reset_index()
player_count_per_team_year.columns = ['Year', 'Team', 'PlayerCount']

# print(player_count_per_team_year)

# Calculate min, max, and average player count for each year
summary_stats = player_count_per_team_year.groupby('Year')['PlayerCount'].agg(['min', 'max', 'mean']).reset_index()
summary_stats.columns = ['Year', 'MinPlayerCount', 'MaxPlayerCount', 'AvgPlayerCount']

print(summary_stats)

   Year  MinPlayerCount  MaxPlayerCount  AvgPlayerCount
0     1              11              15       13.000000
1     2              11              18       13.000000
2     3              11              15       13.562500
3     4              11              21       12.857143
4     5              11              15       12.846154
5     6              12              16       13.692308
6     7              10              16       12.857143
7     8              10              15       13.307692
8     9              12              19       14.285714
9    10               9              18       12.692308


In [101]:
awards_coaches_file = df3.rename(columns={'playerID': 'coachID'})
coach_awards = awards_coaches_file[awards_coaches_file['award'] == 'Coach of the Year']
coach_awards_grouped = coach_awards.groupby(['coachID', 'year'])['award'].apply(list).reset_index()
coaches_file = pd.merge(coaches_file, coach_awards_grouped, on=['coachID', 'year'], how='left')

print(coaches_file)
coaches_file.head(50)

        coachID  year tmID  stint  won  lost  post_wins  post_losses award
0    adamsmi01w     5  WAS      0   17    17          1            2   NaN
1    adubari99w     1  NYL      0   20    12          4            3   NaN
2    adubari99w     2  NYL      0   21    11          3            3   NaN
3    adubari99w     3  NYL      0   18    14          4            4   NaN
4    adubari99w     4  NYL      0   16    18          0            0   NaN
..          ...   ...  ...    ...  ...   ...        ...          ...   ...
157  wintebr01w     6  IND      0   21    13          2            2   NaN
158  wintebr01w     7  IND      0   21    13          0            2   NaN
159  wintebr01w     8  IND      0   21    13          3            3   NaN
160  zierddo99w     8  MIN      0   10    24          0            0   NaN
161  zierddo99w     9  MIN      0   16    18          0            0   NaN

[162 rows x 9 columns]


Unnamed: 0,coachID,year,tmID,stint,won,lost,post_wins,post_losses,award
0,adamsmi01w,5,WAS,0,17,17,1,2,
1,adubari99w,1,NYL,0,20,12,4,3,
2,adubari99w,2,NYL,0,21,11,3,3,
3,adubari99w,3,NYL,0,18,14,4,4,
4,adubari99w,4,NYL,0,16,18,0,0,
5,adubari99w,5,NYL,1,7,9,0,0,
6,adubari99w,6,WAS,0,16,18,0,0,
7,adubari99w,7,WAS,0,18,16,0,2,
8,adubari99w,8,WAS,1,0,4,0,0,
9,aglerbr99w,1,MIN,0,15,17,0,0,


In [102]:
avg_oRebounds_by_pos = merged_df.groupby('pos')['oRebounds'].mean().reset_index()
avg_dRebounds_by_pos = merged_df.groupby('pos')['dRebounds'].mean().reset_index()

print('--------------------')
print(avg_oRebounds_by_pos)
print('--------------------')
print(avg_dRebounds_by_pos)

--------------------
   pos  oRebounds
0    C  27.624031
1  C-F  46.208333
2    F  30.177606
3  F-C  39.971751
4  F-G  23.400000
5    G  13.301973
6  G-F  22.674699
--------------------
   pos  dRebounds
0    C  59.895349
1  C-F  68.395833
2    F  62.218147
3  F-C  73.440678
4  F-G  71.220000
5    G  38.188164
6  G-F  55.668675


In [103]:
merged_df = merged_df.drop(columns=['rebounds', 'PostRebounds'])
merged_df = merged_df.rename(columns={'GP_x': 'GP_player', 'GP_y': 'GP_team'})

In [104]:
grouped = merged_df.groupby('year').agg({
    'o_pts': 'sum',
    'o_fga': 'sum',
    'o_oreb': 'sum',
    'o_to': 'sum',
    'o_fta': 'sum',
    'o_asts': 'sum',
    'o_fgm' : 'sum',
    'o_ftm': 'sum',
    'o_dreb':'sum',
}).reset_index()

grouped['VOP'] = grouped['o_pts'] / (grouped['o_fga'] - grouped['o_oreb'] + grouped['o_to'] + 0.44 * grouped['o_fta'])
grouped['factor'] = (2 / 3) - (0.5 * (grouped['o_asts'] / grouped['o_fgm'])) / (2 * (grouped['o_fgm'] / grouped['o_ftm']))
grouped['DRB%'] = (grouped['o_dreb'] - grouped['o_oreb']) / grouped['o_dreb']

uPER_df = merged_df.groupby(['playerID', 'year']).agg({
    'minutes': 'sum',     
    'threeMade': 'sum',   
    'assists': 'sum',     
    'fgMade': 'sum',      
    'ftMade': 'sum',      
    'turnovers': 'sum',   
    'fgAttempted': 'sum', 
    'ftAttempted': 'sum', 
    'dRebounds': 'sum',   
    'oRebounds': 'sum',   
    'steals': 'sum',      
    'blocks': 'sum',      
    'PF': 'sum'           
}).reset_index()

uPER_df = uPER_df.merge(grouped[['year', 'VOP', 'factor', 'DRB%']], on='year')

uPER_df['TRB'] = uPER_df['dRebounds'] + uPER_df['oRebounds']

uPER_df['uPER'] = (1 / uPER_df['minutes']) * (
    uPER_df['threeMade'] +
    (2/3) * uPER_df['assists'] +
    (2 - uPER_df['factor'] * (uPER_df['assists'] / uPER_df['fgMade'])) * uPER_df['fgMade'] +
    (uPER_df['ftMade'] * 0.5 * (1 + (1 - (uPER_df['assists'] / uPER_df['fgMade'])) + (2/3) * (uPER_df['assists'] / uPER_df['fgMade']))) -
    uPER_df['VOP'] * uPER_df['turnovers'] -
    uPER_df['VOP'] * uPER_df['DRB%'] * (uPER_df['fgAttempted'] - uPER_df['fgMade']) -
    uPER_df['VOP'] * 0.44 * (0.44 + (0.56 * uPER_df['DRB%'])) * (uPER_df['ftAttempted'] - uPER_df['ftMade']) +
    uPER_df['VOP'] * (1 - uPER_df['DRB%']) * uPER_df['TRB'] +
    uPER_df['VOP'] * uPER_df['DRB%'] * uPER_df['oRebounds'] +
    uPER_df['VOP'] * uPER_df['steals'] +
    uPER_df['VOP'] * uPER_df['DRB%'] * uPER_df['blocks'] -
    uPER_df['PF'] * ((grouped['o_ftm'].mean() / grouped['o_pts'].mean()) - 0.44 * (grouped['o_fta'].mean() / grouped['o_pts'].mean()) * uPER_df['VOP'])
)

lg_uPER = uPER_df.groupby('year')['uPER'].mean().reset_index()
lg_uPER.rename(columns={'uPER': 'lg_uPER'}, inplace=True)

uPER_df = uPER_df.merge(lg_uPER, on='year')

uPER_df['PER'] = uPER_df['uPER'] * (15 / uPER_df['lg_uPER'])

print(uPER_df[['playerID', 'year', 'uPER', 'PER']])


        playerID  year      uPER        PER
0     abrossv01w     2  0.349958  19.293262
1     abrossv01w     3  0.295415  15.898093
2     abrossv01w     4  0.328919  18.485960
3     abrossv01w     5  0.258967  14.545585
4     abrossv01w     6  0.303834  18.086335
...          ...   ...       ...        ...
1800  zakalok01w     3  0.039552   2.128553
1801   zarafr01w     6  0.142371   8.474937
1802  zellosh01w    10  0.370957  17.561618
1803  zirkozu01w     4  0.277055  15.571097
1804   zollsh01w     9  0.174661   8.814235

[1805 rows x 4 columns]


In [105]:
per_to_merge = uPER_df[['playerID', 'year', 'PER']]
merged_df = merged_df.merge(per_to_merge, on=['playerID', 'year'], how='left')

merged_df['TS%'] = (merged_df['points'] / (2 * (merged_df['fgAttempted'] + 0.44 * merged_df['ftAttempted'])))*100
merged_df['eFG%'] = ((merged_df['fgMade'] + 0.5 * merged_df['threeMade']) / merged_df['fgAttempted'])*100
merged_df['stocks'] = (merged_df['steals'] + merged_df['blocks'])

merged_df['PER'] = merged_df['PER'].fillna(0)
merged_df['TS%'] = merged_df['TS%'].fillna(0)
merged_df['eFG%'] = merged_df['eFG%'].fillna(0)
merged_df['stocks'] = merged_df['stocks'].fillna(0)

#Equipas que não foram aos playoffs
merged_df['W'] = merged_df['W'].fillna(0)
merged_df['L'] = merged_df['L'].fillna(0)


merged_df.head()

Unnamed: 0,playerID,year,stint,tmID,GP_player,GS,minutes,points,oRebounds,dRebounds,...,confL,min,attend,arena,W,L,PER,TS%,eFG%,stocks
0,abrossv01w,2,0,MIN,26,23,846,343,43,131,...,12,6475,120607,Target Center,0.0,0.0,19.293262,48.849265,42.150171,51
1,abrossv01w,3,0,MIN,27,27,805,314,45,101,...,15,6475,139874,Target Center,0.0,0.0,15.898093,42.774629,40.822785,52
2,abrossv01w,4,0,MIN,30,25,792,318,44,97,...,10,6850,120253,Target Center,1.0,2.0,18.48596,48.457881,43.684211,55
3,abrossv01w,5,0,MIN,22,11,462,146,17,57,...,10,6850,125097,Target Center,0.0,2.0,14.545585,45.842753,42.446043,32
4,abrossv01w,6,0,MIN,31,31,777,304,29,78,...,13,6850,113447,Target Center,0.0,0.0,18.086335,49.331429,45.471014,54


In [106]:
def box_plot_for_each_column(dataset):
    numeric_columns = dataset.select_dtypes(include='number')
    if numeric_columns.empty:
        print("No numeric columns found in the dataset.")
    else:
        numeric_columns.boxplot(figsize=(10, 6))
        plt.title("Boxplot for all numeric columns")
        plt.xticks(rotation=45)  # Rotation in x, if necessary
        plt.show()

def pearson_correlation(dataset, size_x, size_y):
    numeric_columns = dataset.select_dtypes(include='number')
    
    if numeric_columns.empty:
        print("Nenhuma coluna numérica encontrada no dataset.")
    else:
        # Correlation matrix
        correlation_matrix = numeric_columns.corr()

        # View
        plt.figure(figsize=(size_x, size_y))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
        plt.title('Pearson-correlation')
        plt.show()

def bar_chart_for_each_column(dataset):
    non_numeric_columns = dataset.select_dtypes(exclude='number')
    if non_numeric_columns.empty:
        print("Any non-numeric columns found in the dataset.")
    else:
        for column in non_numeric_columns.columns:
            value_counts = non_numeric_columns[column].value_counts()
            plt.figure(figsize=(10, 6))
            value_counts.plot(kind='bar')
            plt.title(f"Bar chart for '{column}'")
            plt.xlabel(column)
            plt.ylabel("Count")
            plt.xticks(rotation=45)
            plt.tight_layout()  # Adjust layout to prevent overlap
            plt.show()

# Pie-chart for each column
def pie_chart_for_each_column(dataset):
    non_numeric_columns = dataset.select_dtypes(exclude='number')
    
    if non_numeric_columns.empty:
        print("Any non-numeric columns found in the dataset.")
    else:
        for column in non_numeric_columns.columns:
            # Count elements from different categories
            category_counts = dataset[column].value_counts()
            
            # Pie-chart
            plt.figure(figsize=(6, 6))
            category_counts.plot.pie(autopct='%1.1f%%', startangle=140)
            plt.title(f'Distribution of {column}')
            plt.ylabel('')  # Remove o rótulo do eixo Y
            plt.show()


#box_plot_for_each_column(merged_df)
#box_plot_for_each_column(coaches_file)
#box_plot_for_each_column(series_post_file)



In [107]:
#pearson_correlation(merged_df, 100, 80)
#pearson_correlation(coaches_file, 8, 6)
#pearson_correlation(series_post_file, 8, 6)

In [108]:
#bar_chart_for_each_column(merged_df)
#bar_chart_for_each_column(coaches_file)
#bar_chart_for_each_column(series_post_file)

In [109]:
#pie_chart_for_each_column(merged_df)
#pie_chart_for_each_column(coaches_file)
#pie_chart_for_each_column(series_post_file)

In [110]:
def replaceGameResults(column):
    return column.apply(lambda value: '100' if value == 'W' else '010' if value == 'L' else '001')

# Aplicar a função para cada coluna específica
merged_df['firstRound'] = replaceGameResults(merged_df['firstRound'])
merged_df['semis'] = replaceGameResults(merged_df['semis'])
merged_df['finals'] = replaceGameResults(merged_df['finals'])


In [111]:
if not os.path.exists('cleanDatasets'):
    os.makedirs('cleanDatasets')

merged_df.to_csv('cleanDatasets/players_and_teams.csv', index=False)
coaches_file.to_csv('cleanDatasets/coaches_and_awards.csv', index=False)
series_post_file.to_csv('cleanDatasets/series_post.csv', index=False)

In [112]:
merged_df2 = merged_df.drop(columns=['minutes','points','threeMade','assists','fgMade','turnovers','fgAttempted','ftAttempted','oRebounds','steals','blocks','PF','o_ftm','o_pts','o_fta','o_pts','o_fga','o_oreb','o_to','o_asts','o_fgm','o_dreb'])
merged_df2 = merged_df2.drop(columns=['GP_player','GS','ftMade','threeAttempted','GP_team'])
merged_df2 = merged_df2.drop(columns=['o_3pm','o_3pa','o_reb','o_pf','o_stl','o_blk','d_fgm','d_fga','d_ftm','d_fta','d_3pm','d_3pa','d_oreb','d_dreb','d_reb','d_asts','d_pf','d_stl','d_to','d_blk','d_pts'])
merged_df2 = merged_df2.drop(columns=['PostGP','PostGS','PostMinutes','PostPoints','PostoRebounds','PostdRebounds','PostAssists','PostSteals','PostBlocks','PostTurnovers','PostPF','PostfgAttempted','PostfgMade','PostftAttempted','PostftMade','PostthreeAttempted','PostthreeMade','PostDQ'])
merged_df2 = merged_df2.drop(columns=['arena'])

merged_df2['birthDate'] = pd.to_datetime(merged_df['birthDate'], errors='coerce').dt.year
merged_df2 = merged_df2.rename(columns={'birthDate': 'birthYear'})

if not os.path.exists('cleanDatasets'):
    os.makedirs('cleanDatasets')

merged_df2.to_csv('cleanDatasets/advancedstatistics.csv', index=False)

In [113]:
players_stats_prevYear = merged_df2[['playerID','year','PER', 'eFG%', 'TS%','stocks','dRebounds']].drop_duplicates().copy()
players_stats_prevYear['year'] = players_stats_prevYear['year'] + 1


players_stats_prevYear = players_stats_prevYear.merge(
    merged_df2[['playerID', 'year', 'tmID', 'playoff']], 
    on=['playerID', 'year'], 
    how='left')

players_stats_prevYear.to_csv('cleanDatasets/players_stats_prevYear.csv', index=False)

players_stats_prevYear.head(15)


Unnamed: 0,playerID,year,PER,eFG%,TS%,stocks,dRebounds,tmID,playoff
0,abrossv01w,3,19.293262,42.150171,48.849265,51,131,MIN,0.0
1,abrossv01w,4,15.898093,40.822785,42.774629,52,101,MIN,1.0
2,abrossv01w,5,18.48596,43.684211,48.457881,55,97,MIN,1.0
3,abrossv01w,6,14.545585,42.446043,45.842753,32,57,MIN,0.0
4,abrossv01w,7,18.086335,45.471014,49.331429,54,78,MIN,0.0
5,abrossv01w,8,15.26977,46.058091,49.015953,35,62,MIN,0.0
6,abrossv01w,9,20.282752,51.845638,54.43014,48,97,CON,1.0
7,abrossv01w,10,11.964576,33.333333,41.182171,8,17,,
8,adamsjo01w,5,18.415523,46.969697,48.701299,5,13,,
9,aguilel01w,4,11.934379,59.090909,59.5898,5,11,,


In [114]:
# Calcula a média de PER, TS%, e eFG% por equipe e ano
team_year_stats = players_stats_prevYear.groupby(['tmID', 'year','playoff'])[['PER', 'TS%', 'eFG%', 'stocks', 'dRebounds']].mean().reset_index()

# Salva o novo dataset em um arquivo CSV
team_year_stats.to_csv('cleanDatasets/team_year_stats.csv', index=False)

# Visualizar os primeiros dados para verificar o resultado
team_year_stats.head(15)


Unnamed: 0,tmID,year,playoff,PER,TS%,eFG%,stocks,dRebounds
0,ATL,9,0.0,14.855192,48.797175,45.746779,21.5,44.416667
1,ATL,10,1.0,17.505053,49.771367,45.945109,41.777778,76.0
2,CHA,2,1.0,16.23616,51.201663,45.908824,32.0,54.111111
3,CHA,3,1.0,13.585956,47.676058,40.657804,26.083333,48.833333
4,CHA,4,1.0,16.781,51.888158,46.154392,34.333333,62.666667
5,CHA,5,0.0,15.772778,49.524802,45.034801,38.375,60.875
6,CHA,6,0.0,14.139178,45.994844,40.331706,35.384615,61.846154
7,CHA,7,0.0,14.72958,46.097397,40.573712,33.444444,55.555556
8,CHI,7,0.0,13.484013,47.462532,44.335021,25.545455,41.454545
9,CHI,8,0.0,13.057702,45.50821,40.08393,29.5,55.5


In [115]:
#Dividir o dataset para treino, validacao e teste 

dataset_treino = team_year_stats[(team_year_stats['year'] >= 1) & (team_year_stats['year'] <= 7)]
dataset_validacao = team_year_stats[team_year_stats['year'].isin([8, 9])]
dataset_teste = team_year_stats[team_year_stats['year'] == 10]

dataset_teste.head(100)

Unnamed: 0,tmID,year,playoff,PER,TS%,eFG%,stocks,dRebounds
1,ATL,10,1.0,17.505053,49.771367,45.945109,41.777778,76.0
11,CHI,10,0.0,14.516353,48.523196,44.406606,40.3,77.6
21,CON,10,0.0,18.172924,52.193001,47.665868,33.3,81.5
30,DET,10,1.0,17.456611,50.714573,44.487758,35.2,73.8
47,IND,10,1.0,14.525819,45.217894,40.307279,35.857143,61.357143
56,LAS,10,1.0,19.077357,50.227263,45.67793,58.0,117.777778
67,MIN,10,0.0,20.033617,52.258593,47.32191,38.8,67.8
76,NYL,10,0.0,16.864376,51.725,47.477969,36.8,71.4
87,PHO,10,1.0,17.393405,49.871563,45.483028,33.2,68.6
98,SAC,10,0.0,15.981555,52.039114,46.109392,36.909091,62.0


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Features (X) e alvo (y)
X_treino = dataset_treino[['PER', 'TS%', 'eFG%','stocks', 'dRebounds']]
y_treino = dataset_treino['playoff']

X_validacao = dataset_validacao[['PER', 'TS%', 'eFG%','stocks', 'dRebounds']]
y_validacao = dataset_validacao['playoff']

X_teste = dataset_teste[['PER', 'TS%', 'eFG%','stocks', 'dRebounds']]

# Inicializar o modelo de regressão
modelo = RandomForestRegressor(random_state=42)

# Treinar com os dados de treino
modelo.fit(X_treino, y_treino)

# Avaliar o modelo com métricas de regressão
mse = mean_squared_error(y_validacao, y_pred_validacao)
r2 = r2_score(y_validacao, y_pred_validacao)

print("Mean Squared Error (MSE) na validação:", mse)
print("R² score na validação:", r2)

# Prever os valores de y no conjunto de teste
y_pred_teste = modelo.predict(X_teste)

# Adicionar a coluna prevista ao dataset_teste
dataset_teste['playoff_previsto'] = y_pred_teste

# Exibir as primeiras linhas para verificar
dataset_teste.head())


Mean Squared Error (MSE) na validação: 0.19112592592592592
R² score na validação: 0.2083477272727271
   tmID  year  playoff        PER        TS%       eFG%     stocks  dRebounds  \
1   ATL    10      1.0  17.505053  49.771367  45.945109  41.777778  76.000000   
11  CHI    10      0.0  14.516353  48.523196  44.406606  40.300000  77.600000   
21  CON    10      0.0  18.172924  52.193001  47.665868  33.300000  81.500000   
30  DET    10      1.0  17.456611  50.714573  44.487758  35.200000  73.800000   
47  IND    10      1.0  14.525819  45.217894  40.307279  35.857143  61.357143   

    playoff_previsto  
1               0.99  
11              0.94  
21              0.79  
30              0.74  
47              0.32  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_teste['playoff_previsto'] = y_pred_teste
