In [2]:
import pandas as pd
import numpy as np



# Load the All-Star player table. The CSV path is relative, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="AllStar_Dataset.csv")

In [3]:
# Quick data-quality overview, printed one after another:
#   1. summary statistics from describe()
#   2. count of missing values per column
#   3. dtype of every column
print(df.describe(), df.isna().sum(), df.dtypes, sep="\n")

       games_played         mp        ppg        rpg        apg        spg  \
count     24.000000  24.000000  24.000000  24.000000  24.000000  24.000000   
mean      70.708333  34.591667  25.329167   6.633333   6.091667   1.200000   
std        6.937480   1.942861   3.486555   2.736813   2.224550   0.320326   
min       54.000000  30.000000  19.300000   2.800000   3.000000   0.700000   
25%       68.750000  33.950000  22.900000   4.500000   4.650000   0.975000   
50%       72.000000  35.100000  25.800000   5.800000   6.100000   1.200000   
75%       75.000000  35.550000  26.950000   8.150000   6.925000   1.325000   
max       80.000000  37.500000  33.900000  12.600000  10.900000   2.000000   

             bpg         to        fg%        3p%        2p%        ft%  \
count  24.000000  24.000000  24.000000  24.000000  24.000000  24.000000   
mean    0.716667   2.750000   0.494750   0.366958   0.544000   0.828292   
std     0.465941   0.656738   0.047108   0.039919   0.047424   0.069280 

In [4]:
from sklearn.preprocessing import StandardScaler
# Box-score columns to standardise.
features = ['ppg', 'rpg', 'apg', 'spg', 'bpg', 'to', 'fg%', 'ft%']

# Learn the scaling parameters from the full table, then replace the columns
# with their scaled values.
# NOTE: this overwrites the original values in df in place, so every later
# cell that reads these columns sees the scaled values, not the raw stats.
scaler = StandardScaler()
scaler.fit(df[features])
df[features] = scaler.transform(df[features])


In [5]:

# Weight of each min-max-normalised stat in the CEM score. Turnovers ('to')
# carry a negative weight, so they lower the score.
cem_weights = {
    'ppg': 0.35,
    'rpg': 0.20,
    'apg': 0.20,
    'spg': 0.05,
    'bpg': 0.05,
    'to': -0.10,
    'fg%': 0.10,
    'ft%': 0.05,
}

# Rescale every weighted stat to the [0, 1] range and store it as norm_<stat>.
for column in cem_weights:
    df[f'norm_{column}'] = df[column].sub(df[column].min()).div(df[column].max() - df[column].min())

# CEM is the weighted sum of the normalised stats, accumulated in the order
# the weights are listed above.
df['CEM'] = sum(weight * df[f'norm_{stat}'] for stat, weight in cem_weights.items())

# Scale CEM by the natural log of games played.
df['Final_CEM'] = df['CEM'].mul(np.log(df['games_played']))

# Show the players ranked from highest to lowest Final_CEM.
print(df.sort_values(by='Final_CEM', ascending=False).loc[:, ['player_name', 'Final_CEM']])


                player_name  Final_CEM
14             Nikola Jokic   2.746487
15              Luka Doncic   2.705433
2     Giannis Antetekounmpo   2.616970
12  Shai-Gilgeous Alexander   2.345755
19            Anthony Davis   2.152382
16             LeBron James   1.801004
3              Jayson Tatum   1.645113
13             Kevin Durant   1.630974
20             Devin Booker   1.598957
8             Jalen Brunson   1.588080
10         Donovan Mitchell   1.483675
9              Tyrese Maxey   1.458023
22            Kawhi Leonard   1.402828
1         Tyrese Haliburton   1.338404
21          Anthony Edwards   1.294430
11               Trae Young   1.254123
18            Stephen Curry   1.230915
0            Damian Lillard   1.114866
7            Scottie Barnes   1.034849
23              Paul George   1.015634
4               Bam Adebayo   0.967707
6            Paolo Banchero   0.961391
17       Karl-Anthony Towns   0.927050
5              Jaylen Brown   0.884502


In [6]:
# Min-max rescale every stat used below to the [0, 1] range so the weights act
# on comparable scales. Results are stored as norm_<stat> columns on df.
features = ['ppg', 'rpg', 'apg', 'spg', 'bpg', 'to', 'fg%', '3p%', '2p%', 'ft%']
for feature in features:
    df[f'norm_{feature}'] = (df[feature] - df[feature].min()) / (df[feature].max() - df[feature].min())

# PER-like composite: weighted sum of the normalised stats, with turnovers
# ('to') subtracted as a penalty.
df['PER_like'] = (0.35 * df['norm_ppg'] + 0.20 * df['norm_rpg'] + 0.20 * df['norm_apg'] +
                  0.05 * df['norm_spg'] + 0.05 * df['norm_bpg'] + 0.10 * df['norm_fg%'] +
                  0.07 * df['norm_3p%'] + 0.05 * df['norm_2p%'] + 0.05 * df['norm_ft%'] -
                  0.10 * df['norm_to'])

# Scale the composite by the natural log of games played.
df['MVPCEM'] = df['PER_like'] * np.log(df['games_played'])

# Identify the MVP as the row with the highest MVPCEM. idxmax returns that
# row's index label directly, so the frame does not need to be sorted just to
# read its first row.
mvp = df.loc[df['MVPCEM'].idxmax()]
print(f"The MVP based on the MVPCEM statistic is {mvp['player_name']} with a score of {mvp['MVPCEM']:.2f}")




The MVP based on the MVPCEM statistic is Nikola Jokic with a score of 3.10


In [7]:
# Rank players from highest to lowest Final_CEM. sort_values returns a new
# frame, so the Category column added below lives on df_sorted only.
df_sorted = df.sort_values('Final_CEM', ascending=False)

# Tier names, listed from best to worst.
categories = ['MVP Caliber', 'All-NBA Caliber', 'All-Star Caliber']

# Split Final_CEM into three quantile bins. The bins are labelled from the
# lowest scores upward, so the tier names are passed in reverse (worst first).
df_sorted['Category'] = pd.qcut(df_sorted['Final_CEM'], q=3, labels=list(reversed(categories)))

# Show each player's assigned tier.
print(df_sorted.loc[:, ['player_name', 'Category']])


                player_name          Category
14             Nikola Jokic       MVP Caliber
15              Luka Doncic       MVP Caliber
2     Giannis Antetekounmpo       MVP Caliber
12  Shai-Gilgeous Alexander       MVP Caliber
19            Anthony Davis       MVP Caliber
16             LeBron James       MVP Caliber
3              Jayson Tatum       MVP Caliber
13             Kevin Durant       MVP Caliber
20             Devin Booker   All-NBA Caliber
8             Jalen Brunson   All-NBA Caliber
10         Donovan Mitchell   All-NBA Caliber
9              Tyrese Maxey   All-NBA Caliber
22            Kawhi Leonard   All-NBA Caliber
1         Tyrese Haliburton   All-NBA Caliber
21          Anthony Edwards   All-NBA Caliber
11               Trae Young   All-NBA Caliber
18            Stephen Curry  All-Star Caliber
0            Damian Lillard  All-Star Caliber
7            Scottie Barnes  All-Star Caliber
23              Paul George  All-Star Caliber
4               Bam Adebayo  All-S

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Inspect df before modelling. Note that 'Category' was assigned on df_sorted
# (the frame returned by sort_values), so it is not a column of df itself.
print(df.head())  # Check the first few rows to confirm the DataFrame structure
# info() prints its report itself and returns None, so it is called bare;
# wrapping it in print() would add a stray "None" line to the output.
df.info()

             player_name conference  games_played    mp       ppg       rpg  \
0         Damian Lillard       EAST            73  35.3 -0.565217 -0.833585   
1      Tyrese Haliburton       EAST            69  32.2 -1.532066 -1.020209   
2  Giannis Antetekounmpo       EAST            73  35.2  1.485677  1.816470   
3           Jayson Tatum       EAST            74  35.7  0.460230  0.547429   
4            Bam Adebayo       EAST            71  30.0 -1.766454  0.771378   

        apg       spg       bpg        to  ...  norm_bpg   norm_to  norm_fg%  \
0  0.417104 -0.637793 -1.132715 -0.233314  ...  0.000000  0.333333  0.000000   
1  2.207975  0.000000 -0.036539 -0.699942  ...  0.238095  0.222222  0.283422   
2  0.187506  0.000000  0.840402  1.011028  ...  0.428571  0.629630  1.000000   
3 -0.547210 -0.637793 -0.255774 -0.388857  ...  0.190476  0.296296  0.192513   
4 -1.006408 -0.318896  0.401931 -0.855485  ...  0.333333  0.185185  0.518717   

   norm_ft%       CEM  Final_CEM  norm_3p%  

In [9]:
# Keep only the rows of the ranked table whose conference is 'EAST', then
# render them with the notebook's rich display.
east_players = df_sorted.loc[df_sorted['conference'].eq('EAST')]
display(east_players)

Unnamed: 0,player_name,conference,games_played,mp,ppg,rpg,apg,spg,bpg,to,...,norm_to,norm_fg%,norm_ft%,CEM,Final_CEM,norm_3p%,norm_2p%,PER_like,MVPCEM,Category
2,Giannis Antetekounmpo,EAST,73,35.2,1.485677,1.81647,0.187506,0.0,0.840402,1.011028,...,0.62963,1.0,0.0,0.609951,2.61697,0.020548,1.0,0.661389,2.837664,MVP Caliber
3,Jayson Tatum,EAST,74,35.7,0.46023,0.547429,-0.54721,-0.637793,-0.255774,-0.388857,...,0.296296,0.192513,0.661654,0.382223,1.645113,0.719178,0.328283,0.44898,1.932437,MVP Caliber
8,Jalen Brunson,EAST,77,35.4,0.987603,-1.132183,0.279345,-0.956689,-1.132715,-0.5444,...,0.259259,0.139037,0.691729,0.365596,1.58808,0.890411,0.19697,0.437774,1.901604,All-NBA Caliber
10,Donovan Mitchell,EAST,55,35.3,0.372335,-0.572312,0.003827,1.913378,-0.47501,0.077771,...,0.407407,0.203209,0.815789,0.37024,1.483675,0.664384,0.313131,0.432403,1.732784,All-NBA Caliber
9,Tyrese Maxey,EAST,70,37.5,0.167246,-1.094859,0.049746,-0.637793,-0.47501,-1.633199,...,0.0,0.278075,0.781955,0.343186,1.458023,0.856164,0.121212,0.409178,1.73839,All-NBA Caliber
1,Tyrese Haliburton,EAST,69,32.2,-1.532066,-1.020209,2.207975,0.0,-0.036539,-0.699942,...,0.222222,0.283422,0.744361,0.316101,1.338404,0.636986,0.0,0.36069,1.527199,All-NBA Caliber
11,Trae Young,EAST,54,36.0,0.108649,-1.430781,2.162055,0.318896,-1.132715,2.566455,...,1.0,0.032086,0.744361,0.314397,1.254123,0.69863,0.010101,0.363806,1.451215,All-NBA Caliber
0,Damian Lillard,EAST,73,35.3,-0.565217,-0.833585,0.417104,-0.637793,-1.132715,-0.233314,...,0.333333,0.0,0.988722,0.259848,1.114866,0.568493,0.065657,0.302925,1.299688,All-Star Caliber
7,Scottie Barnes,EAST,60,35.0,-1.590663,0.286156,0.003827,0.318896,1.717343,0.077771,...,0.407407,0.294118,0.466165,0.252751,1.034849,0.479452,0.29798,0.301211,1.233263,All-Star Caliber
4,Bam Adebayo,EAST,71,30.0,-1.766454,0.771378,-1.006408,-0.318896,0.401931,-0.855485,...,0.185185,0.518717,0.368421,0.227018,0.967707,0.589041,0.388889,0.287696,1.226355,All-Star Caliber


In [13]:
# Select the 'EAST' rows of df_sorted, the ranked frame that carries the
# 'Category' column.
east_playerss = df_sorted[df_sorted['conference'] == 'EAST']

# Verify the new dataset
print(east_playerss.head())  # This will show the first few rows of the new DataFrame
# The filter above is by conference, not by category, so the row count is the
# number of East players and is reported as such.
print(f"Number of East conference players: {east_playerss.shape[0]}")

              player_name conference  games_played    mp       ppg       rpg  \
2   Giannis Antetekounmpo       EAST            73  35.2  1.485677  1.816470   
3            Jayson Tatum       EAST            74  35.7  0.460230  0.547429   
8           Jalen Brunson       EAST            77  35.4  0.987603 -1.132183   
10       Donovan Mitchell       EAST            55  35.3  0.372335 -0.572312   
9            Tyrese Maxey       EAST            70  37.5  0.167246 -1.094859   

         apg       spg       bpg        to  ...   norm_to  norm_fg%  norm_ft%  \
2   0.187506  0.000000  0.840402  1.011028  ...  0.629630  1.000000  0.000000   
3  -0.547210 -0.637793 -0.255774 -0.388857  ...  0.296296  0.192513  0.661654   
8   0.279345 -0.956689 -1.132715 -0.544400  ...  0.259259  0.139037  0.691729   
10  0.003827  1.913378 -0.475010  0.077771  ...  0.407407  0.203209  0.815789   
9   0.049746 -0.637793 -0.475010 -1.633199  ...  0.000000  0.278075  0.781955   

         CEM  Final_CEM  norm_3p

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Input columns for the classifier and the name of the binary label column.
features = ['ppg', 'rpg', 'apg', 'spg', 'bpg']
target = 'Salary_Class'

# Label each player 1 when their Salary is strictly above the median Salary of
# all rows in df, else 0.
# This has to run BEFORE the conference split: boolean indexing returns new
# frames, so a column added to df afterwards would be missing from
# east_players and east_players[target] would raise KeyError on a fresh kernel.
df['Salary_Class'] = (df['Salary'] > df['Salary'].median()).astype(int)

# Splitting the dataset by conference
east_players = df[df['conference'] == 'EAST']
west_players = df[df['conference'] == 'WEST']

# Training a model for East players: hold out 20% of the rows for evaluation,
# with a fixed seed so the split is repeatable.
X_east = east_players[features]
y_east = east_players[target]
X_train_e, X_test_e, y_train_e, y_test_e = train_test_split(X_east, y_east, test_size=0.2, random_state=42)
model_east = LogisticRegression()
model_east.fit(X_train_e, y_train_e)

# Evaluate East model
predictions_east = model_east.predict(X_test_e)
print("East Conference Model Evaluation:")
# zero_division=0 reports 0.0 for a class the model never predicts instead of
# emitting an undefined-metric warning; the printed numbers are unchanged.
print(classification_report(y_test_e, predictions_east, zero_division=0))



East Conference Model Evaluation:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:

# Model inputs and the name of the binary label column.
features = ['ppg', 'rpg', 'apg', 'spg', 'bpg']
target = 'Salary_Class'

# Take an independent copy of each conference's rows, so the label column
# added below is written to the copies rather than to df.
east_players = df.loc[df['conference'].eq('EAST')].copy()
west_players = df.loc[df['conference'].eq('WEST')].copy()

# Label: 1 when Salary is strictly above that conference's own median Salary,
# 0 otherwise.
east_players[target] = east_players['Salary'].gt(east_players['Salary'].median()).astype(int)
west_players[target] = west_players['Salary'].gt(west_players['Salary'].median()).astype(int)


In [22]:
# Feature matrix and labels for the West subset.
X_west, y_west = west_players[features], west_players[target]

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so it is repeatable.
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(
    X_west,
    y_west,
    stratify=y_west,
    random_state=42,
    test_size=0.2,
)

# Fit a logistic regression with default settings; fit() returns the fitted
# estimator itself.
model_west = LogisticRegression().fit(X_train_w, y_train_w)

# Evaluate West model on the held-out rows.
predictions_west = model_west.predict(X_test_w)
print("West Conference Model Evaluation:", classification_report(y_test_w, predictions_west), sep="\n")

West Conference Model Evaluation:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0

