In [1]:
import pandas as pd

# Load the CSV files into DataFrames
nba_17_18 = pd.read_csv("nba17-18.csv")
nba_18_19 = pd.read_csv("nba18-19.csv")
nba_19_20 = pd.read_csv("nba19-20.csv")
nba_20_21 = pd.read_csv("nba20-21.csv")
nba_21_22 = pd.read_csv("nba21-22.csv")
nba_22_23 = pd.read_csv("nba22-23.csv")
nba_23_24 = pd.read_csv("nba23-24.csv")

def clean_data(df):
    # Drop the first, third, and last two columns
    df = df.drop(df.columns[[0, 2, -1, -2]], axis=1)
    return df

# Clean each dataset
nba_17_18 = clean_data(nba_17_18)
nba_18_19 = clean_data(nba_18_19)
nba_19_20 = clean_data(nba_19_20)
nba_20_21 = clean_data(nba_20_21)
nba_21_22 = clean_data(nba_21_22)
nba_22_23 = clean_data(nba_22_23)
nba_23_24 = clean_data(nba_23_24)

# Function to filter rows where "Team" is '2TM' for each player
def filter_tm(df):
    # For each player, keep the row where "Team" is '2TM' or '3TM' if available, else keep the first row
    def get_tm_or_first(group):
        if "3TM" in group["Team"].values:
            return group[group["Team"] == "3TM"]
        elif "2TM" in group["Team"].values:
            return group[group["Team"] == "2TM"]
        else:
            return group.head(1)
    
    return df.groupby("Player", group_keys=False).apply(get_tm_or_first)

# Apply the filtering function to each dataset
nba_17_18 = filter_tm(nba_17_18)
nba_18_19 = filter_tm(nba_18_19)
nba_19_20 = filter_tm(nba_19_20)
nba_20_21 = filter_tm(nba_20_21)
nba_21_22 = filter_tm(nba_21_22)
nba_22_23 = filter_tm(nba_22_23)
nba_23_24 = filter_tm(nba_23_24)

# Add season identifiers to the column names (excluding the first column)
def add_season_prefix(df, season):
    df.columns = [df.columns[0]] + [f"{season}.{col}" for col in df.columns[1:]]
    return df

nba_17_18 = add_season_prefix(nba_17_18, "17-18")
nba_18_19 = add_season_prefix(nba_18_19, "18-19")
nba_19_20 = add_season_prefix(nba_19_20, "19-20")
nba_20_21 = add_season_prefix(nba_20_21, "20-21")
nba_21_22 = add_season_prefix(nba_21_22, "21-22")
nba_22_23 = add_season_prefix(nba_22_23, "22-23")
nba_23_24 = add_season_prefix(nba_23_24, "23-24")




In [2]:
nba_master_data = nba_17_18
for df in [nba_18_19, nba_19_20, nba_20_21, nba_21_22, nba_22_23, nba_23_24]:
    nba_master_data = pd.merge(nba_master_data, df, on="Player", how="outer")

In [10]:
nba_master_data[nba_master_data['Player']=='Obi Toppin']

Unnamed: 0,Player,17-18.Team,17-18.Pos,17-18.G,17-18.GS,17-18.MP,17-18.FG,17-18.FGA,17-18.FG%,17-18.3P,...,23-24.FT%,23-24.ORB,23-24.DRB,23-24.TRB,23-24.AST,23-24.STL,23-24.BLK,23-24.TOV,23-24.PF,23-24.PTS
847,Obi Toppin,,,,,,,,,,...,0.77,1.0,2.9,3.9,1.6,0.6,0.5,0.8,1.7,10.3


In [14]:
# filtering to first 3 nba seasons

# Extract all season columns (assumes format like "17-18.FG%", "23-24.TRB")
season_cols = [col for col in nba_master_data.columns if any(char.isdigit() for char in col)]

# Extract seasons from column names
seasons = sorted(set(col.split('.')[0] for col in season_cols))

# Function to determine first season played
def first_season(row):
    for season in seasons:
        if not pd.isna(row[f"{season}.Team"]):
            return season
    return None

# Find each player's first season
nba_master_data["First_Season"] = nba_master_data.apply(first_season, axis=1)

# Function to get first 3 seasons' stats
def extract_first_three_seasons(row):
    first_season = row["First_Season"]
    if first_season is None:
        return {}

    start_index = seasons.index(first_season)
    season_map = {seasons[start_index]: "yr1", 
                  seasons[start_index + 1] if start_index + 1 < len(seasons) else None: "yr2",
                  seasons[start_index + 2] if start_index + 2 < len(seasons) else None: "yr3"}
    
    new_row = {}
    for old_season, new_prefix in season_map.items():
        if old_season is None:
            continue
        for col in season_cols:
            if col.startswith(old_season):
                stat = col.split('.', 1)[1]
                new_row[f"{new_prefix}.{stat}"] = row[col]
    
    return new_row

In [15]:
# Apply transformation
nba_master_data_new = nba_master_data.apply(lambda row: pd.Series(extract_first_three_seasons(row)), axis=1)

# Keep only necessary columns
df_final = pd.concat([df[['Player']], nba_master_data_new], axis=1)

# Save to CSV
df_final.to_csv("nba_first_3_szns.csv", index=False)

print(df_final.head())

            Player  yr1.2P  yr1.2P%  yr1.2PA  yr1.3P  yr1.3P%  yr1.3PA  \
472     A.J. Green     0.5    0.612      0.8     0.7    0.333      2.0   
583    A.J. Lawson     1.4    0.504      2.8     0.1    0.167      0.5   
626     AJ Griffin     2.2    0.553      3.9     0.8    0.325      2.5   
122   Aaron Gordon     0.1    0.125      0.5     0.4    0.412      1.1   
339  Aaron Holiday     0.3    0.714      0.5     0.5    0.381      1.4   

     yr1.AST  yr1.BLK  yr1.DRB  ...  yr3.MP  yr3.ORB  yr3.PF  yr3.PTS  \
472      0.3      0.2      0.5  ...    22.4      0.4     2.8      3.9   
583      0.3      0.2      1.2  ...    23.6      1.1     2.6     11.6   
626      1.2      0.6      3.2  ...    29.3      1.2     2.2     12.7   
122      1.0      0.2      0.9  ...     9.5      0.1     0.8      2.0   
339      0.2      0.1      0.7  ...     5.8      0.1     0.6      1.3   

     yr3.Pos  yr3.STL  yr3.TOV  yr3.TRB  yr3.Team  yr3.eFG%  
472       SF      0.5      0.6      1.3       OKC     