In [1]:
# import libraries
import pandas as pd
import numpy as np


# Import Data

In [2]:
# import dataset
path = "/content/NBA_raw_player_data.csv"
data = pd.read_csv(path)

df = pd.DataFrame(data)
df

Unnamed: 0.1,Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,0,1951-12-25,Bullets,,Don Barksdale,placed on IL
1,1,1952-12-26,Knicks,,Max Zaslofsky,placed on IL with torn side muscle
2,2,1956-12-29,Knicks,,Jim Baechtold,placed on inactive list
3,3,1959-01-16,Lakers,,Elgin Baylor,player refused to play after being denied a ro...
4,4,1961-11-26,Lakers,,Elgin Baylor,player reported for military duty
...,...,...,...,...,...,...
37662,37662,2023-04-16,Clippers,Marcus Morris,,activated from IL
37663,37663,2023-04-16,Grizzlies,Dillon Brooks,,activated from IL
37664,37664,2023-04-16,Grizzlies,Ja Morant,,activated from IL
37665,37665,2023-04-16,Grizzlies,Jaren Jackson Jr.,,activated from IL


# Clean data

In [3]:
# drop columns
new_df = df.drop(columns=['Unnamed: 0'])
new_df

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,1951-12-25,Bullets,,Don Barksdale,placed on IL
1,1952-12-26,Knicks,,Max Zaslofsky,placed on IL with torn side muscle
2,1956-12-29,Knicks,,Jim Baechtold,placed on inactive list
3,1959-01-16,Lakers,,Elgin Baylor,player refused to play after being denied a ro...
4,1961-11-26,Lakers,,Elgin Baylor,player reported for military duty
...,...,...,...,...,...
37662,2023-04-16,Clippers,Marcus Morris,,activated from IL
37663,2023-04-16,Grizzlies,Dillon Brooks,,activated from IL
37664,2023-04-16,Grizzlies,Ja Morant,,activated from IL
37665,2023-04-16,Grizzlies,Jaren Jackson Jr.,,activated from IL


In [4]:
# find where acquired and relinquished are the same
same_values = new_df[new_df['Acquired'].notna() & new_df['Relinquished'].notna()]
same_values

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
14793,2010-01-16,Jazz,Kyrylo Fesenko,Kyrylo Fesenko,activated from IL


In [5]:
# rename acquired and relinquished to player name for readability
new_df['player_name'] = new_df[["Acquired", "Relinquished"]].fillna('').sum(axis=1)

# drop acquired and relinquished
new_df = new_df.drop(columns=["Acquired",'Relinquished'])
new_df

Unnamed: 0,Date,Team,Notes,player_name
0,1951-12-25,Bullets,placed on IL,Don Barksdale
1,1952-12-26,Knicks,placed on IL with torn side muscle,Max Zaslofsky
2,1956-12-29,Knicks,placed on inactive list,Jim Baechtold
3,1959-01-16,Lakers,player refused to play after being denied a ro...,Elgin Baylor
4,1961-11-26,Lakers,player reported for military duty,Elgin Baylor
...,...,...,...,...
37662,2023-04-16,Clippers,activated from IL,Marcus Morris
37663,2023-04-16,Grizzlies,activated from IL,Dillon Brooks
37664,2023-04-16,Grizzlies,activated from IL,Ja Morant
37665,2023-04-16,Grizzlies,activated from IL,Jaren Jackson Jr.


In [6]:
# search for nulls
new_df.isnull().sum()

Unnamed: 0,0
Date,0
Team,0
Notes,0
player_name,0


In [7]:
# rename columns
new_df = new_df.rename(columns={'Date': 'date', 'Team': 'team', 'Notes': 'notes'})
new_df

Unnamed: 0,date,team,notes,player_name
0,1951-12-25,Bullets,placed on IL,Don Barksdale
1,1952-12-26,Knicks,placed on IL with torn side muscle,Max Zaslofsky
2,1956-12-29,Knicks,placed on inactive list,Jim Baechtold
3,1959-01-16,Lakers,player refused to play after being denied a ro...,Elgin Baylor
4,1961-11-26,Lakers,player reported for military duty,Elgin Baylor
...,...,...,...,...
37662,2023-04-16,Clippers,activated from IL,Marcus Morris
37663,2023-04-16,Grizzlies,activated from IL,Dillon Brooks
37664,2023-04-16,Grizzlies,activated from IL,Ja Morant
37665,2023-04-16,Grizzlies,activated from IL,Jaren Jackson Jr.


In [8]:
# change to datetime
new_df['date'] = pd.to_datetime(df.Date)
new_df

Unnamed: 0,date,team,notes,player_name
0,1951-12-25,Bullets,placed on IL,Don Barksdale
1,1952-12-26,Knicks,placed on IL with torn side muscle,Max Zaslofsky
2,1956-12-29,Knicks,placed on inactive list,Jim Baechtold
3,1959-01-16,Lakers,player refused to play after being denied a ro...,Elgin Baylor
4,1961-11-26,Lakers,player reported for military duty,Elgin Baylor
...,...,...,...,...
37662,2023-04-16,Clippers,activated from IL,Marcus Morris
37663,2023-04-16,Grizzlies,activated from IL,Dillon Brooks
37664,2023-04-16,Grizzlies,activated from IL,Ja Morant
37665,2023-04-16,Grizzlies,activated from IL,Jaren Jackson Jr.


# Preprocessing

In [9]:
# clean notes
new_df['notes_clean'] = new_df['notes'].str.lower().str.replace(r'[^a-z0-9\s]', '', regex=True).str.strip()
new_df


Unnamed: 0,date,team,notes,player_name,notes_clean
0,1951-12-25,Bullets,placed on IL,Don Barksdale,placed on il
1,1952-12-26,Knicks,placed on IL with torn side muscle,Max Zaslofsky,placed on il with torn side muscle
2,1956-12-29,Knicks,placed on inactive list,Jim Baechtold,placed on inactive list
3,1959-01-16,Lakers,player refused to play after being denied a ro...,Elgin Baylor,player refused to play after being denied a ro...
4,1961-11-26,Lakers,player reported for military duty,Elgin Baylor,player reported for military duty
...,...,...,...,...,...
37662,2023-04-16,Clippers,activated from IL,Marcus Morris,activated from il
37663,2023-04-16,Grizzlies,activated from IL,Dillon Brooks,activated from il
37664,2023-04-16,Grizzlies,activated from IL,Ja Morant,activated from il
37665,2023-04-16,Grizzlies,activated from IL,Jaren Jackson Jr.,activated from il


In [10]:
new_df = new_df.drop(columns='notes')
new_df

Unnamed: 0,date,team,player_name,notes_clean
0,1951-12-25,Bullets,Don Barksdale,placed on il
1,1952-12-26,Knicks,Max Zaslofsky,placed on il with torn side muscle
2,1956-12-29,Knicks,Jim Baechtold,placed on inactive list
3,1959-01-16,Lakers,Elgin Baylor,player refused to play after being denied a ro...
4,1961-11-26,Lakers,Elgin Baylor,player reported for military duty
...,...,...,...,...
37662,2023-04-16,Clippers,Marcus Morris,activated from il
37663,2023-04-16,Grizzlies,Dillon Brooks,activated from il
37664,2023-04-16,Grizzlies,Ja Morant,activated from il
37665,2023-04-16,Grizzlies,Jaren Jackson Jr.,activated from il


In [11]:
# clean names
# fix slashes for more than one name
new_df['player_name'] = new_df['player_name'].str.split('/').str[-1].str.strip()
new_df

Unnamed: 0,date,team,player_name,notes_clean
0,1951-12-25,Bullets,Don Barksdale,placed on il
1,1952-12-26,Knicks,Max Zaslofsky,placed on il with torn side muscle
2,1956-12-29,Knicks,Jim Baechtold,placed on inactive list
3,1959-01-16,Lakers,Elgin Baylor,player refused to play after being denied a ro...
4,1961-11-26,Lakers,Elgin Baylor,player reported for military duty
...,...,...,...,...
37662,2023-04-16,Clippers,Marcus Morris,activated from il
37663,2023-04-16,Grizzlies,Dillon Brooks,activated from il
37664,2023-04-16,Grizzlies,Ja Morant,activated from il
37665,2023-04-16,Grizzlies,Jaren Jackson Jr.,activated from il


In [12]:
# normalize activated and returned to be uniform
# looking for returned in the notes
returned = new_df[new_df['notes_clean'].str.contains('returned', case=False, na=False)]
returned.value_counts().count()



np.int64(3)

In [13]:
# change returned to activated
new_df['notes_clean'] = new_df['notes_clean'].str.replace('returned to lineup', 'activated', case=False, regex=True)
new_df

Unnamed: 0,date,team,player_name,notes_clean
0,1951-12-25,Bullets,Don Barksdale,placed on il
1,1952-12-26,Knicks,Max Zaslofsky,placed on il with torn side muscle
2,1956-12-29,Knicks,Jim Baechtold,placed on inactive list
3,1959-01-16,Lakers,Elgin Baylor,player refused to play after being denied a ro...
4,1961-11-26,Lakers,Elgin Baylor,player reported for military duty
...,...,...,...,...
37662,2023-04-16,Clippers,Marcus Morris,activated from il
37663,2023-04-16,Grizzlies,Dillon Brooks,activated from il
37664,2023-04-16,Grizzlies,Ja Morant,activated from il
37665,2023-04-16,Grizzlies,Jaren Jackson Jr.,activated from il


In [14]:
# change to activated to be the same
new_df['notes_clean'] = new_df['notes_clean'].str.replace('player returned to team', 'activated', case=False, regex=True)
new_df

Unnamed: 0,date,team,player_name,notes_clean
0,1951-12-25,Bullets,Don Barksdale,placed on il
1,1952-12-26,Knicks,Max Zaslofsky,placed on il with torn side muscle
2,1956-12-29,Knicks,Jim Baechtold,placed on inactive list
3,1959-01-16,Lakers,Elgin Baylor,player refused to play after being denied a ro...
4,1961-11-26,Lakers,Elgin Baylor,player reported for military duty
...,...,...,...,...
37662,2023-04-16,Clippers,Marcus Morris,activated from il
37663,2023-04-16,Grizzlies,Dillon Brooks,activated from il
37664,2023-04-16,Grizzlies,Ja Morant,activated from il
37665,2023-04-16,Grizzlies,Jaren Jackson Jr.,activated from il


In [15]:
activated = new_df[new_df['notes_clean'].str.contains('activated', case=False, na=False)]
activated = activated.rename(columns={'date':'activated'})
activated = activated.drop(columns=['notes_clean'])

activated.count()

Unnamed: 0,0
activated,17615
team,17615
player_name,17615


In [17]:
# get injury notes only
new_df_filtered = new_df[~new_df['notes_clean'].str.contains('activated', case=False, na=False)]

new_df_filtered = new_df_filtered.rename(columns={'date':'injured'})
new_df_filtered.value_counts()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
injured,team,player_name,notes_clean,Unnamed: 4_level_1
2019-02-25,Warriors,Jacob Evans,placed on il,2
2017-02-13,Spurs,Bryn Forbes,placed on il,1
2017-02-13,Pacers,Rakeem Christmas,placed on il,1
2017-02-13,Celtics,Jaylen Brown,placed on il with strained right hip flexor,1
2017-02-13,Celtics,Chris Wilcox,placed on il with left achilles tendinitis,1
...,...,...,...,...
2008-04-01,Bulls,Drew Gooden,placed on il with abdominal strain,1
2008-04-01,Bucks,Jake Voskuhl,placed on il with sprained left ankle,1
2008-03-31,Pacers,David Harrison,placed on il with illness,1
2008-03-31,Jazz,Ronnie Brewer,placed on il with strained right groin,1


In [18]:
# rename columns
new_df = new_df.rename(columns={'date':'date_injured'})
new_df.sort_values(by='player_name')

Unnamed: 0,date_injured,team,player_name,notes_clean
29351,2019-03-16,Nets,,activated from il
19200,2013-03-01,Hawks,(James) Mike Scott,placed on il
20907,2014-11-12,Hawks,(James) Mike Scott,placed on il with back injury
27582,2018-03-21,Wizards,(James) Mike Scott,placed on il with flu
29561,2019-04-09,76ers,(James) Mike Scott,placed on il with tightness in lower back
...,...,...,...,...
35740,2022-04-06,Warriors,placed on IL with right Achilles tendon injury,activated from il
37182,2023-03-04,Jazz,placed on IL with sprained right thumb,activated from il
35742,2022-04-07,Bucks,placed on IL with surgery on right knee,activated from il
32352,2021-04-02,Pistons,placed on IL with torn labrum in right hip,activated from il


In [22]:
merged_df = activated.merge(new_df_filtered, on='player_name', how='inner')
merged_df


Unnamed: 0,activated,team_x,player_name,injured,team_y,notes_clean
0,1962-11-06,Zephyrs,Al Ferrari,1962-10-25,Zephyrs,placed on disabled list
1,1962-11-06,Zephyrs,Al Ferrari,1962-11-14,Zephyrs,placed on disabled list with knee injury
2,1969-10-28,Bucks,Bob Greacen,1969-10-15,Bucks,placed on il with sprained ankle
3,1972-11-28,Tams (ABA),Merv Jackson,1972-11-10,Tams (ABA),placed on il
4,1972-12-16,Tams (ABA),Randy Denton,1972-11-11,Tams (ABA),placed on il
...,...,...,...,...,...,...
270651,2023-04-16,Grizzlies,Santi Aldama,2022-02-07,Grizzlies,placed on il with sore right foot
270652,2023-04-16,Grizzlies,Santi Aldama,2022-04-09,Grizzlies,placed on il
270653,2023-04-16,Grizzlies,Santi Aldama,2022-04-19,Grizzlies,placed on il with sore right knee out for season
270654,2023-04-16,Grizzlies,Santi Aldama,2022-11-25,Grizzlies,placed on il with illness


In [24]:
# create copy
sort_df = merged_df

In [25]:
sort_df

Unnamed: 0,activated,team_x,player_name,injured,team_y,notes_clean
0,1962-11-06,Zephyrs,Al Ferrari,1962-10-25,Zephyrs,placed on disabled list
1,1962-11-06,Zephyrs,Al Ferrari,1962-11-14,Zephyrs,placed on disabled list with knee injury
2,1969-10-28,Bucks,Bob Greacen,1969-10-15,Bucks,placed on il with sprained ankle
3,1972-11-28,Tams (ABA),Merv Jackson,1972-11-10,Tams (ABA),placed on il
4,1972-12-16,Tams (ABA),Randy Denton,1972-11-11,Tams (ABA),placed on il
...,...,...,...,...,...,...
270651,2023-04-16,Grizzlies,Santi Aldama,2022-02-07,Grizzlies,placed on il with sore right foot
270652,2023-04-16,Grizzlies,Santi Aldama,2022-04-09,Grizzlies,placed on il
270653,2023-04-16,Grizzlies,Santi Aldama,2022-04-19,Grizzlies,placed on il with sore right knee out for season
270654,2023-04-16,Grizzlies,Santi Aldama,2022-11-25,Grizzlies,placed on il with illness


In [26]:
# find closest activaction for potential season ending injuries
# Ensure dates are in datetime format
sort_df['activated'] = pd.to_datetime(sort_df['activated'], errors='coerce')
sort_df['injured'] = pd.to_datetime(sort_df['injured'], errors='coerce')

# filter rows where 'activated' exists and is on or after 'injured' date, for each player
valid_activations = sort_df[
    sort_df['activated'].notna() & (sort_df['activated'] >= sort_df['injured'])
]

# find the closest activation date for each injury by grouping on 'player_name' and 'injured'
closest_activations = (
    valid_activations.groupby(['player_name', 'injured'])['activated']
    .min()
    .reset_index()
    .rename(columns={'activated': 'ClosestActivationDate'})
)

# merge this closest activation date back to the original dataframe
sort_df = sort_df.merge(closest_activations, on=['player_name', 'injured'], how='left')

# handle cases where no activation date exists
sort_df['ClosestActivationDate'] = sort_df['ClosestActivationDate'].fillna('No Activation Date')

sort_df

Unnamed: 0,activated,team_x,player_name,injured,team_y,notes_clean,ClosestActivationDate
0,1962-11-06,Zephyrs,Al Ferrari,1962-10-25,Zephyrs,placed on disabled list,1962-11-06 00:00:00
1,1962-11-06,Zephyrs,Al Ferrari,1962-11-14,Zephyrs,placed on disabled list with knee injury,No Activation Date
2,1969-10-28,Bucks,Bob Greacen,1969-10-15,Bucks,placed on il with sprained ankle,1969-10-28 00:00:00
3,1972-11-28,Tams (ABA),Merv Jackson,1972-11-10,Tams (ABA),placed on il,1972-11-28 00:00:00
4,1972-12-16,Tams (ABA),Randy Denton,1972-11-11,Tams (ABA),placed on il,1972-12-16 00:00:00
...,...,...,...,...,...,...,...
270651,2023-04-16,Grizzlies,Santi Aldama,2022-02-07,Grizzlies,placed on il with sore right foot,2022-02-12 00:00:00
270652,2023-04-16,Grizzlies,Santi Aldama,2022-04-09,Grizzlies,placed on il,2022-04-10 00:00:00
270653,2023-04-16,Grizzlies,Santi Aldama,2022-04-19,Grizzlies,placed on il with sore right knee out for season,2022-11-27 00:00:00
270654,2023-04-16,Grizzlies,Santi Aldama,2022-11-25,Grizzlies,placed on il with illness,2022-11-27 00:00:00


In [27]:
sort_df['ClosestActivationDate'] = pd.to_datetime(sort_df['ClosestActivationDate'], errors='coerce').dt.date


In [28]:
sort_df = sort_df.drop(columns=['activated'])

In [29]:
sort_df = sort_df.rename(columns={'ClosestActivationDate':'activated'})
sort_df = sort_df[['player_name','notes_clean','injured','activated']]
sort_df

Unnamed: 0,player_name,notes_clean,injured,activated
0,Al Ferrari,placed on disabled list,1962-10-25,1962-11-06
1,Al Ferrari,placed on disabled list with knee injury,1962-11-14,NaT
2,Bob Greacen,placed on il with sprained ankle,1969-10-15,1969-10-28
3,Merv Jackson,placed on il,1972-11-10,1972-11-28
4,Randy Denton,placed on il,1972-11-11,1972-12-16
...,...,...,...,...
270651,Santi Aldama,placed on il with sore right foot,2022-02-07,2022-02-12
270652,Santi Aldama,placed on il,2022-04-09,2022-04-10
270653,Santi Aldama,placed on il with sore right knee out for season,2022-04-19,2022-11-27
270654,Santi Aldama,placed on il with illness,2022-11-25,2022-11-27


In [30]:
sort_df['activated'] = pd.to_datetime(sort_df['activated'], errors='coerce')


In [31]:
# get total number of days injured
sort_df['days_injured'] = (sort_df['activated'] - sort_df['injured']).dt.days
sort_df

Unnamed: 0,player_name,notes_clean,injured,activated,days_injured
0,Al Ferrari,placed on disabled list,1962-10-25,1962-11-06,12.0
1,Al Ferrari,placed on disabled list with knee injury,1962-11-14,NaT,
2,Bob Greacen,placed on il with sprained ankle,1969-10-15,1969-10-28,13.0
3,Merv Jackson,placed on il,1972-11-10,1972-11-28,18.0
4,Randy Denton,placed on il,1972-11-11,1972-12-16,35.0
...,...,...,...,...,...
270651,Santi Aldama,placed on il with sore right foot,2022-02-07,2022-02-12,5.0
270652,Santi Aldama,placed on il,2022-04-09,2022-04-10,1.0
270653,Santi Aldama,placed on il with sore right knee out for season,2022-04-19,2022-11-27,222.0
270654,Santi Aldama,placed on il with illness,2022-11-25,2022-11-27,2.0


In [32]:
sort_df.isnull().sum()

Unnamed: 0,0
player_name,0
notes_clean,0
injured,0
activated,5325
days_injured,5325


# Zero-shot classification for injury notes

In [33]:
# create labels for injuries
def classify_injury_type(note):
    note = note.lower()
    if any(word in note for word in ["muscle", "strain", "pulled", "hamstring", "quad"]):
        return "muscle"
    elif any(word in note for word in ["fracture", "broken", "stress fracture"]):
        return "bone"
    elif any(word in note for word in ["knee", "ankle", "shoulder", "sprain", "ligament", "acl", "mcl"]):
        return "joint"
    elif "concussion" in note or "head" in note:
        return "concussion"
    elif any(word in note for word in ["flu", "illness", "virus", "fever", "cold", "infection"]):
        return "illness"
    else:
        return "unspecified"

# create labels for injury duration
def classify_duration(days):
    if pd.isna(days):
        return "unknown"
    elif days <= 7:
        return "short"
    elif days <= 45:
        return "medium"
    else:
        return "long"

In [34]:
# apply it to dataset
sort_df["injury_type"] = sort_df["notes_clean"].apply(classify_injury_type)
sort_df["injury_duration"] = sort_df["days_injured"].apply(classify_duration)

In [35]:
sort_df.head()

Unnamed: 0,player_name,notes_clean,injured,activated,days_injured,injury_type,injury_duration
0,Al Ferrari,placed on disabled list,1962-10-25,1962-11-06,12.0,unspecified,medium
1,Al Ferrari,placed on disabled list with knee injury,1962-11-14,NaT,,joint,unknown
2,Bob Greacen,placed on il with sprained ankle,1969-10-15,1969-10-28,13.0,joint,medium
3,Merv Jackson,placed on il,1972-11-10,1972-11-28,18.0,unspecified,medium
4,Randy Denton,placed on il,1972-11-11,1972-12-16,35.0,unspecified,medium


In [36]:
# drop unspecified
unspecified_count = sort_df[sort_df['injury_type'] == 'unspecified'].count()
unspecified_count

Unnamed: 0,0
player_name,145398
notes_clean,145398
injured,145398
activated,141897
days_injured,141897
injury_type,145398
injury_duration,145398


In [37]:
sort_df = sort_df[sort_df['injury_type'] != 'unspecified']

In [39]:
sort_df.to_csv('cleaned_data.csv', index=False)