In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## location of the flights data set (a CSV file hosted on GitHub)
flights_url = 'https://raw.githubusercontent.com/martinwg/ISA591/refs/heads/main/data/flights.csv'

## read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer = flights_url)

In [3]:
## peek at the first five rows to check the columns that were loaded
df.head(n = 5)

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0
3,1,1,4,AA,DFW,DCA,1555,7.0,126.0,1192,1935,-7.0,0,0
4,1,1,4,WN,LAX,MCI,1720,48.0,166.0,1363,2225,39.0,0,0


In [5]:
## number of flights per destination airport, most frequent first
df['DEST_AIR'].value_counts()

Unnamed: 0_level_0,count
DEST_AIR,Unnamed: 1_level_1
LAX,1991
SFO,1637
ORD,1634
DEN,1581
DFW,1454
...,...
MMH,4
ITO,2
CEC,2
IMT,2


In [6]:
## number of distinct destination airports (levels of the categorical variable)
df['DEST_AIR'].nunique()

271

In [7]:
## import the random forest
## use RandomForestClassifier if y is categorical
## use RandomForestRegressor if y is numeric
from sklearn.ensemble import RandomForestClassifier

## example: we keep only the response AND the categorical variable whose levels we want to reduce
## suppose in this case you want to predict the prob a flight is cancelled
## your predictor destination is categorical with many levels (271 in this example)
yvar = 'CANCELLED'
xvar = 'DEST_AIR'

## we one-hot the predictor (you can name this dataset as you please)
## columns = [xvar] encodes exactly that column and leaves the response untouched,
## instead of relying on get_dummies guessing from the dtypes
df_to_collapse = pd.get_dummies(df[[yvar, xvar]], columns = [xvar])

## split once into the dummy predictors and the response
X_levels = df_to_collapse.drop(columns = yvar)
y_target = df_to_collapse[yvar]

## we fit the random forest
## random_state is fixed so the importance ranking is the same on every run;
## without it the "top" levels can change each time the cell is executed
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_levels, y_target)

## one row per dummy column, sorted from most to least important
feature_importances = (
    pd.DataFrame({'Variable': X_levels.columns, 'Importance': rf.feature_importances_})
    .set_index('Variable')
    .sort_values(by = 'Importance', ascending = False)
)

In [8]:
## we can now check the feature importances
## (one row per one-hot level of the predictor, sorted from most to least important)
feature_importances

Unnamed: 0_level_0,Importance
Variable,Unnamed: 1_level_1
DEST_AIR_CEC,0.043690
DEST_AIR_GRB,0.041453
DEST_AIR_OTH,0.035108
DEST_AIR_CMI,0.026750
DEST_AIR_CMX,0.026444
...,...
DEST_AIR_INL,0.000015
DEST_AIR_MMH,0.000014
DEST_AIR_IMT,0.000007
DEST_AIR_ITO,0.000007


In [9]:
## you need to then decide how many of the most important levels to keep
## suppose you want to keep the top 4
## these are the airports identified to be the most important
## to predict the likelihood of the flight being cancelled
## it can be they are more likely or less likely than the rest
## (update this list if your feature importances rank different levels at the top)
top_levels = ['CEC', 'GRB', 'OTH', 'CMI']

## the guard makes the cell safe to re-run: once DEST_AIR has been dropped
## there is nothing left to encode, so we skip instead of raising a KeyError
if 'DEST_AIR' in df.columns:
    ## create one 0/1 dummy variable per level we decided to keep
    for level in top_levels:
        df[f'DEST_AIR_{level}'] = (df['DEST_AIR'] == level).astype('int')

    ## remember to drop the original variable
    df = df.drop(columns = 'DEST_AIR')


In [10]:
## the original DEST_AIR column is gone; in its place the data set
## now carries only the dummy variables for the levels we kept
df.head(n = 5)

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED,DEST_AIR_CEC,DEST_AIR_GRB,DEST_AIR_OTH,DEST_AIR_CMI
0,1,1,4,WN,LAX,1625,58.0,94.0,590,1905,65.0,0,0,0,0,0,0
1,1,1,4,UA,DEN,823,7.0,154.0,1452,1333,-13.0,0,0,0,0,0,0
2,1,1,4,MQ,DFW,1305,36.0,85.0,641,1453,35.0,0,0,0,0,0,0
3,1,1,4,AA,DFW,1555,7.0,126.0,1192,1935,-7.0,0,0,0,0,0,0
4,1,1,4,WN,LAX,1720,48.0,166.0,1363,2225,39.0,0,0,0,0,0,0
