In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

from collections import Counter

In [17]:
def replace_rare_categories(df, threshold=0.05, replacement='Other'):
    
    df_copy = df.copy()
    total_rows = len(df_copy)
    
    # Get categorical columns
    cat_cols = df_copy.select_dtypes(include=['object']).columns
    cat_cols = cat_cols.drop(["AIRLINE"])
    
    for column in cat_cols:
        # Count occurrences of each category
        counts = Counter(df_copy[column])
        
        # Find rare categories
        rare_categories = {cat for cat, freq in counts.items() if freq / total_rows < threshold}
        
        # Replace rare categories with 'replacement'
        df_copy[column] = df_copy[column].apply(lambda x: replacement if x in rare_categories else x)
    
    return df_copy


In [12]:
numerical_cols = ['DISTANCE','ori_TMIN','ori_TMAX','ori_SNOW','ori_SNWD','ori_AWND','ori_PRCP','dest_TMIN','dest_TMAX','dest_SNOW','dest_SNWD',
    'dest_AWND','dest_PRCP','year','hour','day_of_week','month','day']
categorical_cols = ['AIRLINE','ORIGIN','DEST']

In [13]:
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Pipeline for preprocessing categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])
model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)])  # Logistic regression model


In [34]:
w_df = pd.read_csv("rileys_data.csv")

#subsample_df = w_df.sample(frac=0.5)

X = subsample_df.drop(columns=['ORIGIN_CITY',"DEST_CITY",'15_DELAYED','DEP_DELAY','ARR_DELAY','CRS_ARR_TIME',
                     'CANCELLED','DIVERTED','ORIGIN_CITY','DEST_CITY','CRS_DEP_Datetime'])
y = subsample_df['15_DELAYED']
X.head()

Unnamed: 0,AIRLINE,ORIGIN,DEST,DISTANCE,ori_TMIN,ori_TMAX,ori_SNOW,ori_SNWD,ori_AWND,ori_PRCP,...,dest_TMAX,dest_SNOW,dest_SNWD,dest_AWND,dest_PRCP,year,month,day,day_of_week,hour
264629,American Airlines Inc.,ORD,LGA,733.0,-138.0,-55.0,0.0,150.0,34.0,0.0,...,-10.0,206.0,130.0,102.0,124.0,2022,1,29,5,15
589851,Delta Air Lines Inc.,DTW,IAH,1075.0,106.0,178.0,0.0,0.0,70.0,0.0,...,339.0,0.0,0.0,39.0,0.0,2022,9,26,0,19
732990,Southwest Airlines Co.,CLE,ATL,554.0,50.0,183.0,0.0,0.0,45.0,0.0,...,233.0,0.0,0.0,30.0,0.0,2022,10,9,6,11
229369,JetBlue Airways,BDL,MCO,1050.0,111.0,228.0,0.0,0.0,42.0,0.0,...,328.0,0.0,0.0,17.0,48.0,2022,9,15,3,17
1251798,Mesa Airlines Inc.,IAD,CLT,322.0,-21.0,61.0,0.0,0.0,26.0,0.0,...,144.0,0.0,0.0,35.0,0.0,2021,12,7,1,22


In [25]:
bool = (X['ORIGIN']=='ATL') | (X['DEST']=='ATL')
atl_data = X[bool]

In [35]:
w_df_processed = replace_rare_categories(X)

X_train, X_test, y_train, y_test = train_test_split(w_df_processed, y, test_size=0.2, random_state=44)

X_train.head()

Unnamed: 0,AIRLINE,ORIGIN,DEST,DISTANCE,ori_TMIN,ori_TMAX,ori_SNOW,ori_SNWD,ori_AWND,ori_PRCP,...,dest_TMAX,dest_SNOW,dest_SNWD,dest_AWND,dest_PRCP,year,month,day,day_of_week,hour
1150786,JetBlue Airways,JFK,JAX,828.0,39.0,100.0,0.0,0.0,58.0,0.0,...,306.0,0.0,0.0,54.0,307.0,2020,3,5,3,8
1058879,SkyWest Airlines Inc.,CLT,DEN,1337.0,172.0,294.0,0.0,0.0,33.0,0.0,...,350.0,0.0,0.0,38.0,0.0,2019,7,24,2,6
984266,Spirit Air Lines,CLE,ATL,554.0,150.0,283.0,0.0,0.0,43.0,0.0,...,278.0,0.0,0.0,47.0,0.0,2019,9,19,3,6
1023634,Southwest Airlines Co.,CMH,BWI,337.0,33.0,56.0,0.0,0.0,37.0,13.0,...,139.0,0.0,0.0,33.0,0.0,2023,1,18,2,5
345101,Republic Airline,CMH,DTW,155.0,-55.0,72.0,0.0,0.0,31.0,0.0,...,72.0,0.0,0.0,51.0,0.0,2021,1,13,2,14


In [20]:
w_df_processed["ORIGIN"].unique()

array(['ORD', 'DTW', 'CLE', 'BDL', 'IAD', 'OMA', 'HSV', 'LGA', 'DEN',
       'JFK', 'MKE', 'SEA', 'DAY', 'SLC', 'OKC', 'IAH', 'AUS', 'CLT',
       'BIL', 'SRQ', 'SAT', 'BWI', 'ATL', 'SJU', 'CVG', 'STL', 'TPA',
       'EWR', 'LEX', 'Other', 'MFR', 'SNA', 'JAX', 'GSO', 'MSO', 'ALB',
       'CHA', 'SDF', 'DFW', 'RDU', 'MCO', 'ABQ', 'ROC', 'LAS', 'BUF',
       'MCI', 'SAV', 'TUS', 'TUL', 'PHX', 'HOU', 'RNO', 'RSW', 'PDX',
       'ICT', 'SGF', 'MEM', 'ILM', 'TYS', 'ELP', 'LIT', 'CMH', 'MHT',
       'PBI', 'GSP', 'AVL', 'SFB', 'SYR', 'FWA', 'GEG', 'PGD', 'CHS',
       'BTV', 'MOB', 'XNA', 'MDW', 'MLI', 'OAK', 'FLL', 'GNV', 'PWM',
       'RIC', 'SBN', 'AGS', 'CRW', 'PNS', 'JAN', 'PIA', 'FAT', 'ONT',
       'ECP', 'BUR', 'ABE', 'MDT', 'FAR', 'SHV', 'BZN', 'ISP', 'PIE'],
      dtype=object)

In [7]:
## this is with 5%
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', max_depth=None, n_jobs=-1, random_state=0)

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Pipeline for preprocessing categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)])  # Logistic regression model

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.98      0.88     98214
           1       0.61      0.13      0.22     27373

    accuracy                           0.79    125587
   macro avg       0.71      0.55      0.55    125587
weighted avg       0.76      0.79      0.74    125587



In [8]:
features = pipeline[:-1].get_feature_names_out()
len(features)

47

In [9]:
importances = model.feature_importances_
imp_df = pd.DataFrame({'Feature':features,'Importance':importances})
imp_df = imp_df.sort_values(by='Importance', ascending=False)
imp_df

Unnamed: 0,Feature,Importance
14,num__hour,0.092089
0,num__DISTANCE,0.083046
11,num__dest_AWND,0.073973
5,num__ori_AWND,0.072842
2,num__ori_TMAX,0.070534
8,num__dest_TMAX,0.070362
7,num__dest_TMIN,0.069848
1,num__ori_TMIN,0.069793
17,num__day,0.064634
15,num__day_of_week,0.040327


In [21]:
#this is with .001

model = RandomForestClassifier(n_estimators=100, class_weight='balanced', max_depth=None, n_jobs=-1, random_state=0)

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Pipeline for preprocessing categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)])  # Logistic regression model

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.98      0.88     98214
           1       0.62      0.12      0.20     27373

    accuracy                           0.79    125587
   macro avg       0.71      0.55      0.54    125587
weighted avg       0.76      0.79      0.73    125587



In [28]:
bool = (w_df['ORIGIN']=='ATL') | (w_df['DEST']=='ATL')
atl_data = w_df[bool]
atl_data.head()

Unnamed: 0,AIRLINE,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,DEP_DELAY,CRS_ARR_TIME,ARR_DELAY,CANCELLED,DIVERTED,...,dest_SNWD,dest_AWND,dest_PRCP,CRS_DEP_Datetime,15_DELAYED,year,month,day,day_of_week,hour
1,Delta Air Lines Inc.,ATL,"Atlanta, GA",BDL,"Hartford, CT",69.0,2112,60.0,0.0,0.0,...,0.0,30.0,0.0,2021-06-11 18:50:00,1,2021,6,11,4,18
3,Delta Air Lines Inc.,BDL,"Hartford, CT",ATL,"Atlanta, GA",-5.0,840,-32.0,0.0,0.0,...,0.0,36.0,0.0,2019-11-20 06:00:00,0,2019,11,20,2,6
24,Delta Air Lines Inc.,DTW,"Detroit, MI",ATL,"Atlanta, GA",2.0,1531,-3.0,0.0,0.0,...,0.0,44.0,0.0,2021-06-12 13:30:00,0,2021,6,12,5,13
36,Delta Air Lines Inc.,CMH,"Columbus, OH",ATL,"Atlanta, GA",-4.0,1220,-14.0,0.0,0.0,...,0.0,36.0,0.0,2020-08-16 10:50:00,0,2020,8,16,6,10
44,Delta Air Lines Inc.,CHS,"Charleston, SC",ATL,"Atlanta, GA",-8.0,1408,-21.0,0.0,0.0,...,0.0,48.0,0.0,2019-02-07 12:44:00,0,2019,2,7,3,12


In [29]:
atl_X = atl_data.drop(columns=['ORIGIN_CITY',"DEST_CITY",'15_DELAYED','DEP_DELAY','ARR_DELAY','CRS_ARR_TIME',
                     'CANCELLED','DIVERTED','ORIGIN_CITY','DEST_CITY','CRS_DEP_Datetime'])
atl_y = atl_data['15_DELAYED']

In [30]:
atl_X_train, atl_X_test, atl_y_train, atl_y_test = train_test_split(atl_X, atl_y, test_size=0.2, random_state=44)


In [31]:
pipeline.fit(atl_X_train, atl_y_train)
y_pred = pipeline.predict(atl_X_test)

print(classification_report(atl_y_test,atl_y_pred))

              precision    recall  f1-score   support

           0       0.83      0.98      0.90     32423
           1       0.58      0.14      0.22      7405

    accuracy                           0.82     39828
   macro avg       0.71      0.56      0.56     39828
weighted avg       0.79      0.82      0.77     39828



In [33]:
model = RandomForestClassifier(n_estimators=1000, class_weight='balanced', max_depth=None, n_jobs=-1, random_state=44)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.98      0.90     32423
           1       0.58      0.14      0.22      7405

    accuracy                           0.82     39828
   macro avg       0.71      0.56      0.56     39828
weighted avg       0.79      0.82      0.77     39828



Collecting imblearn
  Obtaining dependency information for imblearn from https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl.metadata
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/6e/1b/66764ecf370e797759be06fb38c11640ea1f9456ae10f8640a33cfa726ea/imbalanced_learn-0.12.2-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.12.2-py3-none-any.whl.metadata (8.2 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.0/258.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.2 imblearn-0.0

[

KeyboardInterrupt: 