In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter

In [6]:
def replace_rare_categories(df, threshold=0.05, replacement='Other', exclude=('AIRLINE',)):
    """Collapse infrequent values of object-dtype columns into a single label.

    For every object-dtype column that is not listed in ``exclude``, each value
    whose share of the rows is strictly below ``threshold`` is replaced by
    ``replacement``. The input frame itself is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    threshold : float, default 0.05
        Minimum fraction of rows a value needs in order to be kept.
    replacement : object, default 'Other'
        Label written in place of every rare value.
    exclude : str, iterable of str, or None, default ('AIRLINE',)
        Column names to leave untouched. Names that are not present in ``df``
        are ignored, so the function also works on frames without them.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with rare values replaced.
    """
    if exclude is None:
        excluded = set()
    elif isinstance(exclude, str):
        # A bare string would otherwise be iterated character by character.
        excluded = {exclude}
    else:
        excluded = set(exclude)

    df_copy = df.copy()
    total_rows = len(df_copy)

    # Only object-dtype columns are candidates; skip the excluded ones.
    cat_cols = [
        name for name in df_copy.select_dtypes(include=['object']).columns
        if name not in excluded
    ]

    for column in cat_cols:
        values = df_copy[column]

        # dropna=False: missing values are counted like any other category.
        shares = values.value_counts(dropna=False) / total_rows
        rare_categories = shares.index[(shares < threshold).to_numpy()]
        if len(rare_categories) == 0:
            continue

        # Vectorised replacement instead of a Python-level apply per element.
        df_copy[column] = values.mask(values.isin(rare_categories), replacement)

    return df_copy


In [3]:
# Columns handed to the 'num' branch of the preprocessing ColumnTransformer.
# The ori_* / dest_* names share the same six measurement suffixes, so they are
# generated rather than spelled out; the resulting order matches the explicit list.
numerical_cols = [
    'DISTANCE',
    *(
        f'{prefix}_{measure}'
        for prefix in ('ori', 'dest')
        for measure in ('TMIN', 'TMAX', 'SNOW', 'SNWD', 'AWND', 'PRCP')
    ),
    'year',
    'hour',
    'day_of_week',
    'month',
    'day',
]

# Columns handed to the 'cat' branch of the preprocessing ColumnTransformer.
categorical_cols = [
    'AIRLINE',
    'ORIGIN',
    'DEST',
]

In [4]:
w_df = pd.read_csv("rileys_data.csv")

# Build the feature matrix by removing the label ('15_DELAYED') and the other
# columns that are not used as model inputs. Each label is listed exactly once.
# NOTE(review): DEP_DELAY / ARR_DELAY / CANCELLED / DIVERTED presumably describe
# the flight's outcome and would leak the label if kept — confirm.
X = w_df.drop(columns=[
    'ORIGIN_CITY',
    'DEST_CITY',
    '15_DELAYED',
    'DEP_DELAY',
    'ARR_DELAY',
    'CRS_ARR_TIME',
    'CANCELLED',
    'DIVERTED',
    'CRS_DEP_Datetime',
])

# Binary target predicted by the classifier further down.
y = w_df['15_DELAYED']
X.head()


Unnamed: 0,AIRLINE,ORIGIN,DEST,DISTANCE,ori_TMIN,ori_TMAX,ori_SNOW,ori_SNWD,ori_AWND,ori_PRCP,...,dest_TMAX,dest_SNOW,dest_SNWD,dest_AWND,dest_PRCP,year,month,day,day_of_week,hour
0,Spirit Air Lines,MCO,DFW,985.0,117.0,256.0,0.0,0.0,32.0,0.0,...,156.0,0.0,0.0,55.0,8.0,2020,2,23,6,18
1,Delta Air Lines Inc.,ATL,BDL,859.0,222.0,294.0,0.0,0.0,35.0,122.0,...,250.0,0.0,0.0,30.0,0.0,2021,6,11,4,18
2,American Airlines Inc.,RDU,DFW,1061.0,250.0,333.0,0.0,0.0,40.0,0.0,...,328.0,0.0,0.0,45.0,0.0,2020,7,31,4,12
3,Delta Air Lines Inc.,BDL,ATL,859.0,39.0,67.0,0.0,0.0,35.0,3.0,...,178.0,0.0,0.0,36.0,0.0,2019,11,20,2,6
4,Southwest Airlines Co.,BWI,BDL,283.0,67.0,189.0,0.0,0.0,20.0,56.0,...,211.0,0.0,0.0,26.0,0.0,2022,5,1,6,17


In [7]:
# Collapse rare values in the object-dtype feature columns
# (the helper leaves AIRLINE as-is by default).
# NOTE(review): this runs on the full feature matrix before the split, so the
# rows that end up in the test set also influence which values count as rare.
w_df_processed = replace_rare_categories(X)

X_train, X_test, y_train, y_test = train_test_split(
    w_df_processed,
    y,
    test_size=0.2,    # hold out 20% of the rows for evaluation
    random_state=44,  # fixed seed so the split is reproducible
)

X_train.head()

Unnamed: 0,AIRLINE,ORIGIN,DEST,DISTANCE,ori_TMIN,ori_TMAX,ori_SNOW,ori_SNWD,ori_AWND,ori_PRCP,...,dest_TMAX,dest_SNOW,dest_SNWD,dest_AWND,dest_PRCP,year,month,day,day_of_week,hour
698408,Alaska Airlines Inc.,Other,Other,867.0,139.0,261.0,0.0,0.0,23.0,0.0,...,472.0,0.0,0.0,21.0,0.0,2021,7,10,5,15
779197,PSA Airlines Inc.,CLT,Other,651.0,228.0,333.0,0.0,0.0,32.0,163.0,...,228.0,0.0,0.0,46.0,0.0,2019,7,22,0,16
815988,Republic Airline,Other,Other,427.0,106.0,194.0,0.0,0.0,68.0,0.0,...,189.0,0.0,0.0,29.0,0.0,2022,10,27,3,11
49229,United Air Lines Inc.,Other,Other,628.0,-82.0,44.0,0.0,0.0,38.0,0.0,...,150.0,0.0,0.0,22.0,0.0,2022,1,12,2,14
232737,Southwest Airlines Co.,Other,Other,237.0,61.0,111.0,0.0,0.0,33.0,0.0,...,139.0,0.0,0.0,45.0,0.0,2022,11,28,0,22


In [15]:
# Numeric branch: standardize every column listed in numerical_cols.
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encode the columns listed in categorical_cols.
# handle_unknown='ignore' lets transform() accept categories that were not
# present when the encoder was fitted instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# k-nearest-neighbours classifier with k = 3.
model = KNeighborsClassifier(n_neighbors=3)

# Preprocessing and classifier chained so both are fitted on the training data only.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),  # KNN
    ]
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85    196021
           1       0.41      0.28      0.33     55152

    accuracy                           0.75    251173
   macro avg       0.61      0.58      0.59    251173
weighted avg       0.73      0.75      0.74    251173



array(['num__DISTANCE', 'num__ori_TMIN', 'num__ori_TMAX', 'num__ori_SNOW',
       'num__ori_SNWD', 'num__ori_AWND', 'num__ori_PRCP',
       'num__dest_TMIN', 'num__dest_TMAX', 'num__dest_SNOW',
       'num__dest_SNWD', 'num__dest_AWND', 'num__dest_PRCP', 'num__year',
       'num__hour', 'num__day_of_week', 'num__month', 'num__day',
       'cat__AIRLINE_Alaska Airlines Inc.', 'cat__AIRLINE_Allegiant Air',
       'cat__AIRLINE_American Airlines Inc.',
       'cat__AIRLINE_Delta Air Lines Inc.',
       'cat__AIRLINE_Endeavor Air Inc.', 'cat__AIRLINE_Envoy Air',
       'cat__AIRLINE_ExpressJet Airlines LLC d/b/a aha!',
       'cat__AIRLINE_Frontier Airlines Inc.', 'cat__AIRLINE_Horizon Air',
       'cat__AIRLINE_JetBlue Airways', 'cat__AIRLINE_Mesa Airlines Inc.',
       'cat__AIRLINE_PSA Airlines Inc.', 'cat__AIRLINE_Republic Airline',
       'cat__AIRLINE_SkyWest Airlines Inc.',
       'cat__AIRLINE_Southwest Airlines Co.',
       'cat__AIRLINE_Spirit Air Lines',
       'cat__AIRLINE_Uni

AttributeError: 'KNeighborsClassifier' object has no attribute 'coef_'