In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

from collections import Counter

print("worked")


worked


In [9]:
df = pd.read_csv("new_data.csv")

In [4]:
df['combined_datetime'] = pd.to_datetime(df['combined_datetime'])
df['hour_of_day'] = df['combined_datetime'].dt.hour
df['day_of_week'] = df['combined_datetime'].dt.dayofweek
df['month_of_year'] = df['combined_datetime'].dt.month
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 17 columns):
 #   Column             Dtype         
---  ------             -----         
 0   AIRLINE            object        
 1   FL_NUMBER          int64         
 2   ORIGIN             object        
 3   ORIGIN_CITY        object        
 4   DEST               object        
 5   DEST_CITY          object        
 6   DEP_DELAY          float64       
 7   ARR_DELAY          float64       
 8   CANCELLED          float64       
 9   DIVERTED           float64       
 10  DISTANCE           float64       
 11  DELAYED            int64         
 12  combined_datetime  datetime64[ns]
 13  FL_DATETIME        object        
 14  hour_of_day        int32         
 15  day_of_week        int32         
 16  month_of_year      int32         
dtypes: datetime64[ns](1), float64(5), int32(3), int64(2), object(6)
memory usage: 354.8+ MB


In [10]:

def replace_rare_categories(df, threshold=0.01, replacement='Other'):
    
    df_copy = df.copy()
    total_rows = len(df_copy)
    
    # Get categorical columns
    cat_cols = df_copy.select_dtypes(include=['object']).columns
    cat_cols = cat_cols.drop(["AIRLINE"])
    
    for column in cat_cols:
        # Count occurrences of each category
        counts = Counter(df_copy[column])
        
        # Find rare categories
        rare_categories = {cat for cat, freq in counts.items() if freq / total_rows < threshold}
        
        # Replace rare categories with 'replacement'
        df_copy[column] = df_copy[column].apply(lambda x: replacement if x in rare_categories else x)
    
    return df_copy


In [11]:
X = df.drop(columns=['ORIGIN_CITY',"DEST_CITY",'DELAYED','FL_NUMBER','DEP_DELAY','ARR_DELAY','CANCELLED','DIVERTED','combined_datetime','FL_DATETIME','ORIGIN_CITY','DEST_CITY'])
y = df['DELAYED']

numerical_cols = ['DISTANCE','hour_of_day','day_of_week','month_of_year']
categorical_cols = ['AIRLINE','ORIGIN','DEST']

# Replace infrequent categories with 'Other'
df_processed = replace_rare_categories(X)

df_processed.head()


X_train, X_test, y_train, y_test = train_test_split(df_processed, y, test_size=0.2, random_state=44)



In [12]:



numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Pipeline for preprocessing categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])
model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)])  # Logistic regression model


In [None]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test,y_pred))


In [None]:
#Not bad but underperforms on delayed class, forgot to add class balancing 

In [None]:
model = RandomForestClassifier(n_estimators=100, class_weight='balanced',max_depth=None, random_state=0)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)])  # Logistic regression model

In [None]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test,y_pred))


In [13]:
# I realized that I did not include weather data, lets include that 

w_df = pd.read_csv("rileys_data.csv")

X = w_df.drop(columns=['ORIGIN_CITY',"DEST_CITY",'15_DELAYED','DEP_DELAY','ARR_DELAY','CRS_ARR_TIME',
                     'CANCELLED','DIVERTED','ORIGIN_CITY','DEST_CITY','CRS_DEP_Datetime'])
y = y = w_df['15_DELAYED']
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1255864 entries, 0 to 1255863
Data columns (total 21 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   AIRLINE      1255864 non-null  object 
 1   ORIGIN       1255864 non-null  object 
 2   DEST         1255864 non-null  object 
 3   DISTANCE     1255864 non-null  float64
 4   ori_TMIN     1255864 non-null  float64
 5   ori_TMAX     1255864 non-null  float64
 6   ori_SNOW     1255864 non-null  float64
 7   ori_SNWD     1255864 non-null  float64
 8   ori_AWND     1255864 non-null  float64
 9   ori_PRCP     1255864 non-null  float64
 10  dest_TMIN    1255864 non-null  float64
 11  dest_TMAX    1255864 non-null  float64
 12  dest_SNOW    1255864 non-null  float64
 13  dest_SNWD    1255864 non-null  float64
 14  dest_AWND    1255864 non-null  float64
 15  dest_PRCP    1255864 non-null  float64
 16  year         1255864 non-null  int64  
 17  month        1255864 non-null  int64  
 18  da

In [14]:
numerical_cols = ['DISTANCE','ori_TMIN','ori_TMAX','ori_SNOW','ori_SNWD','ori_AWND','ori_PRCP','dest_TMIN','dest_TMAX','dest_SNOW','dest_SNWD',
    'dest_AWND','dest_PRCP','year','hour','day_of_week','month','day']
categorical_cols = ['AIRLINE','ORIGIN','DEST']

In [16]:
def subsample_dataframe(df, fraction=0.5, random_state=None):
    """
    Subsample a DataFrame by randomly selecting a fraction of rows.
    
    Parameters:
        df (DataFrame): The input DataFrame.
        fraction (float): The fraction of rows to select. Should be between 0 and 1. Default is 0.5 (50%).
        random_state (int): Seed for the random number generator. Default is None.
    
    Returns:
        DataFrame: A subsampled DataFrame containing a fraction of the original rows.
    """
    return df.sample(frac=fraction, random_state=random_state)

In [17]:
w_df_processed = replace_rare_categories(X)

w_df_proc_subsampled = subsample_dataframe(w_df_processed, fraction=0.5, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(w_df_processed, y, test_size=0.2, random_state=44)

X_train.head()


Unnamed: 0,AIRLINE,ORIGIN,DEST,DISTANCE,ori_TMIN,ori_TMAX,ori_SNOW,ori_SNWD,ori_AWND,ori_PRCP,...,dest_TMAX,dest_SNOW,dest_SNWD,dest_AWND,dest_PRCP,year,month,day,day_of_week,hour
698408,Alaska Airlines Inc.,SEA,LAS,867.0,139.0,261.0,0.0,0.0,23.0,0.0,...,472.0,0.0,0.0,21.0,0.0,2021,7,10,5,15
779197,PSA Airlines Inc.,CLT,Other,651.0,228.0,333.0,0.0,0.0,32.0,163.0,...,228.0,0.0,0.0,46.0,0.0,2019,7,22,0,16
815988,Republic Airline,JFK,RDU,427.0,106.0,194.0,0.0,0.0,68.0,0.0,...,189.0,0.0,0.0,29.0,0.0,2022,10,27,3,11
49229,United Air Lines Inc.,EWR,Other,628.0,-82.0,44.0,0.0,0.0,38.0,0.0,...,150.0,0.0,0.0,22.0,0.0,2022,1,12,2,14
232737,Southwest Airlines Co.,STL,MCI,237.0,61.0,111.0,0.0,0.0,33.0,0.0,...,139.0,0.0,0.0,45.0,0.0,2022,11,28,0,22


In [18]:
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', max_depth=None, n_jobs=-1, random_state=0)

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Pipeline for preprocessing categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)])  # Logistic regression model

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.81      0.97      0.88    196021
           1       0.63      0.16      0.26     55152

    accuracy                           0.80    251173
   macro avg       0.72      0.57      0.57    251173
weighted avg       0.77      0.80      0.74    251173



In [11]:
features = pipeline[:-1].get_feature_names_out()
len(features)

47

In [14]:
importances = model.feature_importances_
imp_df = pd.DataFrame({'Feature':features,'Importance':importances})
imp_df = imp_df.sort_values(by='Importance', ascending=False)
imp_df

Unnamed: 0,Feature,Importance
14,num__hour,0.097692
0,num__DISTANCE,0.084489
11,num__dest_AWND,0.074918
5,num__ori_AWND,0.073467
8,num__dest_TMAX,0.0711
2,num__ori_TMAX,0.070996
7,num__dest_TMIN,0.070646
1,num__ori_TMIN,0.07011
17,num__day,0.06352
15,num__day_of_week,0.039934


In [1]:
# feature importances are all very low, maybe the 'other' is resulting in losing too much data
#try with different threshold?
threshold = 0.01

w_df_processed = replace_infrequent_categories(X, categorical_cols, threshold = 0.01)
X_train, X_test, y_train, y_test = train_test_split(w_df_processed, y, test_size=0.2, random_state=44)

w_df_processed.head()






NameError: name 'replace_infrequent_categories' is not defined

In [None]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test,y_pred))

In [18]:
# identifcal score, I should have more columns though? 
features = pipeline[:-1].get_feature_names_out()
len(features)

47

In [None]:
#feature importance
features = pipeline[:-1].get_feature_names_out()
importances = model.coef_[0]
imp_df = pd.DataFrame({'Feature':features,'Importance':importances})
imp_df = imp_df.sort_values(by='Importance', ascending=False)
imp_df.head(15)