In [17]:
#%pip install xgboost
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from collections import Counter

import joblib

In [11]:
def replace_rare_categories(df, threshold=0.001, replacement='Other'):
    
    df_copy = df.copy()
    total_rows = len(df_copy)
    
    # Get categorical columns
    cat_cols = df_copy.select_dtypes(include=['object']).columns
    cat_cols = cat_cols.drop(["AIRLINE"])
    
    for column in cat_cols:
        # Count occurrences of each category
        counts = Counter(df_copy[column])
        
        # Find rare categories
        rare_categories = {cat for cat, freq in counts.items() if freq / total_rows < threshold}
        
        # Replace rare categories with 'replacement'
        df_copy[column] = df_copy[column].apply(lambda x: replacement if x in rare_categories else x)
    
    return df_copy


In [5]:
numerical_cols = ['DISTANCE','ori_TMIN','ori_TMAX','ori_SNOW','ori_SNWD','ori_AWND','ori_PRCP','dest_TMIN','dest_TMAX','dest_SNOW','dest_SNWD',
    'dest_AWND','dest_PRCP','year','hour','day_of_week','month','day']
categorical_cols = ['AIRLINE','ORIGIN','DEST']

In [21]:
w_df = pd.read_csv("rileys_data.csv")

X = w_df.drop(columns=['ORIGIN_CITY',"DEST_CITY",'15_DELAYED','DEP_DELAY','ARR_DELAY','CRS_ARR_TIME',
                     'CANCELLED','DIVERTED','ORIGIN_CITY','DEST_CITY','CRS_DEP_Datetime'])

X.to_csv('input2.csv')
y = w_df['15_DELAYED']
X.head()


Unnamed: 0,AIRLINE,ORIGIN,DEST,DISTANCE,ori_TMIN,ori_TMAX,ori_SNOW,ori_SNWD,ori_AWND,ori_PRCP,...,dest_TMAX,dest_SNOW,dest_SNWD,dest_AWND,dest_PRCP,year,month,day,day_of_week,hour
0,Spirit Air Lines,MCO,DFW,985.0,117.0,256.0,0.0,0.0,32.0,0.0,...,156.0,0.0,0.0,55.0,8.0,2020,2,23,6,18
1,Delta Air Lines Inc.,ATL,BDL,859.0,222.0,294.0,0.0,0.0,35.0,122.0,...,250.0,0.0,0.0,30.0,0.0,2021,6,11,4,18
2,American Airlines Inc.,RDU,DFW,1061.0,250.0,333.0,0.0,0.0,40.0,0.0,...,328.0,0.0,0.0,45.0,0.0,2020,7,31,4,12
3,Delta Air Lines Inc.,BDL,ATL,859.0,39.0,67.0,0.0,0.0,35.0,3.0,...,178.0,0.0,0.0,36.0,0.0,2019,11,20,2,6
4,Southwest Airlines Co.,BWI,BDL,283.0,67.0,189.0,0.0,0.0,20.0,56.0,...,211.0,0.0,0.0,26.0,0.0,2022,5,1,6,17


In [13]:
w_df_processed = replace_rare_categories(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1004691 entries, 698408 to 156845
Data columns (total 21 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   AIRLINE      1004691 non-null  object 
 1   ORIGIN       1004691 non-null  object 
 2   DEST         1004691 non-null  object 
 3   DISTANCE     1004691 non-null  float64
 4   ori_TMIN     1004691 non-null  float64
 5   ori_TMAX     1004691 non-null  float64
 6   ori_SNOW     1004691 non-null  float64
 7   ori_SNWD     1004691 non-null  float64
 8   ori_AWND     1004691 non-null  float64
 9   ori_PRCP     1004691 non-null  float64
 10  dest_TMIN    1004691 non-null  float64
 11  dest_TMAX    1004691 non-null  float64
 12  dest_SNOW    1004691 non-null  float64
 13  dest_SNWD    1004691 non-null  float64
 14  dest_AWND    1004691 non-null  float64
 15  dest_PRCP    1004691 non-null  float64
 16  year         1004691 non-null  int64  
 17  month        1004691 non-null  int64  
 18  day

In [16]:
class_counts = y_train.value_counts()
ratio = class_counts[0] / class_counts[1]

model = model = XGBClassifier(n_estimators=100, objective='binary:logistic', scale_pos_weight = ratio,random_state=0, n_jobs=-1)

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Pipeline for preprocessing categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)])  # KNN

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.69      0.77    196021
           1       0.37      0.65      0.48     55152

    accuracy                           0.68    251173
   macro avg       0.62      0.67      0.62    251173
weighted avg       0.77      0.68      0.71    251173



In [15]:
feature_importances = model.feature_importances_
feature_names = X_train.columns  # Assuming you have access to the column names
feature_importance_dict = dict(zip(feature_names, feature_importances))
sorted_feature_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print("Feature Importances:")
for feature, importance in sorted_feature_importances:
    print(f"{feature}: {importance}")

Feature Importances:
dest_AWND: 0.0341116264462471
dest_SNWD: 0.030365796759724617
DISTANCE: 0.021400900557637215
ori_SNOW: 0.019303273409605026
year: 0.015069186687469482
day_of_week: 0.014975691214203835
dest_SNOW: 0.01400858722627163
AIRLINE: 0.009763914160430431
hour: 0.00900306086987257
DEST: 0.007442935835570097
ori_SNWD: 0.007199558895081282
ORIGIN: 0.00673298817127943
dest_PRCP: 0.006184051278978586
month: 0.005846087820827961
day: 0.004676729440689087
ori_PRCP: 0.004166899714618921
dest_TMAX: 0.003820377169176936
ori_AWND: 0.0037826013285666704
ori_TMIN: 0.003774924436584115
ori_TMAX: 0.003766418667510152
dest_TMIN: 0.002789936261251569


In [18]:
joblib.dump(pipeline, 'xgboost_pipe.joblib')

['xgboost_pipe.joblib']