# Passenger Satisfaction Classification
This notebook trains several classifiers to predict airline passenger satisfaction.

In [None]:
!pip install -r requirements.txt

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

## Load the data

In [None]:
# Locations of the two CSV files, relative to the notebook's working directory.
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'

# Both files get the same cleanup: discard the leading column by position,
# then remove the 'id' column by name.
train_df, test_df = (
    pd.read_csv(csv_path).iloc[:, 1:].drop(columns=['id'])
    for csv_path in (TRAIN_PATH, TEST_PATH)
)

### Inspect missing values

In [None]:
# Per-column NaN counts for the training frame; only the columns that
# actually contain gaps are printed.
missing = train_df.isnull().sum()
print(missing.loc[missing.gt(0)])

## Descriptive statistics

In [None]:
# Class balance of the target column.
class_counts = train_df['satisfaction'].value_counts()
print(class_counts)

# Summary statistics over all columns; kept as the cell's final expression
# so the notebook renders the resulting table.
train_df.describe(include='all')

## Train/validation split

In [None]:
# Split configuration: fraction held out for validation and the seed that
# makes the split reproducible.
test_size = 0.2
random_state = 42

# The target is the 'satisfaction' column; every other column is a feature.
y = train_df['satisfaction']
X = train_df.drop(columns='satisfaction')

# Stratify on the labels so both partitions keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
    stratify=y,
)

## Build preprocessing and modeling pipelines

In [None]:
# Local import: `clone` is only needed in this cell, to give each modeling
# pipeline its own preprocessor instance.
from sklearn.base import clone

# Columns treated as continuous; every other feature column is handled as
# categorical.
numeric_features = ['Age', 'Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
categorical_features = [col for col in X.columns if col not in numeric_features]

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

log_reg = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000)),
])
# A Pipeline without caching fits its steps in place, so two pipelines that
# share one transformer object would overwrite each other's fitted state when
# fit directly. The random forest therefore gets an independent, unfitted copy.
rf = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', RandomForestClassifier(random_state=random_state)),
])

### Cross-validation

In [None]:
# 5-fold cross-validated accuracy of each candidate pipeline on the training
# split, reported as mean +- standard deviation across folds.
candidates = {'LogisticRegression': log_reg, 'RandomForest': rf}
for name, model in candidates.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    mean_acc, std_acc = scores.mean(), scores.std()
    print(name, f'CV accuracy: {mean_acc:.4f} +- {std_acc:.4f}')

### Hyperparameter tuning

In [None]:
# Values of C to try for the pipeline's 'model' step (the logistic regression).
param_grid = {'model__C': [0.1, 1.0, 10.0]}

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy.
search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
search.fit(X_train, y_train)

best_params, best_score = search.best_params_, search.best_score_
print('Best parameters:', best_params)
print('Best CV accuracy:', best_score)

## Train final model and evaluate

In [None]:
# GridSearchCV refits the best configuration on all of X_train by default
# (refit=True), so best_estimator_ is already trained and needs no second fit.
best_model = search.best_estimator_

# Score the tuned pipeline on the held-out validation split.
y_pred = best_model.predict(X_val)
print('Validation accuracy:', accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

### Evaluate on test.csv

In [None]:
# Separate labels from features in the frame loaded from test.csv.
y_test = test_df['satisfaction']
X_test = test_df.drop(columns='satisfaction')

# Score the fitted model on the test file.
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy:', test_accuracy)

## Save the trained model

In [None]:
# Persist the fitted model to disk so it can be reloaded later without
# retraining.
joblib.dump(value=best_model, filename='best_model.joblib')

## Predict on new data

In [None]:
def predict_new(csv_path, model_path='best_model.joblib'):
    """Load a saved model and return its predictions for the rows of a CSV file.

    The CSV gets the same cleanup as the training files: the first column is
    discarded by position, and an ``id`` column is removed when present.

    Args:
        csv_path: Path of the CSV file holding the rows to score.
        model_path: Path of a model file written with ``joblib.dump``.

    Returns:
        Whatever ``model.predict`` returns for the cleaned frame.
    """
    # NOTE: joblib files are pickle-based, so loading one can execute
    # arbitrary code. Only pass a model_path that comes from a trusted source.
    model = joblib.load(model_path)
    features = pd.read_csv(csv_path).iloc[:, 1:]
    # New data may not carry an 'id' column; tolerate its absence instead of
    # raising KeyError.
    features = features.drop(columns=['id'], errors='ignore')
    return model.predict(features)

# Example usage:
# preds = predict_new('some_new_data.csv')