# Passenger Satisfaction Classification
This notebook uses pandas and scikit-learn to train and evaluate three classifiers (logistic regression, random forest, and a support vector machine) that predict airline passenger satisfaction.

In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

## Load the data

In [None]:
# Build the working frame in a single chain so re-running the cell always
# starts from the file on disk:
#   1. read the CSV,
#   2. discard the first column by position,
#   3. discard the 'id' column by name,
#   4. replace missing arrival delays with 0.
arrival_delay_col = 'Arrival Delay in Minutes'
train_df = (
    pd.read_csv('train.csv')
    .iloc[:, 1:]
    .drop(columns=['id'])
    .assign(**{arrival_delay_col: lambda frame: frame[arrival_delay_col].fillna(0)})
)
train_df.head()

## Descriptive statistics

In [None]:
# Compute the summary values first, then report them.
mean_age = train_df['Age'].mean()
satisfaction_counts = train_df['satisfaction'].value_counts()

print('Average age:', mean_age)
print('Satisfaction counts:')
print(satisfaction_counts)

# Last expression of the cell: rendered as the rich summary table.
train_df.describe()

## Train/test split

In [None]:
# Target is the 'satisfaction' column; every other column is a candidate predictor.
y = train_df['satisfaction']
X = train_df.drop(columns='satisfaction')

# Columns from the original script.
# numeric_features are standard-scaled and categorical_features are one-hot
# encoded by the preprocessing pipeline.
numeric_features = [
    'Age',
    'Flight Distance',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]

categorical_features = [
    'Gender',
    'Customer Type',
    'Type of Travel',
    'Class',
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]

# Hold out 20% of the rows, stratified on the target, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Build the preprocessing and modeling pipeline

In [None]:
# Per-column preprocessing: scale the numeric columns, one-hot encode the
# categorical ones. handle_unknown='ignore' keeps prediction from failing on a
# category that was not present when the encoder was fitted.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Baseline model: preprocessing followed by logistic regression.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000)),
])

## Cross-validation

In [None]:
# 5-fold cross-validation of the logistic-regression pipeline.
# NOTE(review): both calls use the full X and y, i.e. they include the rows
# that train_test_split placed in X_test.
accuracy_scores = cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring='accuracy')
print('CV accuracy:', accuracy_scores.mean())

# Second pass collects macro-averaged precision and recall per fold.
cv_results = cross_validate(
    estimator=clf,
    X=X,
    y=y,
    cv=5,
    scoring=['precision_macro', 'recall_macro'],
)
for metric_label, result_key in (
    ('precision', 'test_precision_macro'),
    ('recall', 'test_recall_macro'),
):
    print(f'CV {metric_label}:', cv_results[result_key].mean())

## Train the model

In [None]:
# Fit the preprocessing + logistic-regression pipeline on the training split only.
clf.fit(X=X_train, y=y_train)

## Evaluate

In [None]:
# Score the fitted logistic-regression pipeline on the held-out split.
y_pred = clf.predict(X_test)

logreg_accuracy = accuracy_score(y_test, y_pred)
logreg_report = classification_report(y_test, y_pred)

print('Accuracy:', logreg_accuracy)
print(logreg_report)

## Random Forest

In [None]:
# Random forest with the same preprocessing configuration as the baseline.
# Pipeline fits its steps in place, so passing the shared `preprocessor`
# object would make fitting this pipeline refit the transformer held by any
# other pipeline built from it. clone() gives this pipeline its own unfitted
# copy with identical parameters.
rf_clf = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))
])
rf_clf.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_rf = rf_clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

## Support Vector Machine

In [None]:
# Support vector classifier (default SVC settings) with the same preprocessing
# configuration as the baseline.
# Pipeline fits its steps in place, so passing the shared `preprocessor`
# object would make fitting this pipeline refit the transformer held by any
# other pipeline built from it. clone() gives this pipeline its own unfitted
# copy with identical parameters.
svm_clf = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', SVC())
])
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples; this cell may be slow if the
# training split is large. Confirm the runtime is acceptable.
svm_clf.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_svm = svm_clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

### Summary
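The cross-validation section reports 5-fold results for logistic regression: mean accuracy (from `cross_val_score`) and mean macro-averaged precision and recall (from `cross_validate`). Logistic regression, Random Forest, and SVM are each also evaluated on the held-out 20% test split, where accuracy and a full classification report are printed.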