In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix, parallel_coordinates, andrews_curves, radviz
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# --------------------------
# User-editable path
DATA_PATH = "/content/ncr_ride_bookings.csv"   # <- change to your file path, e.g. "/content/ncr_ride_bookings.csv"
# --------------------------

# 1) Dataset Preparation
df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head())1

# auto-detect target column: prefer common names, else take last column
possible_targets = ['target', 'class', 'label', 'Type']
target_col = None
for c in possible_targets:
    if c in df.columns:
        target_col = c
        break
if target_col is None:
    # fallback: if last column is non-numeric or low cardinality, use it; else still pick last column
    last_col = df.columns[-1]
    target_col = last_col
print("Using target column:", target_col)

# If target is numeric but small number of unique values, keep as categorical
if pd.api.types.is_numeric_dtype(df[target_col]) and df[target_col].nunique() <= 10:
    df[target_col] = df[target_col].astype(str)

# select numeric features (exclude target)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target_col in numeric_cols:
    numeric_cols.remove(target_col)

# Remove columns causing issues in scatter_matrix
cols_to_remove = ['Cancelled Rides by Customer', 'Cancelled Rides by Driver', 'Incomplete Rides']
numeric_cols_filtered = [col for col in numeric_cols if col not in cols_to_remove]


# For readability/selecting first N features
N_scatter_matrix = min(5, len(numeric_cols_filtered))
features_for_scatter = numeric_cols_filtered[:N_scatter_matrix]

# If not enough numeric features, pick some non-numeric converted ones (rare)
print("Numeric features available:", numeric_cols_filtered)
print("Using these for scatter_matrix (first {}): {}".format(N_scatter_matrix, features_for_scatter))

# 2) Scatterplot Matrix (pairwise relationships among first 5 features)
plt.figure(figsize=(12, 10))
sm = scatter_matrix(df[features_for_scatter + [target_col]], figsize=(12,12), diagonal='kde', alpha=0.6, marker='o')
# color points by class in off-diagonal scatterplots: do manual coloring if target categorical
if df[target_col].dtype == object or df[target_col].nunique() <= 12:
    classes = df[target_col].unique()
    palette = sns.color_palette("tab10", n_colors=len(classes))
    color_map = {c: palette[i] for i, c in enumerate(classes)}
    # recolor scatter subplots
    for i, j in zip(*np.triu_indices_from(sm, k=1)):
        pass  # nothing: scatter_matrix already plotted; advanced recolor would require replotting each ax
plt.suptitle("Scatterplot Matrix (first {} numeric features)".format(N_scatter_matrix), fontsize=16)
plt.show()

# 3) Parallel Coordinates Plot (first 6 features)
max_parallel = min(6, len(numeric_cols_filtered))
cols_parallel = numeric_cols_filtered[:max_parallel] + [target_col]
# For better visualization scale numeric features (z-score)
df_parallel = df[cols_parallel].copy()
scaler = StandardScaler()
df_parallel[numeric_cols_filtered[:max_parallel]] = scaler.fit_transform(df_parallel[numeric_cols_filtered[:max_parallel]])
plt.figure(figsize=(14,6))
parallel_coordinates(df_parallel, class_column=target_col, colormap=plt.get_cmap("tab10"), alpha=0.6)
plt.title("Parallel Coordinates Plot (first {} numeric features)".format(max_parallel))
plt.ylabel("Standardized value")
plt.xticks(rotation=20)
plt.legend(bbox_to_anchor=(1.02,1), loc='upper left')
plt.show()

# 4) Correlation Heatmap
plt.figure(figsize=(12,10))
corr = df[numeric_cols_filtered].corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar_kws={'shrink':0.7})
plt.title("Correlation heatmap (numeric features)")
plt.show()

# 5) PCA (reduce to 2 components) + 2D scatter to check class separability
# Use standardized numeric features
X = df[numeric_cols_filtered].copy().dropna()
# Keep same indices to map back target
target_for_pca = df.loc[X.index, target_col].astype(str)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
explained = pca.explained_variance_ratio_

plt.figure(figsize=(10,7))
unique_classes = sorted(target_for_pca.unique())
palette = sns.color_palette("tab10", n_colors=max(3, len(unique_classes)))
for i, cls in enumerate(unique_classes):
    mask = target_for_pca == cls
    plt.scatter(X_pca[mask,0], X_pca[mask,1], label=str(cls), alpha=0.6, s=30)
plt.xlabel(f"PCA 1 ({explained[0]*100:.1f}% var)")
plt.ylabel(f"PCA 2 ({explained[1]*100:.1f}% var)")
plt.title("PCA (2 components) - projection")
plt.legend(bbox_to_anchor=(1.02,1), loc='upper left')
plt.grid(True)
plt.show()

# 6) Stacked Bar Chart: group by target class, mean of first 6 numeric features
mean_df = df.groupby(target_col)[numeric_cols_filtered[:6]].mean()
mean_df.plot(kind='bar', stacked=True, figsize=(12,7), colormap='tab20')
plt.title("Stacked Bar Chart of mean feature values by class (first 6 numeric features)")
plt.ylabel("Mean (original units)")
plt.legend(bbox_to_anchor=(1.02,1), loc='upper left')
plt.show()

# 7) Andrews Curves
plt.figure(figsize=(12,6))
# For andrews_curves skip rows with missing values in selected columns
cols_andrews = [target_col] + numeric_cols_filtered[:6]
andrews_curves(df[cols_andrews].dropna(), class_column=target_col, colormap='tab10', alpha=0.6)
plt.title("Andrews Curves (first 6 numeric features)")
plt.legend(bbox_to_anchor=(1.02,1), loc='upper left')
plt.show()

# 8) Radviz plot
plt.figure(figsize=(10,10))
cols_radviz = [target_col] + numeric_cols_filtered[:6]
radviz(df[cols_radviz].dropna(), class_column=target_col, colormap='tab10')
plt.title("Radviz (first 6 numeric features)")
plt.show()

# Optional: print PCA component loadings for interpretability
pca_components = pd.DataFrame(pca.components_.T, index=numeric_cols_filtered, columns=['PC1', 'PC2'])
print("\nPCA component loadings (top features by absolute value):")
display(pca_components.apply(lambda s: s.abs()).sort_values('PC1', ascending=False).head(6))

SyntaxError: invalid syntax (ipython-input-1694897089.py, line 17)

In [None]:
df.describe()

Unnamed: 0,Avg VTAT,Avg CTAT,Cancelled Rides by Customer,Cancelled Rides by Driver,Incomplete Rides,Booking Value,Ride Distance,Driver Ratings,Customer Rating
count,139500.0,102000.0,10500.0,27000.0,9000.0,102000.0,102000.0,93000.0,93000.0
mean,8.456352,29.149636,1.0,1.0,1.0,508.295912,24.637012,4.230992,4.404584
std,3.773564,8.902577,0.0,0.0,0.0,395.805774,14.002138,0.436871,0.437819
min,2.0,10.0,1.0,1.0,1.0,50.0,1.0,3.0,3.0
25%,5.3,21.6,1.0,1.0,1.0,234.0,12.46,4.1,4.2
50%,8.3,28.8,1.0,1.0,1.0,414.0,23.72,4.3,4.5
75%,11.3,36.8,1.0,1.0,1.0,689.0,36.82,4.6,4.8
max,20.0,45.0,1.0,1.0,1.0,4277.0,50.0,5.0,5.0
