# Syntax Candy: Select, Filter & Query
*Wrangling with expressive pandas patterns*

## 1. Load & Peek

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the example CSV; low_memory=False parses the file in one pass so
# column dtypes are not inferred chunk by chunk
csv_path = '/mnt/data/cces_example_data.csv'
df = pd.read_csv(csv_path, low_memory=False)
df.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Summary statistics for columns of every dtype, transposed so each row is a
# column of df; preview the first 20 of them
summary = df.describe(include='all').transpose()
summary.head(20)

## 2. Column Selection: `[]`, `.loc`, `.iloc`, `.filter`, `select_dtypes`
We’ll pick columns by name patterns and data types.

In [None]:
# Single column -> Series (None when the column is absent)
one = None
if '〈no_categorical_column〉' in df.columns:
    one = df['〈no_categorical_column〉']

# List of columns -> DataFrame, keeping only the names that actually exist
wanted = ['〈no_categorical_column〉', 'age']
present = [name for name in wanted if name in df.columns]
subset = df[present]
subset.head()

In [None]:
# Pattern-based column selection: keep the columns whose name contains the
# substring (tweak 'id' to a useful substring in your dataset)
id_like = df.filter(like="id", axis="columns")
id_like.head()

In [None]:
# By dtype: keep every numeric column.
# Store the full selection so the name matches its contents; only the
# preview below is truncated to the first rows.
numeric_only = df.select_dtypes(include="number")
numeric_only.head()

In [None]:
# Label vs position
first_five_cols = df.columns[:5].tolist()

# .loc slices by label and INCLUDES the end label: labels 0..10 -> 11 rows
# (assumes the default 0..n-1 integer index)
by_label = df.loc[:10, first_five_cols]

# .iloc slices by position and EXCLUDES the stop: positions 0..9 -> 10 rows
by_position = df.iloc[:10, :5]

# Only the last expression of a cell is rendered, so print both shapes to
# make the 11-vs-10 row difference visible
print("loc :", by_label.shape, "| iloc:", by_position.shape)
by_position

## 3. Row Filtering with Boolean Masks
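Remember: combine masks with `&` and `|` (not `and`/`or`, which raise `ValueError` on a Series), and wrap each condition in parentheses because `&`/`|` bind more tightly than comparisons such as `==` or `>`.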

In [None]:
# Equality / range (if columns exist)
mask_parts = []

# Range condition: ages between the 25th and 75th percentile (inclusive)
if 'age' in df.columns:
    s = pd.to_numeric(df['age'], errors="coerce")
    if s.notna().any():
        q = s.quantile([0.25, 0.75]).tolist()
    else:
        q = [None, None]
    if q[0] is not None:
        mask_parts.append(s.between(q[0], q[1]))

# Membership condition: the three most frequent non-null values
if '〈no_categorical_column〉' in df.columns:
    counts = df['〈no_categorical_column〉'].dropna().value_counts()
    topcats = counts.head(3).index.tolist()
    if topcats:
        mask_parts.append(df['〈no_categorical_column〉'].isin(topcats))

# AND together whatever conditions were collected
mask = None
for m in mask_parts:
    if mask is None:
        mask = m
    else:
        mask = mask & m

# No conditions at all -> work on a copy of the full frame
if mask is None:
    filtered = df.copy()
else:
    filtered = df.loc[mask]
filtered.head()

In [None]:
# Not/Null-safe filtering
has_text = df.select_dtypes(include=["object", "string", "category"]).columns.tolist()
if has_text:
    col = has_text[0]
    # na=False makes missing values count as "no match" instead of putting
    # NaN into the boolean mask
    text_filtered = df.loc[df[col].notna() & df[col].str.contains("a", case=False, na=False)]
else:
    # No text-like column: fall back to the unfiltered frame
    text_filtered = df

# The preview must be the cell's last top-level expression; an expression
# nested inside if/else is evaluated but never rendered
text_filtered.head()

## 4. Dates & Text Helpers

In [None]:

# No natural date columns found; create a synthetic date that cycles through
# a 730-day span starting 2020-01-01, advancing one day per row
N = len(df)
base = pd.Timestamp("2020-01-01")
# Vectorised day offsets: avoids building one Timedelta per row in a Python
# loop and yields a datetime column even when the frame is empty
df['synthetic_date'] = base + pd.to_timedelta(np.arange(N) % 730, unit="D")

# Use the date column (it was just created, so no existence check is needed)
dcol = 'synthetic_date'

# Year / month components for filtering
year = df[dcol].dt.year
month = df[dcol].dt.month  # not used below; available for per-month counts

# Keep the years between the 10th and 90th percentile; fall back to
# 2020-2021 when there are no dates at all
has_years = year.notna().any()
lo = int(year.dropna().quantile(0.10)) if has_years else 2020
hi = int(year.dropna().quantile(0.90)) if has_years else 2021
df_date_filtered = df.loc[year.between(lo, hi)]

# Preview as the last top-level expression so the cell renders it
df_date_filtered.head()

## 5. `DataFrame.query()` — SQL‑ish Filters
Use `@` to reference Python variables; backticks for odd column names.

In [None]:
# Prepare a small variable for membership
state_like = '〈no_state_like〉'
target_vals = []
if state_like in df.columns:
    # Keep the values in the column's own dtype: casting them to str would
    # make `in @target_vals` match nothing on a non-string column
    target_vals = df[state_like].dropna().value_counts().head(3).index.tolist()

# Example query (falls back safely if columns missing)
exprs = []
if 'age' in df.columns:
    # Lower bound; NaN >= 0 is False, so rows with a missing age are excluded
    exprs.append("age >= 0")
if state_like in df.columns and target_vals:
    # Backticks quote the column name; @ refers to the Python variable
    exprs.append(f"`{state_like}` in @target_vals")

# With no usable column, use a condition on the index as a pass-through filter
query_expr = " and ".join(exprs) if exprs else "index >= 0"
df.query(query_expr).head()

## 6. From Subset to Bar Chart

In [None]:
cat = '〈no_categorical_column〉'
if cat in df.columns:
    # Build a two-column frame: the category value and its row count "n"
    top = (
        df[cat]
        .dropna()  # drop missing values BEFORE the cast: astype(str) turns NaN into "nan"
        .astype(str)
        .value_counts()
        .head(10)
        # Name the index and the counts explicitly so the resulting columns are
        # [cat, "n"] regardless of how value_counts() labels its output
        .rename_axis(cat)
        .reset_index(name="n")
    )
    top.plot(kind="bar", x=cat, y="n", legend=False, rot=45, title=f"Top 10 values in {cat}")
else:
    # Placeholder figure so the cell still produces a plot
    # (plt is already imported at the top of the notebook)
    plt.figure()
    plt.title("No categorical column available for bar chart")
    plt.plot([0,1,2],[1,2,3])


## 7. Common Pitfalls
- Use `&`/`|`, not `and`/`or`
- Always parenthesize combined conditions
- Prefer `.loc`/`.iloc` over chained indexing
- Null‑aware string filters: `na=False`
- Backticks in `query()` for awkward column names

## 8. Mini‑Exercises (Try Now)
1) **Membership:** pick a categorical column (e.g., `〈no_state_like〉` if present) and keep 2–3 values.
2) **Range:** choose a numeric column (e.g., `age`) and filter to its interquartile range.
3) **Text:** run a case‑insensitive `.str.contains()` on a text column.
4) **Dates:** filter the rows to a 12‑month window of `synthetic_date` and count rows per month.
5) **Query vs Masks:** implement (2) and (1) with both boolean masks and `.query()`.

> Add a final bar chart from your subset: `.groupby(col).size().reset_index(name='n')` then plot.
