# SEER Breast Cancer — GroupBy & Ranking Exploration
This notebook loads the CSV into an in-memory SQLite database, runs group-by and ranking queries against it, and presents 8 key analyses: seven as bar charts (matplotlib only) and one, the top-3 ranking, as a table.

In [None]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt

# Path to the source CSV; adjust if needed.
csv_path = "SEER_Breast_Cancer_Dataset.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Copy the DataFrame into an in-memory SQLite table so the cells below can
# query it with plain SQL. The DataFrame index is not written as a column.
conn = sqlite3.connect(":memory:")
df.to_sql(
    name="seer_breast_cancer",
    con=conn,
    if_exists="replace",
    index=False,
)

def bar_plot(categories, values, title, xlabel, ylabel, rotation=45):
    """Draw one bar chart on a new figure and display it.

    Args:
        categories: x positions / labels of the bars, passed straight to
            matplotlib's ``bar``.
        values: bar heights, aligned with ``categories``.
        title: text shown above the chart.
        xlabel: label for the x axis.
        ylabel: label for the y axis.
        rotation: rotation applied to the x tick labels (default 45).
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(categories, values)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    # `ax` is the current axes, so this rotates its x tick labels.
    plt.xticks(rotation=rotation)
    fig.tight_layout()
    plt.show()


## 1) Alive vs Dead counts

In [None]:
# Count rows per `status` value, largest group first, then chart the counts.
q1 = """
SELECT status, COUNT(*) AS cnt
FROM seer_breast_cancer
GROUP BY status
ORDER BY cnt DESC;"""
d1 = pd.read_sql(sql=q1, con=conn)
bar_plot(
    categories=d1["status"],
    values=d1["cnt"],
    title="Alive vs Dead",
    xlabel="status",
    ylabel="count",
    rotation=0,
)


## 2) Average tumor_size by grade

In [None]:
# Mean tumor_size for each grade, highest mean first, then chart the means.
q2 = """
SELECT grade, AVG(tumor_size) AS avg_tumor
FROM seer_breast_cancer
GROUP BY grade
ORDER BY avg_tumor DESC;"""
d2 = pd.read_sql(sql=q2, con=conn)
bar_plot(
    categories=d2["grade"],
    values=d2["avg_tumor"],
    title="Average Tumor Size by Grade",
    xlabel="grade",
    ylabel="avg_tumor",
)


## 3) Average survival_months by marital_status

In [None]:
# Mean survival_months for each marital_status, highest mean first.
q3 = """
SELECT marital_status, AVG(survival_months) AS avg_survival
FROM seer_breast_cancer
GROUP BY marital_status
ORDER BY avg_survival DESC;"""
d3 = pd.read_sql(sql=q3, con=conn)
bar_plot(
    categories=d3["marital_status"],
    values=d3["avg_survival"],
    title="Average Survival by Marital Status",
    xlabel="marital_status",
    ylabel="avg_survival",
)


## 4) Average tumor_size by t_stage

In [None]:
# Mean tumor_size for each t_stage, highest mean first.
q4 = """
SELECT t_stage, AVG(tumor_size) AS avg_tumor
FROM seer_breast_cancer
GROUP BY t_stage
ORDER BY avg_tumor DESC;"""
d4 = pd.read_sql(sql=q4, con=conn)
bar_plot(
    categories=d4["t_stage"],
    values=d4["avg_tumor"],
    title="Average Tumor Size by T Stage",
    xlabel="t_stage",
    ylabel="avg_tumor",
    rotation=0,
)


## 5) Average survival_months by race

In [None]:
# Mean survival_months for each race, highest mean first.
q5 = """
SELECT race, AVG(survival_months) AS avg_survival
FROM seer_breast_cancer
GROUP BY race
ORDER BY avg_survival DESC;"""
d5 = pd.read_sql(sql=q5, con=conn)
bar_plot(
    categories=d5["race"],
    values=d5["avg_survival"],
    title="Average Survival by Race",
    xlabel="race",
    ylabel="avg_survival",
)


## 6) Ranking — Top 3 tumor_size within each grade

In [None]:
# Rank every row by tumor_size (largest first) within its grade.
# RANK() gives tied rows the same rank, so a grade can contribute more than
# three rows to the "top 3" below.
# The explicit ORDER BY makes the row order deterministic; without it SQL
# leaves the order unspecified and head(20) could show arbitrary rows.
q6 = '''
SELECT age, grade, tumor_size,
       RANK() OVER (PARTITION BY grade ORDER BY tumor_size DESC) AS rnk
FROM seer_breast_cancer
ORDER BY grade, rnk;'''
d6 = pd.read_sql(q6, conn)
# Keep only ranks 1-3 per grade; d6 itself still holds the full ranked table.
d6_top3 = d6[d6['rnk'] <= 3]
# Last expression of the cell: the notebook renders the first 20 rows.
d6_top3.head(20)


## 7) Ranking — Top patient per race by survival_months

In [None]:
# Longest survival_months observed within each race.
# RANK() assigns rank 1 to every row tied for the maximum, so the outer
# SELECT uses DISTINCT to collapse those ties to one row per race; otherwise
# the same bar would be drawn once per tied patient.
# `race` is a secondary sort key so races sharing the same maximum come back
# in a stable order.
q7 = '''
WITH ranked AS (
  SELECT race, survival_months,
         RANK() OVER (PARTITION BY race ORDER BY survival_months DESC) AS rnk
  FROM seer_breast_cancer
)
SELECT DISTINCT race, survival_months
FROM ranked
WHERE rnk = 1
ORDER BY survival_months DESC, race;'''
d7 = pd.read_sql(q7, conn)
bar_plot(d7['race'], d7['survival_months'], 'Top Survival — Best per Race', 'race', 'top_survival_months')


## 8) Average reginol_node_positive by n_stage

In [None]:
# Mean reginol_node_positive for each n_stage, highest mean first.
# ("reginol" is the column's spelling as used in the table.)
q8 = """
SELECT n_stage, AVG(reginol_node_positive) AS avg_pos
FROM seer_breast_cancer
GROUP BY n_stage
ORDER BY avg_pos DESC;"""
d8 = pd.read_sql(sql=q8, con=conn)
bar_plot(
    categories=d8["n_stage"],
    values=d8["avg_pos"],
    title="Average Positive Nodes by N Stage",
    xlabel="n_stage",
    ylabel="avg_positive_nodes",
    rotation=0,
)


In [None]:
# Close the SQLite connection now that every query has run; the in-memory
# table is not available through `conn` afterwards.
conn.close()
print("Done.")
