In [2]:
# ==============================
# TASK 1: Dataset Understanding (Single Cell Script)
# Works for: Titanic Dataset / Students Performance Dataset
# ==============================

import pandas as pd
import numpy as np

# --------- CHANGE THESE TWO LINES ONLY ----------
FILE_PATH = "Titanic-Dataset.csv"              # e.g., "titanic.csv" or "StudentsPerformance.csv"
TARGET_COL = "Survived"                # e.g., Titanic: "Survived" | Students: "math score"
# -----------------------------------------------

# 1) Read the CSV named by FILE_PATH into the working DataFrame `df`,
#    which every later section of this cell reads from.
df = pd.read_csv(filepath_or_buffer=FILE_PATH)

# Section banner: three lines emitted by a single print call.
print(
    "\n==============================",
    "1) DATA PREVIEW (HEAD & TAIL)",
    "==============================",
    sep="\n",
)

# Top of the frame.
print("\nFirst 5 rows:")
display(df.head(n=5))

# Bottom of the frame.
print("\nLast 5 rows:")
display(df.tail(n=5))

# 2) How big is the frame, and what are its columns called?
print(
    "\n==============================",
    "2) DATASET SIZE & COLUMNS",
    "==============================",
    sep="\n",
)
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
# Caption and the plain Python list of labels, one per output line.
print("\nColumn names:", list(df.columns), sep="\n")

# 3) Structural summary: df.info() writes per-column dtype and non-null
#    counts straight to stdout, so it needs no print()/display() wrapper.
print(
    "\n==============================",
    "3) df.info() (DTYPES + NULLS)",
    "==============================",
    sep="\n",
)
df.info()

# 4) Summary statistics. With default arguments, describe() on a mixed-type
#    frame reports only on its numeric columns.
print("\n==============================")
print("4) df.describe() (NUMERICAL)")
print("==============================")
display(df.describe())

# 4b) Categorical summary (count / unique / top / freq).
#     object, category and bool columns are all treated as categorical here.
#     The columns are selected first because describe(include=...) raises a
#     ValueError when no column matches the requested dtypes, which would
#     abort the whole cell on an all-numeric dataset.
print("\n==============================")
print("4b) df.describe() (CATEGORICAL)")
print("==============================")
categorical_frame = df.select_dtypes(include=["object", "category", "bool"])
if categorical_frame.shape[1] > 0:
    # include="all" describes every column of the already-filtered frame.
    display(categorical_frame.describe(include="all"))
else:
    print("No categorical columns to describe.")

# 5) Missing values: per-column null count and share of rows affected
print("\n==============================")
print("5) MISSING VALUES")
print("==============================")
missing_counts = df.isnull().sum()                    # nulls per column
missing_percent = (missing_counts / len(df)) * 100    # as a percentage of all rows
missing_summary = pd.DataFrame({
    "Missing Count": missing_counts,
    "Missing %": missing_percent.round(2)
}).sort_values("Missing Count", ascending=False)

# Show only the columns that actually have gaps. When there are none, print
# the confirmation message alone instead of also rendering an empty table.
columns_with_missing = missing_summary[missing_summary["Missing Count"] > 0]
if columns_with_missing.empty:
    print("✅ No missing values found.")
else:
    display(columns_with_missing)

# 6) Identify numerical vs categorical columns
print("\n==============================")
print("6) FEATURE TYPE SPLIT (AUTO)")
print("==============================")
# "number" selects every numeric dtype, so columns stored as int8/int16/
# unsigned ints/float16 or as nullable Int64/Float64 are classified too,
# instead of only the four widths that were previously spelled out.
num_cols = df.select_dtypes(include="number").columns.tolist()
cat_cols = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print("Numerical columns:", num_cols)
print("Categorical columns:", cat_cols)

# Surface anything that fits neither bucket (e.g. datetime columns) so it is
# not silently dropped from the split. Prints nothing when every column is
# classified.
unclassified_cols = [c for c in df.columns if c not in num_cols and c not in cat_cols]
if unclassified_cols:
    print("Other (unclassified) columns:", unclassified_cols)

# 7) Binary features: columns holding exactly two distinct non-null values
print(
    "\n==============================",
    "7) BINARY FEATURES (nunique == 2)",
    "==============================",
    sep="\n",
)
binary_cols = []
for candidate_col in df.columns:
    # NaN is not counted as a value here (dropna=True).
    distinct_non_null = df[candidate_col].nunique(dropna=True)
    if distinct_non_null == 2:
        binary_cols.append(candidate_col)
print("Binary columns:", binary_cols)

# 8) Frequency table for each categorical column
print(
    "\n==============================",
    "8) UNIQUE VALUES IN CATEGORICAL FEATURES",
    "==============================",
    sep="\n",
)
for col in cat_cols:
    col_values = df[col]
    print(f"\n--- {col} ---")
    # NaN is kept as its own entry (dropna=False) in both the table and the
    # count below; the table is capped at the 15 most frequent values.
    frequency_table = col_values.value_counts(dropna=False)
    display(frequency_table.head(n=15))
    distinct_total = col_values.nunique(dropna=False)
    print(f"Unique values count: {distinct_total}")

# 9) Split the frame into the target series `y` and the feature frame `X`
print(
    "\n==============================",
    "9) TARGET & INPUT FEATURES",
    "==============================",
    sep="\n",
)

if TARGET_COL not in df.columns:
    # Misconfigured target: report it and list what is available.
    print(f"⚠️ TARGET_COL='{TARGET_COL}' not found in columns.")
    print("Available columns:", list(df.columns))
else:
    y = df[TARGET_COL]
    X = df.drop(columns=[TARGET_COL])
    print("✅ Target column found:", TARGET_COL)
    print("Input feature columns:", list(X.columns))
    print("\nTarget distribution (first check):")
    display(y.value_counts(dropna=False))

    # A target with at most 10 distinct non-null values is treated as
    # classification-like, so its class shares are shown as percentages.
    uniq = y.nunique(dropna=True)
    if uniq <= 10:
        print("\nTarget distribution % (imbalance check):")
        class_share = y.value_counts(normalize=True, dropna=False)
        display(class_share.mul(100).round(2))

# 10) Quick notes derived from the checks above.
# Printed under its own banner, like every other section, so the notes do not
# run straight on from the target-distribution output.
print("\n==============================")
print("10) QUICK NOTES")
print("==============================")

notes = []

# Missing values notes (missing_counts holds the per-column null counts)
total_missing = missing_counts.sum()
if total_missing > 0:
    notes.append(f"- Missing values present: {int(total_missing)} total missing entries. Imputation or dropping needed.")
else:
    notes.append("- No missing values detected.")

# Categorical encoding notes
if len(cat_cols) > 0:
    notes.append(f"- {len(cat_cols)} categorical columns detected. Encoding needed (One-Hot/Label/Ordinal).")
else:
    notes.append("- No categorical columns detected (encoding may not be needed).")

# High-cardinality hint: more than 20 distinct non-null values.
# The cutoff is a rule of thumb, not a hard limit.
high_card = [c for c in cat_cols if df[c].nunique(dropna=True) > 20]
if high_card:
    notes.append(f"- High-cardinality categorical columns (may need feature engineering): {high_card}")

# Dataset size note: the 500 / 5000 row cutoffs are rough guidance only.
rows, cols = df.shape
if rows < 500:
    notes.append("- Dataset is small (<500 rows): good for learning, but may limit model generalization.")
elif rows < 5000:
    notes.append("- Dataset size is moderate: suitable for basic ML experiments.")
else:
    notes.append("- Dataset is large enough for many ML models.")

# Print notes, one per line
print("\n".join(notes))





1) DATA PREVIEW (HEAD & TAIL)

First 5 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



Last 5 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q



2) DATASET SIZE & COLUMNS
Rows: 891, Columns: 12

Column names:
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

3) df.info() (DTYPES + NULLS)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

4) df.describe() (NUMERICAL)


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292



4b) df.describe() (CATEGORICAL)


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644



5) MISSING VALUES


Unnamed: 0,Missing Count,Missing %
Cabin,687,77.1
Age,177,19.87
Embarked,2,0.22



6) FEATURE TYPE SPLIT (AUTO)
Numerical columns: ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

7) BINARY FEATURES (nunique == 2)
Binary columns: ['Survived', 'Sex']

8) UNIQUE VALUES IN CATEGORICAL FEATURES

--- Name ---


Name
Braund, Mr. Owen Harris                            1
Boulos, Mr. Hanna                                  1
Frolicher-Stehli, Mr. Maxmillian                   1
Gilinski, Mr. Eliezer                              1
Murdlin, Mr. Joseph                                1
Rintamaki, Mr. Matti                               1
Stephenson, Mrs. Walter Bertram (Martha Eustis)    1
Elsbury, Mr. William James                         1
Bourke, Miss. Mary                                 1
Chapman, Mr. John Henry                            1
Van Impe, Mr. Jean Baptiste                        1
Leitch, Miss. Jessie Wills                         1
Johnson, Mr. Alfred                                1
Duff Gordon, Sir. Cosmo Edmund ("Mr Morgan")       1
Taussig, Miss. Ruth                                1
Name: count, dtype: int64

Unique values count: 891

--- Sex ---


Sex
male      577
female    314
Name: count, dtype: int64

Unique values count: 2

--- Ticket ---


Ticket
347082          7
CA. 2343        7
1601            7
3101295         6
CA 2144         6
347088          6
S.O.C. 14879    5
382652          5
LINE            4
PC 17757        4
17421           4
349909          4
113760          4
4133            4
113781          4
Name: count, dtype: int64

Unique values count: 681

--- Cabin ---


Cabin
NaN            687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
D                3
F33              3
E101             3
F2               3
B20              2
E67              2
C125             2
E24              2
B49              2
B77              2
Name: count, dtype: int64

Unique values count: 148

--- Embarked ---


Embarked
S      644
C      168
Q       77
NaN      2
Name: count, dtype: int64

Unique values count: 4

9) TARGET & INPUT FEATURES
✅ Target column found: Survived
Input feature columns: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Target distribution (first check):


Survived
0    549
1    342
Name: count, dtype: int64


Target distribution % (imbalance check):


Survived
0    61.62
1    38.38
Name: proportion, dtype: float64

- Missing values present: 866 total missing entries. Imputation or dropping needed.
- 5 categorical columns detected. Encoding needed (One-Hot/Label/Ordinal).
- High-cardinality categorical columns (may need feature engineering): ['Name', 'Ticket', 'Cabin']
- Dataset size is moderate: suitable for basic ML experiments.
