# First we read the data and output the first 5 rows

In [2]:
import pandas as pd

# Read the student-performance CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/StudentPerformance.csv")

df.head(n=5)

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


# To find the grading system, I will compute the **min** and **max** of the _**Previous Scores**_ and _**Performance Index**_ columns

In [6]:
import pandas as pd

# Smallest and largest value of each score column, used to infer the grading scale.
min_PS, max_PS = df["Previous Scores"].min(), df["Previous Scores"].max()
min_PI, max_PI = df["Performance Index"].min(), df["Performance Index"].max()

# Display all four bounds together: (min_PS, max_PS, min_PI, max_PI).
(min_PS, max_PS, min_PI, max_PI)

(np.int64(40), np.int64(99), np.float64(10.0), np.float64(100.0))

# I assume it is a 10 -> 100 grading system

## Target -> Performance Index
Tasks:
1. Inspect the shape and columns
2. See how many students passed (Performance Index >= 50)
3. Add a new _boolean_ column **Status** to see who passed and who did not
4. Separate into X (Features) and y (Target)

In [9]:
# (rows, columns) of the frame, followed by the column names as a plain list.
(df.shape, list(df.columns))

((10000, 6),
 ['Hours Studied',
  'Previous Scores',
  'Extracurricular Activities',
  'Sleep Hours',
  'Sample Question Papers Practiced',
  'Performance Index'])

In [12]:
# Boolean mask: True where Performance Index reaches the pass mark of 50.
passed_mask = df["Performance Index"].ge(50)

# (number passed, number failed) -- the failures are whatever the mask does not cover.
(passed_mask.sum(), passed_mask.size - passed_mask.sum())

(np.int64(5909), np.int64(4091))

In [13]:
# Add the boolean Status label: True when Performance Index is at least 50.
df["Status"] = df["Performance Index"].ge(50)

# Class balance of the new label.
df["Status"].value_counts()

Status
True     5909
False    4091
Name: count, dtype: int64

In [None]:
df.head(n=5)  # confirm the new Status column is present

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index,Status
0,7,99,Yes,9,1,91.0,True
1,4,82,No,4,2,65.0,True
2,8,51,Yes,7,2,45.0,False
3,5,52,Yes,5,2,36.0,False
4,7,75,No,8,5,66.0,True


In [16]:
# Features: every column except the label and the raw score the label is derived from
# (keeping Performance Index would leak the answer into the features).
X = df.drop(["Performance Index", "Status"], axis="columns")
# Target: the boolean pass/fail label.
y = df["Status"]

(X.shape, y.shape)

((10000, 5), (10000,))

In [17]:
# Preview the feature matrix to confirm the score and label columns are gone.
X.head(n=5)

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
0,7,99,Yes,9,1
1,4,82,No,4,2
2,8,51,Yes,7,2
3,5,52,Yes,5,2
4,7,75,No,8,5


# train_test_split -> preprocess -> pipeline

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7, stratify=y
)

# Shapes of the four pieces, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((8000, 5), (2000, 5), (8000,), (2000,))

In [21]:
import numpy as np

# Partition the feature names by dtype so each group can get its own preprocessing.
num_cols = list(X.select_dtypes(include="number").columns)
cat_cols = list(X.select_dtypes(exclude="number").columns)

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns are standardised; categorical columns are one-hot encoded, with
# categories unseen during fit ignored at transform time.
numeric_transformer = ("num", StandardScaler(), num_cols)
categorical_transformer = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

# Any column not listed in either group is dropped.
preprocessor = ColumnTransformer(
    [numeric_transformer, categorical_transformer],
    remainder="drop",
)

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Chain preprocessing and the classifier so both are fitted together as one estimator.
model = Pipeline(
    [
        ("preprocess", preprocessor),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

# Fit on the training split, then predict the held-out test split.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Confusion-matrix layout: [[TN, FP], [FN, TP]]
print(confusion_matrix(y_test, y_pred))
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

[[ 787   31]
 [  26 1156]]
              precision    recall  f1-score   support

       False       0.97      0.96      0.97       818
        True       0.97      0.98      0.98      1182

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000



# Test Overfitting using **model.score**

In [26]:
# Score the fitted pipeline on both splits.
train_acc, test_acc = model.score(X_train, y_train), model.score(X_test, y_test)

# The last entry is the train-minus-test gap; a large positive gap would point to overfitting.
(train_acc, test_acc, train_acc - test_acc)


(0.9745, 0.9715, 0.0030000000000000027)

# Test Overfitting using **cross-validation**

In [29]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Five stratified folds, shuffled with a fixed seed so the folds are reproducible.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

In [33]:
# Accuracy of the whole pipeline on each fold, evaluated over the full X / y.
cross_validation_scores = cross_val_score(
    estimator=model, X=X, y=y, scoring="accuracy", cv=cross_validation
)

In [35]:
# Mean and spread of the per-fold accuracies.
(np.mean(cross_validation_scores), np.std(cross_validation_scores))

(np.float64(0.974), np.float64(0.004393176527297748))