In [5]:
import pandas as pd

# DATAPATH
csv_file_path = "/kaggle/input/student-exam-performance-prediction/student_exam_data.csv"


df = pd.read_csv(csv_file_path)
print(df)


     Study Hours  Previous Exam Score  Pass/Fail
0       4.370861            81.889703          0
1       9.556429            72.165782          1
2       7.587945            58.571657          0
3       6.387926            88.827701          1
4       2.404168            81.083870          0
..           ...                  ...        ...
495     4.180170            45.494924          0
496     6.252905            95.038815          1
497     1.699612            48.209118          0
498     9.769553            97.014241          1
499     9.875897            66.760346          1

[500 rows x 3 columns]


In [7]:
import pandas as pd

# DATAPATH
csv_file_path = "/kaggle/input/student-exam-performance-prediction/student_exam_data.csv"

df = pd.read_csv(csv_file_path)

# REMOVE DUPLICATE ROWS
df.drop_duplicates(inplace=True)

# DROP ROWS WITH MISSING VALUES
df.dropna(inplace=True)

# CHECK MISSING VALUES
print("Missing Values:")
print(df.isnull().sum())

# PRINT UNIQUE VALUES FOR CATEGORICAL COLUMNS
print("\nUnique Values for Categorical Columns:")
for column in df.select_dtypes(include=['object']).columns:
    print(f"{column}: {df[column].unique()}")




Missing Values:
Study Hours            0
Previous Exam Score    0
Pass/Fail              0
dtype: int64

Unique Values for Categorical Columns:


In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from sklearn.model_selection import train_test_split

# DATAPATH
csv_file_path = "/kaggle/input/student-exam-performance-prediction/student_exam_data.csv"

df = pd.read_csv(csv_file_path)

# REMOVE DUPLICATE ROWS
df.drop_duplicates(inplace=True)

# DROP ROWS WITH MISSING VALUES
df.dropna(inplace=True)

# HANDLING OUTLIERS USING Z SCORE
z_scores = zscore(df.select_dtypes(include=['int64', 'float64']))
abs_z_scores = abs(z_scores)
outlier_rows = (abs_z_scores > 3).all(axis=1)
df = df[~outlier_rows]

# NORMALIZE AND SCALE
scaler = StandardScaler()  # Change to StandardScaler for standardization
df[df.columns] = scaler.fit_transform(df[df.columns])

# PRINT THE PREPROCESSED DATAFRAME
print("Preprocessed DataFrame:")
print(df)



# SPLIT DATA INTO FEATURE AND TARGET
X = df[['Study Hours', 'Previous Exam Score']]
y = df['Pass/Fail']

# SPLIT DATA INTO TRAINING AND TSTING SET 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#CHECK UNIQUE VALUES IN THE PASS FAIL COL 
unique_values = df['Pass/Fail'].unique()
print("Unique values in 'Pass/Fail' column:", unique_values)


print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Preprocessed DataFrame:
     Study Hours  Previous Exam Score  Pass/Fail
0      -0.415636             0.758080  -0.763072
1       1.515310             0.189844   1.310493
2       0.782307            -0.604555  -0.763072
3       0.335457             1.163515   1.310493
4      -1.147973             0.710990  -0.763072
..           ...                  ...        ...
495    -0.486644            -1.368719  -0.763072
496     0.285179             1.526474   1.310493
497    -1.410328            -1.210110  -0.763072
498     1.594671             1.641911   1.310493
499     1.634270            -0.126033   1.310493

[500 rows x 3 columns]
Unique values in 'Pass/Fail' column: [-0.76307174  1.31049277]
X_train shape: (400, 2)
X_test shape: (100, 2)
y_train shape: (400,)
y_test shape: (100,)


In [20]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# DATAPATH
csv_file_path = "/kaggle/input/student-exam-performance-prediction/student_exam_data.csv"

df = pd.read_csv(csv_file_path)

# REMOVE DUPLICATE ROWS
df.drop_duplicates(inplace=True)

# DROP ROWS WITH MISSING VALUES
df.dropna(inplace=True)

# HANDLING OUTLIERS USING Z SCORE
z_scores = zscore(df.select_dtypes(include=['int64', 'float64']))
abs_z_scores = abs(z_scores)
outlier_rows = (abs_z_scores > 3).all(axis=1)
df = df[~outlier_rows]

# NORMALIZE AND SCALE
scaler = StandardScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])

#REVERT THE SCALING ON PASS\FAIL TO GET BACK ORGINAL BINARY VALUES 
df['Pass/Fail'] = (df['Pass/Fail'] > 0).astype(int)


# SPLIT DATA INTO FEATURE AND TARGET 
X = df[['Study Hours', 'Previous Exam Score']]
y = df['Pass/Fail']

# SPLIT THE DATA INTO TRAIN AND TEST 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LOGISTIC REGRESSION MODEL
logistic_model = LogisticRegression(random_state=42)

# TRAIN THE MODEL 
logistic_model.fit(X_train, y_train)

# MAKE PREDICTION ON TEST SET 
y_pred = logistic_model.predict(X_test)

# EVALUATE THE MODEL 
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)


Accuracy: 0.86
Confusion Matrix:
[[58  6]
 [ 8 28]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89        64
           1       0.82      0.78      0.80        36

    accuracy                           0.86       100
   macro avg       0.85      0.84      0.85       100
weighted avg       0.86      0.86      0.86       100

