In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the student-performance dataset; the relative path is resolved
# against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='StudentsPerformance.csv')

In [None]:
# Count the null entries in every column, then report the per-column totals.
missing_per_column = df.isna().sum()
print("Missing values per column:\n", missing_per_column)

In [None]:
# Print a structural summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Check for inconsistencies in categorical variables: list every distinct
# label per column so typos or case variants are easy to spot by eye.
categorical_cols = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
for cat_col in categorical_cols:
    distinct_labels = df[cat_col].unique()
    print(f"\nUnique values in '{cat_col}':")
    print(distinct_labels)

In [None]:
# Range check on the score columns: valid scores lie in [0, 100].
num_cols = ['math score', 'reading score', 'writing score']

for score_col in num_cols:
    # Build both masks up front; clamping negatives to 0 cannot create
    # values above 100, so the masks stay valid for both assignments.
    below_zero = df[score_col].lt(0)
    above_hundred = df[score_col].gt(100)

    print(f"Inconsistent values in {score_col}:")
    print(df[below_zero | above_hundred])

    # Repair in place: negatives become 0, anything over 100 becomes 100.
    df.loc[below_zero, score_col] = 0
    df.loc[above_hundred, score_col] = 100

In [None]:
# 2. DETECT OUTLIERS IN NUMERIC VARIABLES USING IQR
def detect_outliers(col, data=None, factor=1.5):
    """Flag rows whose value in ``col`` falls outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    Q1/Q3 are the 25th/75th percentiles of the column and IQR = Q3 - Q1.
    A one-line summary with the outlier count is printed as a side effect.

    Parameters
    ----------
    col : str
        Name of the numeric column to inspect.
    data : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``df`` is used,
        which keeps existing ``detect_outliers(col)`` calls working.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    tuple
        ``(outliers, lower, upper)``: the rows strictly outside the fences,
        followed by the lower and upper fence values.
    """
    # Fall back to the global frame only when the caller did not pass one,
    # so the function can be used (and checked) without hidden state.
    frame = df if data is None else data
    values = frame[col]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr

    # Strict inequalities: values sitting exactly on a fence are not outliers.
    outliers = frame[(values < lower) | (values > upper)]
    print(f"{col}: {len(outliers)} outliers")
    return outliers, lower, upper

In [None]:
# Run the IQR outlier report once per score column. Only the printed
# summary is wanted here, so the returned rows and fences are discarded.
numeric_cols = [
    'math score',
    'reading score',
    'writing score',
]
for col in numeric_cols:
    detect_outliers(col)

In [None]:
# 6. Data transformation: log-transform "reading score" (intended to reduce skewness).
# np.log1p(x) evaluates log(1 + x), so a score of 0 maps to 0 rather than -inf,
# and it stays accurate for small x where forming 1 + x first loses precision.
df['reading score_log'] = np.log1p(df['reading score'])

In [None]:
# Sample skewness of the untransformed reading scores.
reading_skew_before = df['reading score'].skew()
print("Skewness before transformation:", reading_skew_before)

In [None]:
# Histogram with a KDE overlay for the untransformed reading scores.
reading_ax = sns.histplot(df['reading score'], kde=True, color='blue')
reading_ax.set_title('Original Reading Score')

In [None]:
# Sample skewness of the log-transformed reading scores.
reading_skew_after = df['reading score_log'].skew()
print("Skewness after transformation:", reading_skew_after)

In [None]:

# Histogram with a KDE overlay for the log-transformed reading scores.
reading_log_ax = sns.histplot(df['reading score_log'], kde=True, color='green')
reading_log_ax.set_title('Log-Transformed Reading Score')

In [None]:
# Before transformation: open a wide figure and draw the raw math-score
# distribution in the left slot of a 1x2 grid.
plt.figure(figsize=(12, 5))
left_panel = plt.subplot(1, 2, 1)
sns.histplot(df['math score'], kde=True, ax=left_panel)
left_panel.set_title("Original Math Score Distribution")

In [None]:
# Apply Power Transformation (Yeo-Johnson handles 0 and negative values)
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method='yeo-johnson')
# fit_transform is given a one-column frame and returns a 2-D (n_samples, 1)
# result; flatten it so a plain 1-D column is stored.
df['math_score_transformed'] = np.asarray(pt.fit_transform(df[['math score']])).ravel()

# Before/after comparison. Both panels are drawn here on explicit axes:
# with the default inline backend a figure opened in an earlier cell is
# rendered and closed when that cell finishes, so plt.subplot(1, 2, 2)
# would start a fresh default-size figure holding only the right panel.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df['math score'], kde=True, ax=ax_before)
ax_before.set_title("Original Math Score Distribution")

sns.histplot(df['math_score_transformed'], kde=True, color='green', ax=ax_after)
ax_after.set_title("Transformed Math Score (Yeo-Johnson)")

fig.tight_layout()
plt.show()