In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [4]:
# Location of the raw CSV, given relative to the working directory.
Data = "StudentPerformance.csv"


def _tidy_label(label):
    """Trim surrounding whitespace, lower-case, and turn spaces into hyphens."""
    return label.strip().lower().replace(" ", "-")


# Load the file, then normalise every header so that the score columns can be
# addressed with one consistent spelling (the names listed in `num`).
DataFlow = pd.read_csv(Data)
DataFlow.columns = list(map(_tidy_label, DataFlow.columns))

# The three score columns that the later cells summarise.
num = ["math-score", "reading-score", "writing-score"]
DataFlow[num].head()

Unnamed: 0,math-score,reading-score,writing-score
0,72,72,74
1,69,90,88
2,90,95,93
3,47,57,44
4,76,78,75


In [5]:
def _first_mode(series):
    """Return the first value reported by ``series.mode()``, or NaN if there is none."""
    modes = series.mode()
    return np.nan if modes.empty else modes.iloc[0]


# Central-tendency summary: one row per score column in `num`.
score_frame = DataFlow[num]
Total = pd.DataFrame(
    {
        "Mean": score_frame.mean(),
        "Median": score_frame.median(),
        # A plain list, so it is matched to the rows by position (order of `num`).
        "Mode": [_first_mode(DataFlow[col]) for col in num],
    }
)

# Signed gap between mean and median for each column.
Total["Mean_minus_Median"] = Total["Mean"] - Total["Median"]

Total.round(3)

Unnamed: 0,Mean,Median,Mode,Mean_minus_Median
math-score,66.089,66.0,65,0.089
reading-score,69.169,70.0,72,-0.831
writing-score,68.054,69.0,74,-0.946


In [11]:
def SkewHint(row):
    """Give a rough skew label for one summary row from its mean and median.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys "Mean" and "Median"
        (e.g. a row of the summary table).

    Returns
    -------
    str
        "~ Symmetric" when ``np.isclose`` treats the two values as equal with
        ``atol=0.25`` (NumPy's default relative tolerance still applies on top),
        "Right-skewed" when the mean is larger than the median,
        and "Left-skewed" in every remaining case.
    """
    mean_value, median_value = row["Mean"], row["Median"]
    if np.isclose(mean_value, median_value, atol=0.25):
        return "~ Symmetric"
    return "Right-skewed" if mean_value > median_value else "Left-skewed"

# Label every row of the summary table with the hint SkewHint derives from its
# Mean and Median, store it as a new column, then print the rounded table.
hint_by_subject = Total.apply(SkewHint, axis=1)
Total["Skewness(hint)"] = hint_by_subject
print("\n[1] Mean / Median / Mode")
print(Total.round(3))


[1] Mean / Median / Mode
                 Mean  Median  Mode  Mean_minus_Median Skewness(hint)
math-score     66.089    66.0    65              0.089    ~ Symmetric
reading-score  69.169    70.0    72             -0.831    Left-skewed
writing-score  68.054    69.0    74             -0.946    Left-skewed


In [13]:

Columns = ["math-score", "reading-score", "writing-score"]

# Dispersion summary per subject; the extremes are computed once and reused
# for the range.
subject_scores = DataFlow[Columns]
lowest = subject_scores.min()
highest = subject_scores.max()

dispersion = pd.DataFrame(
    {
        "Min": lowest,
        "Max": highest,
        "Range": highest - lowest,
        # ddof=1 -> sample (n - 1) variance and standard deviation.
        "Variance": subject_scores.var(ddof=1),
        "StdDev": subject_scores.std(ddof=1),
    }
)
# Turn the column-name index into an ordinary "Subject" column.
Disp = dispersion.rename_axis("Subject").reset_index()
print("\n[2] Range / Variance / Std Dev")
print(Disp.round(3))



[2] Range / Variance / Std Dev
         Subject  Min  Max  Range  Variance  StdDev
0     math-score    0  100    100   229.919  15.163
1  reading-score   17  100     83   213.166  14.600
2  writing-score   10  100     90   230.908  15.196


In [18]:
# Writing scores as a plain NumPy array for the SciPy shape statistics.
W = DataFlow.loc[:, "writing-score"].to_numpy()

# bias=False asks SciPy for the bias-corrected estimate; nan_policy="omit"
# leaves NaN entries out of the calculation.
skew_W = stats.skew(W, nan_policy="omit", bias=False)
# fisher=False selects Pearson's definition, under which a normal
# distribution has kurtosis 3.0 (this is what Kurt_text compares against).
kurt_W = stats.kurtosis(W, nan_policy="omit", bias=False, fisher=False)

def skew_text(v):
    """Describe a skewness coefficient in words.

    Parameters
    ----------
    v : float
        Skewness value to interpret.

    Returns
    -------
    str
        "~Symmetric" when ``|v| < 0.1``, "Right Skewed" when ``v`` is
        positive, otherwise "Left Skewed".
    """
    if abs(v) < 0.1:
        return "~Symmetric"
    return "Right Skewed" if v > 0 else "Left Skewed"
def Kurt_text(v):
    """Describe a Pearson kurtosis value (reference point 3) in words.

    Parameters
    ----------
    v : float
        Kurtosis value to interpret.

    Returns
    -------
    str
        "=Normal peakedness" when ``v`` lies within 0.2 of 3,
        "Leptokurtic(Sharper peak)" when ``v`` exceeds 3,
        otherwise "Platykurtic(flatter)".
    """
    if abs(v - 3) < 0.2:
        return "=Normal peakedness"
    return "Leptokurtic(Sharper peak)" if v > 3 else "Platykurtic(flatter)"

# One-row table pairing each writing-score shape statistic with its
# plain-language reading.
shape_row = {
    "Skewness": skew_W,
    "Skewness_Interpretation": skew_text(skew_W),
    "Kurtosis(Pearson)": kurt_W,
    "Kurtosis_Interpretation": Kurt_text(kurt_W),
}
Writing_Shape = pd.DataFrame([shape_row])
print("\n[3] Writing Skewness and Kurtosis")
print(Writing_Shape.round(3))
                               


[3] Writing Skewness and Kurtosis
   Skewness Skewness_Interpretation  Kurtosis(Pearson) Kurtosis_Interpretation
0    -0.289             Left Skewed              2.967      =Normal peakedness


In [19]:
# Quartiles and 1.5 x IQR fences for the math scores.
math_scores = DataFlow["math-score"]
Q1 = math_scores.quantile(0.25)
Q3 = math_scores.quantile(0.75)
IQR = Q3 - Q1
Lower = Q1 - 1.5 * IQR
Upper = Q3 + 1.5 * IQR

# A score is flagged when it falls strictly outside either fence.
Out_Mask = math_scores.lt(Lower) | math_scores.gt(Upper)
Out_Count = int(Out_Mask.sum())

iqr_summary = {
    "Q1": Q1,
    "Q3": Q3,
    "IQR": IQR,
    "Lower_Fence": Lower,
    "Upper_Fence": Upper,
    "Outliers_Count": Out_Count,
}
IQR_Table = pd.DataFrame([iqr_summary])
print("\n[4] Math Outliers using IQR")
print(IQR_Table.round(3))


[4] Math Outliers using IQR
     Q1    Q3   IQR  Lower_Fence  Upper_Fence  Outliers_Count
0  57.0  77.0  20.0         27.0        107.0               8


In [20]:
# Pairwise Pearson correlations between the score columns; `Corr` keeps the
# values rounded to three decimals.
pearson_matrix = DataFlow[num].corr(method="pearson")
Corr = pearson_matrix.round(3)
print("\n[5] Pearson Correlation Matrix")
print(Corr)


[5] Pearson Correlation Matrix
               math-score  reading-score  writing-score
math-score          1.000          0.818          0.803
reading-score       0.818          1.000          0.955
writing-score       0.803          0.955          1.000


In [22]:
# `Corr` holds values already rounded to 3 decimals, so two pairs that differ
# only beyond the third decimal would tie and the first one would win.
# Rank the pairs on a full-precision matrix and round only when printing.
corr_exact = DataFlow[num].corr(method="pearson")

# Each unordered pair once (upper triangle, diagonal excluded):
# ((column_a, column_b), correlation).
Pairs = [
    ((a, b), corr_exact.loc[a, b])
    for i, a in enumerate(num)
    for b in num[i + 1:]
]

# Strength is judged by magnitude, so a strong negative correlation counts too.
Strongest = max(Pairs, key=lambda pair: abs(pair[1]))
print(f"\n Strongest Correlation : {Strongest[0][0]} ↔ {Strongest[0][1]} = {Strongest[1]:.3f}")


 Strongest Correlation : reading-score ↔ writing-score = 0.955
