# 1. Imports

## 1.1 Importing pandas profiling

pandas-profiling is a library used in this project to facilitate the study of the dataset's variables.

In [1]:
%%capture
# The %%capture cell magic above captures (hides) everything this cell prints.
import sys
# Install/upgrade pandas-profiling with its "notebook" extra, invoking pip through
# the interpreter that runs this kernel (sys.executable).
!{sys.executable} -m pip install -U pandas-profiling[notebook]
# Enable the ipywidgets notebook extension; presumably needed by the
# profile.to_widgets() call later in the notebook — TODO confirm.
!jupyter nbextension enable --py widgetsnbextension

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import plotly.express as px
import plotly.io as pio

# Resize plots
plt.rcParams['figure.figsize'] = [22, 20]
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then print them one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/students-performance-in-exams/StudentsPerformance.csv


# 2. Reading the data

In [3]:
# Load the students' exam-performance CSV from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/students-performance-in-exams/StudentsPerformance.csv',
)

### First glance at the dataset

In [4]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Show the dtype pandas assigned to each column.
df.dtypes

gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

In [6]:
# Build a pandas-profiling report for df (explorative mode enabled, custom title)
# and render it inside the notebook as widgets.
profile = ProfileReport(df, title='Scores profiling report', explorative=True)
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [7]:
# Count the missing values in each column (summing the null mask down the rows).

df.isnull().sum(axis=0)

# Every count is zero, so the dataset has no unfilled fields.

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

# Distribution of the grades

In [8]:
# Box plot summarising the spread of the math scores.
fig = px.box(data_frame=df, y="math score")
fig.show()

In [9]:
# Histogram of the math scores.
fig = px.histogram(data_frame=df, x="math score")
fig.show()

In [10]:
# Box plot summarising the spread of the reading scores.

fig = px.box(data_frame=df, y="reading score")
fig.show()


In [11]:
# Histogram of the reading scores.
fig = px.histogram(data_frame=df, x="reading score")
fig.show()

In [12]:
# Box plot summarising the spread of the writing scores.

fig = px.box(data_frame=df, y="writing score")
fig.show()


In [13]:
# Histogram of the writing scores.
fig = px.histogram(data_frame=df, x="writing score")
fig.show()

With the `df.describe()` method and the box plots plotted here, one can see that the score distribution is very similar across the three subjects.

# Correlation

In [14]:
# Pairwise correlation between the numeric score columns, rendered as a styled
# table in which higher values get a darker blue background.
# The frame also holds text columns (gender, lunch, ...). They are excluded
# explicitly because, from pandas 2.0 on, DataFrame.corr() no longer drops
# non-numeric columns by default and raises on them instead; select_dtypes
# behaves the same on older and newer pandas versions.
df.select_dtypes(include="number").corr().style.background_gradient(cmap="Blues")

Unnamed: 0,math score,reading score,writing score
math score,1.0,0.81758,0.802642
reading score,0.81758,1.0,0.954598
writing score,0.802642,0.954598,1.0


We can see that there is a strong positive correlation among the three scores (0.80–0.95); reading and writing are the most closely related (≈0.95).

# What is the proportion of men and women?

In [15]:
# Share of each gender as a fraction of all students.
# value_counts() takes no column-name argument: its first parameter is the
# boolean `normalize` flag, so it is set explicitly by keyword here.
df2 = df['gender'].value_counts(normalize=True)
print(df2)

female    0.518
male      0.482
Name: gender, dtype: float64


In [16]:
# Inserting the values above in a python dictionary.
# Absolute number of students of each gender, keyed by the gender label.
proportion = {
    label: df.loc[df['gender'] == label, 'gender'].count()
    for label in ('female', 'male')
}

print(proportion)

{'female': 518, 'male': 482}


In [17]:
# Data to plot
# Data to plot: upper-cased gender labels and their head-counts, both taken
# from `proportion` in its insertion order so they stay aligned.
labels = [gender.upper() for gender in proportion]
sizes = list(proportion.values())

# Plot. A title and axis labels are passed explicitly because, with plain lists
# as input, plotly would otherwise title the axes with the bare names "x" and "y".
fig = px.bar(
    x=labels,
    y=sizes,
    title="Number of students by gender",
    labels={"x": "Gender", "y": "Number of students"},
)
fig.show()

One can see that the split is nearly even: 51.8% female and 48.2% male.

# What is the distribution of parental level of education?

In [18]:
# Fraction of students per parental education level, most common first.
# The first parameter of value_counts() is the boolean `normalize` flag, not a
# column name, so it is set explicitly by keyword.
df['parental level of education'].value_counts(normalize=True)

some college          0.226
associate's degree    0.222
high school           0.196
some high school      0.179
bachelor's degree     0.118
master's degree       0.059
Name: parental level of education, dtype: float64

# Which kind of lunch do the students prefer?

In [19]:
# Fraction of students per lunch type, most common first.
# The first parameter of value_counts() is the boolean `normalize` flag, not a
# column name, so it is set explicitly by keyword.
df['lunch'].value_counts(normalize=True)

standard        0.645
free/reduced    0.355
Name: lunch, dtype: float64

One can see that most students (64.5%) have the standard lunch.

# Race

In [20]:
# Fraction of students per race/ethnicity group, most common first.
# The first parameter of value_counts() is the boolean `normalize` flag, not a
# column name, so it is set explicitly by keyword.
df['race/ethnicity'].value_counts(normalize=True)

group C    0.319
group D    0.262
group B    0.190
group E    0.140
group A    0.089
Name: race/ethnicity, dtype: float64

In [21]:
# Inserting the values above in a python dictionary.
groups = ['A', 'B', 'C', 'D', 'E']

# Absolute number of students in each race/ethnicity group. The dataset labels
# the groups 'group <letter>'; the dictionary keys use 'group_<letter>'.
race2 = {
    f'group_{letter}': df.loc[df['race/ethnicity'] == f'group {letter}', 'race/ethnicity'].count()
    for letter in groups
}

print(race2)

{'group_A': 89, 'group_B': 190, 'group_C': 319, 'group_D': 262, 'group_E': 140}


In [22]:
# Group names and their head-counts, both taken from `race2` in its insertion
# order so they stay aligned.
races = list(race2.keys())
values = list(race2.values())

# Plot. A title and axis labels are passed explicitly because, with plain lists
# as input, plotly would otherwise title the axes with the bare names "x" and "y".
fig = px.bar(
    x=races,
    y=values,
    title="Number of students by race/ethnicity group",
    labels={"x": "Race/ethnicity group", "y": "Number of students"},
)
fig.show()