## Importing the libraries

In [1]:
import numpy as np
import pandas as pd

## Importing the dataset

In [2]:
# Read the student-performance CSV from the sibling `data` directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer=r'../data/StudentsPerformance.csv')

In [3]:
# Bare expression: the notebook renders a truncated preview (first and last rows) of the frame.
data

Unnamed: 0,Gender,Race,Parental_Level_of_Education,Lunch,Test_Preparation,Math_Score,Reading_Score,Writing_Score
0,Female,Group B,Bachelors Degree,Standard,Not Completed,72,72,74
1,Female,Group C,College,Standard,Completed,69,90,88
2,Female,Group B,Masters Degree,Standard,Not Completed,90,95,93
3,Male,Group A,Associate Degree,Free/Reduced,Not Completed,47,57,44
4,Male,Group C,College,Standard,Not Completed,76,78,75
...,...,...,...,...,...,...,...,...
995,Female,Group E,Masters Degree,Standard,Completed,88,99,95
996,Male,Group C,High School,Free/Reduced,Not Completed,62,55,55
997,Female,Group C,High School,Free/Reduced,Completed,59,71,65
998,Female,Group D,College,Standard,Completed,68,78,77


## Extracting basic information from the dataset

In [4]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Gender                       1000 non-null   object
 1   Race                         1000 non-null   object
 2   Parental_Level_of_Education  1000 non-null   object
 3   Lunch                        1000 non-null   object
 4   Test_Preparation             1000 non-null   object
 5   Math_Score                   1000 non-null   int64 
 6   Reading_Score                1000 non-null   int64 
 7   Writing_Score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [5]:
# (number of rows, number of columns)
data.shape

(1000, 8)

In [6]:
# Total number of cells in the frame (rows x columns).
data.size

8000

In [7]:
# Column labels of the frame.
data.columns

Index(['Gender', 'Race', 'Parental_Level_of_Education', 'Lunch',
       'Test_Preparation', 'Math_Score', 'Reading_Score', 'Writing_Score'],
      dtype='object')

In [8]:
# Count missing values per column (`isnull` is the pandas alias of `isna`).
data.isnull().sum()

Gender                         0
Race                           0
Parental_Level_of_Education    0
Lunch                          0
Test_Preparation               0
Math_Score                     0
Reading_Score                  0
Writing_Score                  0
dtype: int64

In [9]:
# Summary statistics for every column: include='all' adds unique/top/freq for the
# object columns alongside the numeric statistics for the score columns.
data.describe(include='all')

Unnamed: 0,Gender,Race,Parental_Level_of_Education,Lunch,Test_Preparation,Math_Score,Reading_Score,Writing_Score
count,1000,1000,1000,1000,1000,1000.0,1000.0,1000.0
unique,2,5,5,2,2,,,
top,Female,Group C,High School,Standard,Not Completed,,,
freq,518,319,375,645,642,,,
mean,,,,,,66.089,69.169,68.054
std,,,,,,15.16308,14.600192,15.195657
min,,,,,,0.0,17.0,10.0
25%,,,,,,57.0,59.0,57.75
50%,,,,,,66.0,70.0,69.0
75%,,,,,,77.0,79.0,79.0


In [10]:
# dtype of each column.
data.dtypes

Gender                         object
Race                           object
Parental_Level_of_Education    object
Lunch                          object
Test_Preparation               object
Math_Score                      int64
Reading_Score                   int64
Writing_Score                   int64
dtype: object

## Renaming every column

In [11]:
# Map the lower-case, space-separated headers (presumably the raw CSV headers)
# to Title_Snake_Case column names.
# `rename` ignores mapping keys that are not present (errors='ignore' is the
# default), so this cell leaves the frame unchanged when the loaded file
# already carries the new names.
# The result is rebound instead of using `inplace=True`, so the step stays
# chainable and does not mutate the frame object behind earlier outputs.
data = data.rename(columns={
    'gender': 'Gender',
    'race/ethnicity': 'Race',
    'parental level of education': 'Parental_Level_of_Education',
    'lunch': 'Lunch',
    'test preparation course': 'Test_Preparation',
    'math score': 'Math_Score',
    'reading score': 'Reading_Score',
    'writing score': 'Writing_Score',
})

In [12]:
# Preview the first five rows to confirm the column names.
data.head()

Unnamed: 0,Gender,Race,Parental_Level_of_Education,Lunch,Test_Preparation,Math_Score,Reading_Score,Writing_Score
0,Female,Group B,Bachelors Degree,Standard,Not Completed,72,72,74
1,Female,Group C,College,Standard,Completed,69,90,88
2,Female,Group B,Masters Degree,Standard,Not Completed,90,95,93
3,Male,Group A,Associate Degree,Free/Reduced,Not Completed,47,57,44
4,Male,Group C,College,Standard,Not Completed,76,78,75


## Working with Gender column

In [13]:
# Distinct Gender labels before case normalisation.
data['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [14]:
# Title-case the Gender labels; `assign` returns a new frame with the column replaced in place.
data = data.assign(Gender=data['Gender'].str.title())

In [15]:
# Re-check the distinct Gender labels after title-casing.
data['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [16]:
# Preview the first five rows after the Gender clean-up.
data.head()

Unnamed: 0,Gender,Race,Parental_Level_of_Education,Lunch,Test_Preparation,Math_Score,Reading_Score,Writing_Score
0,Female,Group B,Bachelors Degree,Standard,Not Completed,72,72,74
1,Female,Group C,College,Standard,Completed,69,90,88
2,Female,Group B,Masters Degree,Standard,Not Completed,90,95,93
3,Male,Group A,Associate Degree,Free/Reduced,Not Completed,47,57,44
4,Male,Group C,College,Standard,Not Completed,76,78,75


## Working with Race column

In [17]:
# Distinct Race labels before case normalisation.
data['Race'].unique()

array(['Group B', 'Group C', 'Group A', 'Group D', 'Group E'],
      dtype=object)

In [18]:
# Title-case the Race labels; `assign` returns a new frame with the column replaced in place.
data = data.assign(Race=data['Race'].str.title())

In [19]:
# Re-check the distinct Race labels after title-casing.
data['Race'].unique()

array(['Group B', 'Group C', 'Group A', 'Group D', 'Group E'],
      dtype=object)

## Working with Parental_Level_of_Education column

In [20]:
# Distinct Parental_Level_of_Education labels before normalisation.
data['Parental_Level_of_Education'].unique()

array(['Bachelors Degree', 'College', 'Masters Degree',
       'Associate Degree', 'High School'], dtype=object)

In [21]:
# Exact-match relabelling of the education categories:
#   * "some college" is folded into "college" and "some high school" into
#     "high school", so those pairs are no longer distinguished afterwards;
#   * apostrophes are removed from the degree labels -- presumably so the
#     title-casing applied afterwards does not yield forms like "Bachelor'S".
# Only values equal to a key are changed; anything else (including labels that
# are already in Title Case) passes through untouched.
education_labels = {
    "some college": "college",
    "bachelor's degree": "bachelors degree",
    "master's degree": "masters degree",
    "associate's degree": "associate degree",
    "some high school": "high school",
}
education_column = data["Parental_Level_of_Education"]
data["Parental_Level_of_Education"] = education_column.replace(education_labels)

In [22]:
# Title-case the education labels; `assign` returns a new frame with the column replaced in place.
data = data.assign(
    Parental_Level_of_Education=data['Parental_Level_of_Education'].str.title()
)

In [23]:
# Preview the first five rows after the Parental_Level_of_Education clean-up.
data.head()

Unnamed: 0,Gender,Race,Parental_Level_of_Education,Lunch,Test_Preparation,Math_Score,Reading_Score,Writing_Score
0,Female,Group B,Bachelors Degree,Standard,Not Completed,72,72,74
1,Female,Group C,College,Standard,Completed,69,90,88
2,Female,Group B,Masters Degree,Standard,Not Completed,90,95,93
3,Male,Group A,Associate Degree,Free/Reduced,Not Completed,47,57,44
4,Male,Group C,College,Standard,Not Completed,76,78,75


## Working with Lunch column

In [24]:
# Distinct Lunch labels before case normalisation.
data['Lunch'].unique()

array(['Standard', 'Free/Reduced'], dtype=object)

In [25]:
# Title-case the Lunch labels; `assign` returns a new frame with the column replaced in place.
data = data.assign(Lunch=data['Lunch'].str.title())

In [26]:
# Preview the first five rows after the Lunch clean-up.
data.head()

Unnamed: 0,Gender,Race,Parental_Level_of_Education,Lunch,Test_Preparation,Math_Score,Reading_Score,Writing_Score
0,Female,Group B,Bachelors Degree,Standard,Not Completed,72,72,74
1,Female,Group C,College,Standard,Completed,69,90,88
2,Female,Group B,Masters Degree,Standard,Not Completed,90,95,93
3,Male,Group A,Associate Degree,Free/Reduced,Not Completed,47,57,44
4,Male,Group C,College,Standard,Not Completed,76,78,75


## Working with Test_Preparation column

In [27]:
# Distinct Test_Preparation labels before normalisation.
data['Test_Preparation'].unique()

array(['Not Completed', 'Completed'], dtype=object)

In [28]:
# Title-case the Test_Preparation labels; `assign` returns a new frame with the column replaced in place.
data = data.assign(Test_Preparation=data['Test_Preparation'].str.title())

In [29]:
# Label any missing Test_Preparation entries as 'Not Completed'.
# NOTE(review): the null counts recorded earlier in this notebook are all 0, so
# on that input this step changes nothing -- confirm which input it guards against.
data = data.assign(
    Test_Preparation=data['Test_Preparation'].fillna('Not Completed')
)

## Seeing the final data

In [30]:
# Preview the first five rows of the cleaned frame.
data.head()

Unnamed: 0,Gender,Race,Parental_Level_of_Education,Lunch,Test_Preparation,Math_Score,Reading_Score,Writing_Score
0,Female,Group B,Bachelors Degree,Standard,Not Completed,72,72,74
1,Female,Group C,College,Standard,Completed,69,90,88
2,Female,Group B,Masters Degree,Standard,Not Completed,90,95,93
3,Male,Group A,Associate Degree,Free/Reduced,Not Completed,47,57,44
4,Male,Group C,College,Standard,Not Completed,76,78,75


## Saving the file

In [1]:
# Write the cleaned frame to StudentsPerformance.csv in the current working
# directory, without the index column. Note this is a different location from
# the '../data/' path the notebook reads from.
# NOTE(review): the recorded output for this cell is a NameError at execution
# count 1, i.e. it was last run on its own in a fresh kernel. Restart the kernel
# and run all cells top to bottom so `data` exists before this line executes.
data.to_csv(r'StudentsPerformance.csv', index=False)

NameError: name 'data' is not defined