# Import libraries and the dataset

In [7]:
import pandas as pd

In [45]:
# Source of the students-performance data (CSV hosted on GitHub)
DATA_URL = 'https://raw.githubusercontent.com/luiseduardogfranca/students-performance/master/StudentsPerformance.csv'

# Read the CSV into a DataFrame and preview its first rows
students = pd.read_csv(DATA_URL)
students.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# Mission 1
Split the dataset into a training and a testing set. Allocate 80% of data for training and 20% for testing. Use 'gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course' as model inputs, and 'math score' as a predicted value. You should end up with 4 arrays: training set, training set's predicted values, testing set, testing set's predicted values.

In [9]:
from sklearn.model_selection import train_test_split

# Model inputs: the five categorical descriptors of each student
columns = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
]

# Independent copies, so later edits never write back into `students`
X = students[columns].copy()
y = students["math score"].copy()

# 80 % of the rows for training, 20 % for testing;
# the fixed random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [47]:
# Dimensions of the full (unsplit) feature frame as (rows, columns)
X.shape

(1000, 5)

# Mission 2
Your task is to encode all categorical features into numerical ones, to make sure the dataset is prepared for a numerical model. You might want to use different encoding approaches for different features based on their meaning and your intuition.

**Answer**:
1. `gender` and `lunch` are binary, so they can be encoded with the default _OrdinalEncoder_.
2. `parental level of education` is ordinal, so its category order also needs to be specified for _OrdinalEncoder_.
3. The `test preparation course` category order needs to be specified for _OrdinalEncoder_ (completed = 1, none = 0).
4. `race/ethnicity` has no natural order, so it will be encoded by _OneHotEncoder_. Also, it is odd that anyone would track someone's race/ethnicity in an academic environment.

In [69]:
# Getting unique values for each column.
# A plain loop is used because printing is a side effect: a list comprehension
# would only build a throwaway list of None values (which then had to be kept
# out of the cell output by a trailing print()).
for column in X.columns:
    print(f"Unique values in column [{column}]: {X[column].unique()}")

Unique values in column [gender]: ['female' 'male']
Unique values in column [race/ethnicity]: ['group B' 'group C' 'group A' 'group D' 'group E']
Unique values in column [parental level of education]: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Unique values in column [lunch]: ['standard' 'free/reduced']
Unique values in column [test preparation course]: ['none' 'completed']



In [71]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Creating column transformer with all columns in identical order
# (the transformers are listed in the same order as the columns of X, which
# the feature-name list built further down relies on)
ct = make_column_transformer(

    # Simple "gender" encoding
    # (default OrdinalEncoder sorts the categories: female = 0, male = 1)
    (OrdinalEncoder(), ["gender"]),

    # One-hot encoding groups (race/ethnicity)
    # order is specified (to be robust against missing categories in test data)
    (OneHotEncoder(
        categories=[["group A",
                     "group B",
                     "group C",
                     "group D",
                     "group E"]]),
     ["race/ethnicity"]),

    # Ordinal encoding of parental education level,
    # order is important and specified
    # NOTE(review): "associate's degree" is ranked below "some college" here;
    # confirm that this is the intended ordering
    (OrdinalEncoder(
        categories=[["some high school",
                     "high school",
                     "associate's degree",
                     "some college",
                     "bachelor's degree",
                     "master's degree"]]),
     ["parental level of education"]),

    # Simple "lunch" encoding
    # (default OrdinalEncoder sorts the categories: free/reduced = 0, standard = 1)
    (OrdinalEncoder(),
     ["lunch"]),

    # Specific ordinal encoding,
    # none = 0, completed = 1
    (OrdinalEncoder(
        categories=[["none",
                     "completed"]]),
     ["test preparation course"]),

    # All the rest of the columns will pass through
    remainder="passthrough",

    # Always return a dense array: with the default threshold the stacked
    # output may come back as a sparse matrix when its density is low, and
    # that cannot be wrapped by pd.DataFrame(..., columns=...) below
    sparse_threshold=0,
)

# Fitting on the entire set to get all the categories
# (nothing is transformed at this point; the encoders only learn their categories)
ct.fit(X)

# Features names collected from the column transformer.
# The one-hot names are built from the fitted categories rather than with
# OneHotEncoder.get_feature_names(), which was removed in scikit-learn 1.2;
# the "x0_" prefix reproduces the names that method used to generate.
onehot = ct.named_transformers_["onehotencoder"]
onehot_features = [f"x0_{category}" for category in onehot.categories_[0]]
features = ([X.columns[0]]
            + onehot_features
            + list(X.columns[2:])
)

# Transforming the train and test sets using the fitted column transformer.
# The raw rows are looked up in X through the index of y_train / y_test
# (train_test_split keeps the original row labels on both outputs), so this
# cell can be re-run even though it overwrites X_train and X_test.
# The resulting frames get a fresh 0..n-1 index, so they line up with
# y_train / y_test by position, not by index label.
X_train = pd.DataFrame(ct.transform(X.loc[y_train.index]), columns=features)
X_test = pd.DataFrame(ct.transform(X.loc[y_test.index]), columns=features)


In [72]:
# Dimensions of the encoded training features as (rows, columns)
X_train.shape

(800, 9)

In [73]:
# Dimensions of the encoded testing features as (rows, columns)
X_test.shape

(200, 9)

In [74]:
# Display the encoded training features
X_train

Unnamed: 0,gender,x0_group A,x0_group B,x0_group C,x0_group D,x0_group E,parental level of education,lunch,test preparation course
0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
795,0.0,0.0,0.0,0.0,1.0,0.0,5.0,1.0,0.0
796,1.0,0.0,0.0,1.0,0.0,0.0,4.0,1.0,0.0
797,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0
798,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,1.0


In [75]:
# Display the encoded testing features
X_test



Unnamed: 0,gender,x0_group A,x0_group B,x0_group C,x0_group D,x0_group E,parental level of education,lunch,test preparation course
0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
196,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0
197,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0
198,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0
