# Understanding the data

In [6]:
import pandas as pd

df = pd.read_csv("./datasets/StudentsPerformance.csv")
print(df.info())
print(df.isnull().sum())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [7]:
categorical_columns = ['gender', 'race/ethnicity', 'lunch', 'test preparation course']
numerical_columns = ["math score", "reading score", "writing score"]

# since parental level of education doesnt matter, we will drop the column
df = df.drop("parental level of education",axis=1)
df.head()

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,reading score,writing score
0,female,group B,standard,none,72,72,74
1,female,group C,standard,completed,69,90,88
2,female,group B,standard,none,90,95,93
3,male,group A,free/reduced,none,47,57,44
4,male,group C,standard,none,76,78,75


In [8]:
# data is fairly balanced
print(df.gender.value_counts())
print(df['race/ethnicity'].value_counts())
print(df.lunch.value_counts())
print(df['test preparation course'].value_counts())

gender
female    518
male      482
Name: count, dtype: int64
race/ethnicity
group C    319
group D    262
group B    190
group E    140
group A     89
Name: count, dtype: int64
lunch
standard        645
free/reduced    355
Name: count, dtype: int64
test preparation course
none         642
completed    358
Name: count, dtype: int64


# Deciding what to do
we have a few options for building models
- classification model that groups students into advanced, average and slow learners based on these features
- Regression model predicting the score
- Prediction model finding if the student will take a test prep course based on their input features

# Classification model that groups students into advanced, average and slow learners
todo: 
- create new columns defining a student's total score, average score and performance measure
- convert categorical data to numeric
- train the model and predict

In [9]:
# creating new columns
df['total score'] = df[['math score', 'reading score', 'writing score']].sum(axis=1)
df['average score'] = df['total score']/3

In [10]:
def categorize(score):
    if score >= 80:
        return 'Advanced'
    elif score >= 40:
        return 'Average'
    else:
        return 'Slow'

In [11]:
df['performance'] = df['average score'].apply(categorize)
df.head()

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,reading score,writing score,total score,average score,performance
0,female,group B,standard,none,72,72,74,218,72.666667,Average
1,female,group C,standard,completed,69,90,88,247,82.333333,Advanced
2,female,group B,standard,none,90,95,93,278,92.666667,Advanced
3,male,group A,free/reduced,none,47,57,44,148,49.333333,Average
4,male,group C,standard,none,76,78,75,229,76.333333,Average


In [12]:
mod_df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
mod_df.head()

Unnamed: 0,math score,reading score,writing score,total score,average score,performance,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_standard,test preparation course_none
0,72,72,74,218,72.666667,Average,False,True,False,False,False,True,True
1,69,90,88,247,82.333333,Advanced,False,False,True,False,False,True,False
2,90,95,93,278,92.666667,Advanced,False,True,False,False,False,True,True
3,47,57,44,148,49.333333,Average,True,False,False,False,False,False,True
4,76,78,75,229,76.333333,Average,True,False,True,False,False,True,True


## Choosing model algorithm
ill go with random forest classifier as it needs minimum data preprocessing for now and gets good results

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_absolute_error

X = mod_df.drop(columns=['performance', 'total score', 'average score'])
y = mod_df['performance']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# prediction
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Advanced       0.98      0.92      0.95        53
     Average       0.98      1.00      0.99       232
        Slow       1.00      0.93      0.97        15

    accuracy                           0.98       300
   macro avg       0.99      0.95      0.97       300
weighted avg       0.98      0.98      0.98       300



# taking custom inputs

In [33]:
import pandas as pd
custom = {'math score': 86, 'reading score': 88, 'writing score': 90, 
          'gender': "male", 'race/ethnicity': "group B", 
          'lunch': 'standard', 'test preparation course': 'completed'}
cuzz = pd.DataFrame([custom])
cuzz

Unnamed: 0,math score,reading score,writing score,gender,race/ethnicity,lunch,test preparation course
0,86,88,90,male,group B,standard,completed


In [28]:
X_test.columns

Index(['math score', 'reading score', 'writing score', 'gender_male',
       'race/ethnicity_group B', 'race/ethnicity_group C',
       'race/ethnicity_group D', 'race/ethnicity_group E', 'lunch_standard',
       'test preparation course_none'],
      dtype='object')

In [36]:
def encode_inp(df):
    if df['gender'] == 'male':
        df['gender_male'] = True
    else: 
        df['gender_male'] = False

    if df['race/ethnicity'] == 'group B':
        df['race/ethnicity_group B'] = True
    else:
        df['race/ethnicity_group B'] = False
    if df['race/ethnicity'] == 'group C':
        df['race/ethnicity_group C'] = True
    else:
        df['race/ethnicity_group C'] = False
    if df['race/ethnicity'] == 'group D':
        df['race/ethnicity_group D'] = True 
    else:
        df['race/ethnicity_group D'] = False
    if df['race/ethnicity'] == 'group D':
        df['race/ethnicity_group E'] = True
    else:
         df['race/ethnicity_group E'] = False
    if df['lunch'] == 'standard':
        df['lunch_standard'] = True
    else:
        df['lunch_standard'] = False
    if df['test preparation course'] == 'completed':
        df['test preparation course_none'] = False
    else:
        df['test preparation course_none'] = True
    return df

In [41]:
encode_inp(cuzz)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().