# Class 5. Intermediate Python & AI 

# Intro to ML with Python. Linear regression

### Table of contents

0. Normalization and functions
1. Build train test set
2. Linear regression model

In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [111]:
df = pd.read_csv('../../datasets/exams_mod_cleaned.csv')

In [112]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,id_student,Year,Age
0,male,group A,high school,standard,completed,67,67,63,1000,2023,14.0
1,female,group D,high school,free/reduced,none,40,29,55,1001,2023,17.0
2,male,group E,some college,free/reduced,none,59,60,50,1002,2023,14.0
3,male,group B,high school,standard,none,77,78,68,1003,2023,17.0
4,male,group E,associate's degree,standard,completed,78,73,68,1004,2023,16.0


In [113]:
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score', 'id_student', 'Year', 'Age'],
      dtype='object')

In [66]:
df.shape

(951, 11)

# 0. Normalization, Functions and more

In [67]:
data = pd.get_dummies(
        df, columns=['gender', 'race/ethnicity', 
     'parental level of education', 'lunch', 'test preparation course'], 
drop_first=True)
data.head()

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none
0,67,67,63,1000,2023,14.0,1,0,0,0,0,0,1,0,0,1,0
1,40,29,55,1001,2023,17.0,0,0,0,1,0,0,1,0,0,0,1
2,59,60,50,1002,2023,14.0,1,0,0,0,1,0,0,0,1,0,1
3,77,78,68,1003,2023,17.0,1,1,0,0,0,0,1,0,0,1,1
4,78,73,68,1004,2023,16.0,1,0,0,0,1,0,0,0,0,1,0


In [68]:
data.columns

Index(['math score', 'reading score', 'writing score', 'id_student', 'Year',
       'Age', 'gender_male', 'race/ethnicity_group B',
       'race/ethnicity_group C', 'race/ethnicity_group D',
       'race/ethnicity_group E',
       'parental level of education_bachelor's degree',
       'parental level of education_high school',
       'parental level of education_master's degree',
       'parental level of education_some college', 'lunch_standard',
       'test preparation course_none'],
      dtype='object')

### Build the normalize function

In [69]:
def normalize_column(df, column_name):
    
    # Get the minimum and maximum values of the column
    min_value = df[column_name].min()
    max_value = df[column_name].max()

    # Normalize the column values to be in the range 0 to 1
    df[column_name] = (df[column_name] - min_value) / (max_value - min_value)
    
    return df

In [70]:
cols_to_norm = ['reading score', 'writing score']
data_norm = normalize_column(data, cols_to_norm)
data_norm[cols_to_norm].head()

Unnamed: 0,reading score,writing score
0,0.611765,0.519481
1,0.164706,0.415584
2,0.529412,0.350649
3,0.741176,0.584416
4,0.682353,0.584416


In [71]:
def normalized_funct(X):
    Xmin= X.min()
    Xmax= X.max()
    return (X-Xmin)/(Xmax-Xmin)

In [72]:
normalized_funct(data['reading score']).head()

0    0.611765
1    0.164706
2    0.529412
3    0.741176
4    0.682353
Name: reading score, dtype: float64

In [73]:
normalized_funct(data[cols_to_norm]).head()

Unnamed: 0,reading score,writing score
0,0.611765,0.519481
1,0.164706,0.415584
2,0.529412,0.350649
3,0.741176,0.584416
4,0.682353,0.584416


In [74]:
data.head()

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none
0,67,0.611765,0.519481,1000,2023,14.0,1,0,0,0,0,0,1,0,0,1,0
1,40,0.164706,0.415584,1001,2023,17.0,0,0,0,1,0,0,1,0,0,0,1
2,59,0.529412,0.350649,1002,2023,14.0,1,0,0,0,1,0,0,0,1,0,1
3,77,0.741176,0.584416,1003,2023,17.0,1,1,0,0,0,0,1,0,0,1,1
4,78,0.682353,0.584416,1004,2023,16.0,1,0,0,0,1,0,0,0,0,1,0


In [75]:
#import sikit
from sklearn.preprocessing import MinMaxScaler

In [76]:
data[['read_minmax', 'write_minmax']] = MinMaxScaler().fit_transform(df[cols_to_norm])

In [77]:
data[['reading score', 'writing score', 'read_minmax', 'write_minmax']].head()

Unnamed: 0,reading score,writing score,read_minmax,write_minmax
0,0.611765,0.519481,0.611765,0.519481
1,0.164706,0.415584,0.164706,0.415584
2,0.529412,0.350649,0.529412,0.350649
3,0.741176,0.584416,0.741176,0.584416
4,0.682353,0.584416,0.682353,0.584416


### Checking whether they are equal

In [78]:
data = data.round(5)

In [79]:
data['reading score'].equals(data['read_minmax'])

True

In [80]:
(data['reading score'].values == data['read_minmax'].values).all()

True

### Changing one value to show that it fails in this case

In [81]:
data.loc[0, 'read_minmax'] = 0.8

In [82]:
data.head(1)

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none,read_minmax,write_minmax
0,67,0.61176,0.51948,1000,2023,14.0,1,0,0,0,0,0,1,0,0,1,0,0.8,0.51948


In [83]:
data['reading score'].equals(data['read_minmax'])

False

In [84]:
(data['reading score'] == data['read_minmax']).all()

False

In [85]:
data.drop(['read_minmax', 'write_minmax'], axis=1, inplace=True)

### Functions 

In [86]:
def a_function(my_in_1, my_in_2):
    print('This is my input 1: {}'.format(my_in_1))
    print('This is my input 2: {}'.format(my_in_2))    

In [87]:
a_function(1,2)

This is my input 1: 1
This is my input 2: 2


In [88]:
def a_function_2(my_in_1, my_in_2, my_in_3):
    if my_in_1 == 1:
        return 'it is a one'
    else:
        return my_in_2

In [89]:
print(a_function_2(2,5,3))

5


In [90]:
def deduplicate_data(df):
    """
    Deduplicate my data
    Params: 
        df: pandas data frame
    Ouputs: 
        deduplicated data
    """
    
    return df.drop_duplicates(inplace=True)


def remove_irrelevant_data(df):
    """
    Function to remove data we don't want
    Params: 
        df: input dataframe
    Output: 
        red_df: reduced dataframe 
    """
    
    red_df = df[df['age'] > 20]
    
    return red_df


def fix_structural_errors(df):
    """
    Fixing misspellings in City and Gender
    Params: 
        df: input dataframe
    Outputs:
        fixed_data: dataframe with errors corrected in columns
    """
    
    fix_data_1 = correct_city(df)
    fixed_data = correct_gender(fix_data_1)
    
    return fixed_data    


def save_outputs(df):
    """
    Save dataframe
    Params: 
        df: input dataframe
    """
    
    df.to_csv('name.tsv', sep='\t', index=None)


def clean_data(data):
    """
    Funtion to clean your data
    Params:
        data: pandas dataframe with all the info
    Outputs: 
        data_cleaned: pandas dataframe after EDA and cleanup
    """
    
    no_dups = deduplicate_data(data)
    
    irr_data = remove_irrelevant_data(no_dups)
    
    fixed_errors = fix_structural_errors(irr_data)
    
    save_outputs(fixed_errors)

## 1. Preparing data for model

In [91]:
data.shape

(951, 17)

In [92]:
for col in data.columns:
    print(col, data[col].nunique())

math score 77
reading score 85
writing score 57
id_student 941
Year 1
Age 5
gender_male 2
race/ethnicity_group B 2
race/ethnicity_group C 2
race/ethnicity_group D 2
race/ethnicity_group E 2
parental level of education_bachelor's degree 2
parental level of education_high school 2
parental level of education_master's degree 2
parental level of education_some college 2
lunch_standard 2
test preparation course_none 2


In [93]:
data.drop(['Year', 'id_student'], axis=1, inplace=True)


In [94]:
data.shape

(951, 15)

In [95]:
df = df.drop_duplicates()
df.shape

(941, 11)

# 2. Build train test set 

In [96]:
def get_training_test(df):
    """
    
    """

    return train_test_split(df, test_size=0.1, random_state=0)

In [97]:
def build_train_test(df):
    """
    
    """

    train, test = get_training_test(df)

    trainY = train['math score'].values
    testY = test['math score'].values
    
    trainX = train.drop(['math score'], axis=1).values
    testX = test.drop(['math score'], axis=1).values

    return trainX, trainY, testX, testY, \
        train.drop(['math score'], axis=1).columns

In [98]:
trainX, trainY, testX, testY, feature_names = build_train_test(data)

In [101]:
#load train test split from scikit learn
from sklearn.model_selection import train_test_split

In [117]:
Y = data['math score']

In [118]:
X = data.drop('math score', axis=1)

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=0)

In [124]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((855, 14), (96, 14), (855,), (96,))

In [100]:
train, test = train_test_split(data, test_size=0.1, random_state=0)

In [45]:
train.shape

(855, 14)

In [46]:
test.shape

(96, 14)

In [47]:
data.columns

Index(['math score', 'reading score', 'writing score', 'gender_male',
       'race/ethnicity_group B', 'race/ethnicity_group C',
       'race/ethnicity_group D', 'race/ethnicity_group E',
       'parental level of education_bachelor's degree',
       'parental level of education_high school',
       'parental level of education_master's degree',
       'parental level of education_some college', 'lunch_standard',
       'test preparation course_none'],
      dtype='object')

#### I will always have a train and a test set. 
#### Also, I have the features that correspond to all the data and the labels which is the column I want to predict. In this case, the math score column

In [48]:
#Separate the labels
trainY = train['math score'].values
testY = test['math score'].values

#Get all features but the labels
trainX = train.drop(['math score'], axis=1).values
testX = test.drop(['math score'], axis=1).values

## 3. Linear regression model

In [108]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

In [109]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(trainX, trainY)

# Make predictions using the testing set
y_pred = regr.predict(testX)

# The mean absolute error
print("MAE: %.2f" % mean_absolute_error(testY, y_pred))
print("MAPE: {} %".format(round(mean_absolute_percentage_error(testY, y_pred) * 100, 2)))

MAE: 5.30
MAPE: 8.67 %


In [110]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(trainX, trainY)

# Make predictions using the testing set
y_pred = regr.predict(testX)

# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(testY, y_pred))
# The mean absolute error
print("Mean absolute error: %.2f" % mean_absolute_error(testY, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(testY, y_pred))

Coefficients: 
 [13.76569065 45.10303377 -0.19966553 12.41325537 -0.60540543 -0.67618632
 -1.19467096  4.19207253 -1.02442725  1.05109989 -1.70207268 -1.77868682
  4.60584349  4.59557327]
Mean squared error: 47.89
Mean absolute error: 5.30
Coefficient of determination: 0.79


# Do normalization afterwards

In [370]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,id_student,Year,Age
0,male,group A,high school,standard,completed,67,67,63,1000,2022,17.0
1,male,group E,some college,free/reduced,none,59,60,50,1002,2022,17.0
2,male,group B,high school,standard,none,77,78,68,1003,2022,17.0
3,male,group E,associate's degree,standard,completed,78,73,68,1004,2022,17.0
4,female,group D,high school,standard,none,63,77,76,1005,2022,17.0


In [371]:
data = pd.get_dummies(
        df, columns=['gender', 'race/ethnicity', 
     'parental level of education', 'lunch', 'test preparation course'], 
drop_first=True)
data.head()

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none
0,67,67,63,1000,2022,17.0,1,0,0,0,0,0,1,0,0,0,1,0
1,59,60,50,1002,2022,17.0,1,0,0,0,1,0,0,0,1,0,0,1
2,77,78,68,1003,2022,17.0,1,1,0,0,0,0,1,0,0,0,1,1
3,78,73,68,1004,2022,17.0,1,0,0,0,1,0,0,0,0,0,1,0
4,63,77,76,1005,2022,17.0,0,0,0,1,0,0,1,0,0,0,1,1


In [372]:
data.drop(['Year', 'Age', 'id_student'], axis=1, inplace=True)

In [373]:
data.head()

Unnamed: 0,math score,reading score,writing score,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none
0,67,67,63,1,0,0,0,0,0,1,0,0,0,1,0
1,59,60,50,1,0,0,0,1,0,0,0,1,0,0,1
2,77,78,68,1,1,0,0,0,0,1,0,0,0,1,1
3,78,73,68,1,0,0,0,1,0,0,0,0,0,1,0
4,63,77,76,0,0,0,1,0,0,1,0,0,0,1,1


### 1st Do the train test split first

In [125]:
train, test = train_test_split(data, test_size=0.1, random_state=0)

In [126]:
train.shape, test.shape

((855, 15), (96, 15))

### 2nd Normalize the sets separately

In [127]:
cols_to_norm = ['reading score', 'writing score']
train_norm = normalize_column(train, cols_to_norm)
test_norm = normalize_column(test, cols_to_norm)
train_norm[cols_to_norm].head()

Unnamed: 0,reading score,writing score
489,0.71765,0.62338
262,0.76471,0.74026
8,0.48235,0.54545
776,0.56471,0.53247
523,0.36471,0.44156


In [128]:
test_norm[cols_to_norm].head()

Unnamed: 0,reading score,writing score
873,0.636365,0.537311
568,0.16883,0.552239
249,0.16883,0.567168
331,0.480516,0.462689
492,0.571422,0.567168


In [129]:
data_norm = normalize_column(data, cols_to_norm)
data_norm[cols_to_norm].iloc[14]

reading score    0.37647
writing score    0.24675
Name: 14, dtype: float64

In [130]:
test_norm.head()

Unnamed: 0,math score,reading score,writing score,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none
873,71,0.636365,0.537311,16.0,1,0,1,0,0,0,0,0,0,1,1
568,89,0.16883,0.552239,14.0,1,0,0,1,0,0,1,0,0,1,1
249,71,0.16883,0.567168,17.0,0,0,1,0,0,0,1,0,0,1,1
331,58,0.480516,0.462689,17.0,0,1,0,0,0,1,0,0,0,1,1
492,63,0.571422,0.567168,14.0,0,0,0,0,0,1,0,0,0,1,1


In [131]:
train_norm.head()

Unnamed: 0,math score,reading score,writing score,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none
489,79,0.71765,0.62338,15.0,0,0,0,0,1,0,0,0,1,1,1
262,77,0.76471,0.74026,16.0,1,0,1,0,0,0,0,0,0,1,1
8,63,0.48235,0.54545,15.0,1,0,0,1,0,0,1,0,0,1,1
776,65,0.56471,0.53247,15.49663,1,0,0,1,0,0,1,0,0,1,1
523,40,0.36471,0.44156,17.0,0,0,0,1,0,0,1,0,0,1,1


In [132]:
#Separate the labels
trainY = train_norm['math score'].values
testY = test_norm['math score'].values

#Get all features but the labels
trainX = train_norm.drop(['math score'], axis=1).values
testX = test_norm.drop(['math score'], axis=1).values

In [133]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(trainX, trainY)

# Make predictions using the testing set
y_pred = regr.predict(testX)

# The mean absolute error
print("MAE: %.2f" % mean_absolute_error(testY, y_pred))
print("MAPE: {} %".format(round(mean_absolute_percentage_error(testY, y_pred) * 100, 2)))

MAE: 6.26
MAPE: 9.82 %
