# Class 5. Intermediate Python & AI 

# Intro to ML with Python. Linear regression

### Table of contents

0. Normalization and functions
1. Build train test set
2. Linear regression model

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split



In [2]:
df = pd.read_csv('../../datasets/exams_mod_cleaned.csv')

In [3]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,id_student,Year,Age
0,male,group A,high school,standard,completed,67,67,63,1000,2023,14.0
1,female,group D,high school,free/reduced,none,40,29,55,1001,2023,17.0
2,male,group E,some college,free/reduced,none,59,60,50,1002,2023,14.0
3,male,group B,high school,standard,none,77,78,68,1003,2023,17.0
4,male,group E,associate's degree,standard,completed,78,73,68,1004,2023,16.0


In [4]:
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score', 'id_student', 'Year', 'Age'],
      dtype='object')

In [5]:
df.shape

(951, 11)

# 0. Normalization, Functions and more

In [6]:
data = pd.get_dummies(
        df, columns=['gender', 'race/ethnicity', 
     'parental level of education', 'lunch', 'test preparation course'], 
drop_first=True)
data.head()

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none
0,67,67,63,1000,2023,14.0,True,False,False,False,False,False,True,False,False,True,False
1,40,29,55,1001,2023,17.0,False,False,False,True,False,False,True,False,False,False,True
2,59,60,50,1002,2023,14.0,True,False,False,False,True,False,False,False,True,False,True
3,77,78,68,1003,2023,17.0,True,True,False,False,False,False,True,False,False,True,True
4,78,73,68,1004,2023,16.0,True,False,False,False,True,False,False,False,False,True,False


In [7]:
data.columns

Index(['math score', 'reading score', 'writing score', 'id_student', 'Year',
       'Age', 'gender_male', 'race/ethnicity_group B',
       'race/ethnicity_group C', 'race/ethnicity_group D',
       'race/ethnicity_group E',
       'parental level of education_bachelor's degree',
       'parental level of education_high school',
       'parental level of education_master's degree',
       'parental level of education_some college', 'lunch_standard',
       'test preparation course_none'],
      dtype='object')

### Build the normalize function

In [8]:
def normalize_column(df, column_name):
    
    # Get the minimum and maximum values of the column
    min_value = df[column_name].min()
    max_value = df[column_name].max()

    # Normalize the column values to be in the range 0 to 1
    df[column_name] = (df[column_name] - min_value) / (max_value - min_value)
    
    return df

In [9]:
cols_to_norm = ['reading score', 'writing score']
data_norm = normalize_column(data, cols_to_norm)
data_norm[cols_to_norm].head()

Unnamed: 0,reading score,writing score
0,0.611765,0.519481
1,0.164706,0.415584
2,0.529412,0.350649
3,0.741176,0.584416
4,0.682353,0.584416


In [10]:
def normalized_funct(X):
    Xmin= X.min()
    Xmax= X.max()
    return (X-Xmin)/(Xmax-Xmin)

In [11]:
normalized_funct(data['reading score']).head()

0    0.611765
1    0.164706
2    0.529412
3    0.741176
4    0.682353
Name: reading score, dtype: float64

In [12]:
normalized_funct(data[cols_to_norm]).head()

Unnamed: 0,reading score,writing score
0,0.611765,0.519481
1,0.164706,0.415584
2,0.529412,0.350649
3,0.741176,0.584416
4,0.682353,0.584416


In [13]:
data.head()

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none
0,67,0.611765,0.519481,1000,2023,14.0,True,False,False,False,False,False,True,False,False,True,False
1,40,0.164706,0.415584,1001,2023,17.0,False,False,False,True,False,False,True,False,False,False,True
2,59,0.529412,0.350649,1002,2023,14.0,True,False,False,False,True,False,False,False,True,False,True
3,77,0.741176,0.584416,1003,2023,17.0,True,True,False,False,False,False,True,False,False,True,True
4,78,0.682353,0.584416,1004,2023,16.0,True,False,False,False,True,False,False,False,False,True,False


In [14]:
#import sikit
from sklearn.preprocessing import MinMaxScaler

In [15]:
data[['read_minmax', 'write_minmax']] = MinMaxScaler().fit_transform(df[cols_to_norm])

In [16]:
data[['reading score', 'writing score', 'read_minmax', 'write_minmax']].head()

Unnamed: 0,reading score,writing score,read_minmax,write_minmax
0,0.611765,0.519481,0.611765,0.519481
1,0.164706,0.415584,0.164706,0.415584
2,0.529412,0.350649,0.529412,0.350649
3,0.741176,0.584416,0.741176,0.584416
4,0.682353,0.584416,0.682353,0.584416


### Checking whether they are equal

In [17]:
data = data.round(5)

In [18]:
data['reading score'].equals(data['read_minmax'])

True

In [19]:
(data['reading score'].values == data['read_minmax'].values).all()

True

### Changing one value to show that it fails in this case

In [20]:
data.loc[0, 'read_minmax'] = 0.8

In [21]:
data.head(1)

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none,read_minmax,write_minmax
0,67,0.61176,0.51948,1000,2023,14.0,True,False,False,False,False,False,True,False,False,True,False,0.8,0.51948


In [22]:
data['reading score'].equals(data['read_minmax'])

False

In [23]:
(data['reading score'] == data['read_minmax']).all()

False

In [24]:
data.drop(['read_minmax', 'write_minmax'], axis=1, inplace=True)

### Functions 

In [25]:
def a_function(my_in_1, my_in_2):
    print('This is my input 1: {}'.format(my_in_1))
    print('This is my input 2: {}'.format(my_in_2))    

In [26]:
a_function(1,2)

This is my input 1: 1
This is my input 2: 2


In [27]:
def a_function_2(my_in_1, my_in_2, my_in_3):
    if my_in_1 == 1:
        return 'it is a one'
    else:
        return my_in_2

In [28]:
print(a_function_2(2,5,3))

5


In [29]:
def deduplicate_data(df):
    """
    Deduplicate my data
    Params: 
        df: pandas data frame
    Ouputs: 
        deduplicated data
    """
    
    return df.drop_duplicates(inplace=True)


def remove_irrelevant_data(df):
    """
    Function to remove data we don't want
    Params: 
        df: input dataframe
    Output: 
        red_df: reduced dataframe 
    """
    
    red_df = df[df['age'] > 20]
    
    return red_df


def fix_structural_errors(df):
    """
    Fixing misspellings in City and Gender
    Params: 
        df: input dataframe
    Outputs:
        fixed_data: dataframe with errors corrected in columns
    """
    
    fix_data_1 = correct_city(df)
    fixed_data = correct_gender(fix_data_1)
    
    return fixed_data    


def save_outputs(df):
    """
    Save dataframe
    Params: 
        df: input dataframe
    """
    
    df.to_csv('name.tsv', sep='\t', index=None)


def clean_data(data):
    """
    Funtion to clean your data
    Params:
        data: pandas dataframe with all the info
    Outputs: 
        data_cleaned: pandas dataframe after EDA and cleanup
    """
    
    no_dups = deduplicate_data(data)
    
    irr_data = remove_irrelevant_data(no_dups)
    
    fixed_errors = fix_structural_errors(irr_data)
    
    save_outputs(fixed_errors)

## 1. Preparing data for model

In [30]:
data.shape

(951, 17)

In [31]:
for col in data.columns:
    print(col, data[col].nunique())

math score 77
reading score 85
writing score 57
id_student 941
Year 1
Age 5
gender_male 2
race/ethnicity_group B 2
race/ethnicity_group C 2
race/ethnicity_group D 2
race/ethnicity_group E 2
parental level of education_bachelor's degree 2
parental level of education_high school 2
parental level of education_master's degree 2
parental level of education_some college 2
lunch_standard 2
test preparation course_none 2


In [32]:
data.drop(['Year', 'id_student'], axis=1, inplace=True)


In [33]:
data.shape

(951, 15)

In [34]:
df = df.drop_duplicates()
df.shape

(941, 11)

# 2. Build train test set 

In [35]:
def get_training_test(df):
    """
    
    """

    return train_test_split(df, test_size=0.1, random_state=0)

In [36]:
def build_train_test(df):
    """
    
    """

    train, test = get_training_test(df)

    trainY = train['math score'].values
    testY = test['math score'].values
    
    trainX = train.drop(['math score'], axis=1).values
    testX = test.drop(['math score'], axis=1).values

    return trainX, trainY, testX, testY, \
        train.drop(['math score'], axis=1).columns

In [37]:
trainX, trainY, testX, testY, feature_names = build_train_test(data)

## Easier Option

In [38]:
#load train test split from scikit learn
from sklearn.model_selection import train_test_split

In [39]:
Y = data['math score']

In [40]:
X = data.drop('math score', axis=1)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=0)

In [42]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((855, 14), (96, 14), (855,), (96,))

## Also Possible 

In [45]:
train, test = train_test_split(data, test_size=0.1, random_state=0)

In [46]:
train.shape

(855, 15)

In [47]:
test.shape

(96, 15)

In [48]:
data.columns

Index(['math score', 'reading score', 'writing score', 'Age', 'gender_male',
       'race/ethnicity_group B', 'race/ethnicity_group C',
       'race/ethnicity_group D', 'race/ethnicity_group E',
       'parental level of education_bachelor's degree',
       'parental level of education_high school',
       'parental level of education_master's degree',
       'parental level of education_some college', 'lunch_standard',
       'test preparation course_none'],
      dtype='object')

#### I will always have a train and a test set. 
#### Also, I have the features that correspond to all the data and the labels which is the column I want to predict. In this case, the math score column

In [49]:
#Separate the labels
trainY = train['math score'].values
testY = test['math score'].values

#Get all features but the labels
trainX = train.drop(['math score'], axis=1).values
testX = test.drop(['math score'], axis=1).values

## 3. Linear regression model

In [50]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

In [51]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(trainX, trainY)

# Make predictions using the testing set
y_pred = regr.predict(testX)

# The mean absolute error
print("MAE: %.2f" % mean_absolute_error(testY, y_pred))
print("MAPE: {} %".format(round(mean_absolute_percentage_error(testY, y_pred) * 100, 2)))

MAE: 5.30
MAPE: 8.67 %


In [52]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(trainX, trainY)

# Make predictions using the testing set
y_pred = regr.predict(testX)

# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(testY, y_pred))
# The mean absolute error
print("Mean absolute error: %.2f" % mean_absolute_error(testY, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(testY, y_pred))

Coefficients: 
 [13.76569065 45.10303377 -0.19966553 12.41325537 -0.60540543 -0.67618632
 -1.19467096  4.19207253 -1.02442725  1.05109989 -1.70207268 -1.77868682
  4.60584349  4.59557327]
Mean squared error: 47.89
Mean absolute error: 5.30
Coefficient of determination: 0.79


# Do normalization afterwards

In [53]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,id_student,Year,Age
0,male,group A,high school,standard,completed,67,67,63,1000,2023,14.0
1,female,group D,high school,free/reduced,none,40,29,55,1001,2023,17.0
2,male,group E,some college,free/reduced,none,59,60,50,1002,2023,14.0
3,male,group B,high school,standard,none,77,78,68,1003,2023,17.0
4,male,group E,associate's degree,standard,completed,78,73,68,1004,2023,16.0


In [54]:
data = pd.get_dummies(
        df, columns=['gender', 'race/ethnicity', 
     'parental level of education', 'lunch', 'test preparation course'], 
drop_first=True)
data.head()

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none
0,67,67,63,1000,2023,14.0,True,False,False,False,False,False,True,False,False,True,False
1,40,29,55,1001,2023,17.0,False,False,False,True,False,False,True,False,False,False,True
2,59,60,50,1002,2023,14.0,True,False,False,False,True,False,False,False,True,False,True
3,77,78,68,1003,2023,17.0,True,True,False,False,False,False,True,False,False,True,True
4,78,73,68,1004,2023,16.0,True,False,False,False,True,False,False,False,False,True,False


In [55]:
data.drop(['Year', 'Age', 'id_student'], axis=1, inplace=True)

In [56]:
data.head()

Unnamed: 0,math score,reading score,writing score,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none
0,67,67,63,True,False,False,False,False,False,True,False,False,True,False
1,40,29,55,False,False,False,True,False,False,True,False,False,False,True
2,59,60,50,True,False,False,False,True,False,False,False,True,False,True
3,77,78,68,True,True,False,False,False,False,True,False,False,True,True
4,78,73,68,True,False,False,False,True,False,False,False,False,True,False


### 1st Do the train test split first

In [57]:
train, test = train_test_split(data, test_size=0.1, random_state=0)

In [58]:
train.shape, test.shape

((846, 14), (95, 14))

### 2nd Normalize the sets separately

In [59]:
cols_to_norm = ['reading score', 'writing score']
train_norm = normalize_column(train, cols_to_norm)
test_norm = normalize_column(test, cols_to_norm)
train_norm[cols_to_norm].head()

Unnamed: 0,reading score,writing score
698,0.882353,1.0
345,0.658824,0.688312
346,0.447059,0.428571
905,0.505882,0.415584
34,0.588235,0.402597


In [60]:
test_norm[cols_to_norm].head()

Unnamed: 0,reading score,writing score
571,0.731707,1.0
304,0.512195,0.38806
308,0.439024,0.149254
590,0.560976,0.313433
614,0.195122,0.61194


In [61]:
data_norm = normalize_column(data, cols_to_norm)
data_norm[cols_to_norm].iloc[14]

reading score    0.376471
writing score    0.246753
Name: 14, dtype: float64

In [62]:
test_norm.head()

Unnamed: 0,math score,reading score,writing score,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none
571,83,0.731707,1.0,True,False,True,False,False,False,True,False,False,False,False
304,58,0.512195,0.38806,True,False,False,True,False,False,False,False,True,False,False
308,56,0.439024,0.149254,True,False,True,False,False,False,False,False,True,True,True
590,58,0.560976,0.313433,True,True,False,False,False,True,False,False,False,False,True
614,60,0.195122,0.61194,False,False,False,True,False,False,True,False,False,True,True


In [63]:
train_norm.head()

Unnamed: 0,math score,reading score,writing score,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,lunch_standard,test preparation course_none
698,85,0.882353,1.0,False,False,False,True,False,False,False,False,True,True,True
345,66,0.658824,0.688312,False,True,False,False,False,False,False,False,True,True,False
346,59,0.447059,0.428571,True,False,False,False,True,False,False,False,True,False,False
905,49,0.505882,0.415584,True,False,True,False,False,False,False,False,False,True,False
34,67,0.588235,0.402597,True,True,False,False,False,False,True,False,False,False,True


In [64]:
#Separate the labels
trainY = train_norm['math score'].values
testY = test_norm['math score'].values

#Get all features but the labels
trainX = train_norm.drop(['math score'], axis=1).values
testX = test_norm.drop(['math score'], axis=1).values

In [65]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(trainX, trainY)

# Make predictions using the testing set
y_pred = regr.predict(testX)

# The mean absolute error
print("MAE: %.2f" % mean_absolute_error(testY, y_pred))
print("MAPE: {} %".format(round(mean_absolute_percentage_error(testY, y_pred) * 100, 2)))

MAE: 5.99
MAPE: 9.0 %
