# Class 5. Intermediate Python & AI 

# Intro to ML with Python. Linear regression

### Table of contents

0. Normalization, functions and more
1. Preparing data for model
2. Build train test set
3. Linear regression model

In [232]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [233]:
df = pd.read_csv('../../datasets/exams_mod_cleaned.csv')

In [234]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,id_student,Year,Age
0,male,group A,high school,standard,completed,67,67,63,1000,2022,17.0
1,male,group E,some college,free/reduced,none,59,60,50,1002,2022,17.0
2,male,group B,high school,standard,none,77,78,68,1003,2022,17.0
3,male,group E,associate's degree,standard,completed,78,73,68,1004,2022,17.0
4,female,group D,high school,standard,none,63,77,76,1005,2022,17.0


In [235]:
df.shape

(963, 11)

## 0. Normalization, Functions and more

In [236]:
# Categorical columns that get replaced by 0/1 indicator columns
categorical_cols = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

# drop_first=True omits the first level of every category, so a column with
# k levels becomes k-1 indicator columns
data = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
data.head()

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none
0,67,67,63,1000,2022,17.0,1,0,0,0,0,0,1,0,0,0,1,0
1,59,60,50,1002,2022,17.0,1,0,0,0,1,0,0,0,1,0,0,1
2,77,78,68,1003,2022,17.0,1,1,0,0,0,0,1,0,0,0,1,1
3,78,73,68,1004,2022,17.0,1,0,0,0,1,0,0,0,0,0,1,0
4,63,77,76,1005,2022,17.0,0,0,0,1,0,0,1,0,0,0,1,1


### Build the normalize function

In [237]:
def normalize_column(df, column_name):
    """
    Min-max scale one or more columns of a dataframe to the range 0 to 1.

    The dataframe is modified IN PLACE and is also returned, so the returned
    object is the very same dataframe that was passed in.

    Params:
        df: pandas dataframe holding the column(s) to scale
        column_name: a single column label or a list of column labels
    Outputs:
        df: the same dataframe object with the selected column(s) rescaled.
            A column whose values are all identical is mapped to 0.0 rather
            than NaN (its range is zero, so the plain formula would be 0/0).
    """

    # Get the minimum and the spread (max - min) of the column(s)
    min_value = df[column_name].min()
    value_range = df[column_name].max() - min_value

    # A constant column has zero spread; divide by 1 instead so the result is
    # 0.0 and not NaN. With a list of columns the spread is a Series.
    if isinstance(value_range, pd.Series):
        value_range = value_range.replace(0, 1)
    elif value_range == 0:
        value_range = 1

    # Normalize the column values to be in the range 0 to 1
    df[column_name] = (df[column_name] - min_value) / value_range

    return df

In [238]:
cols_to_norm = ['reading score', 'writing score']
# normalize_column rescales the columns of `data` itself and returns that same
# object, so `data_norm` is a second name for `data`, not an independent copy.
data_norm = normalize_column(data, cols_to_norm)
data_norm[cols_to_norm].head()

Unnamed: 0,reading score,writing score
0,0.547945,0.519481
1,0.452055,0.350649
2,0.69863,0.584416
3,0.630137,0.584416
4,0.684932,0.688312


In [239]:
#From Martin Colleoni
def normalized_funct(X):
    """
    Min-max scale X and return the result without modifying X.

    Params:
        X: object exposing .min() and .max() and supporting arithmetic
           (e.g. a pandas Series or DataFrame)
    Outputs:
        (X - min) / (max - min)
    """
    lowest = X.min()
    spread = X.max() - lowest
    return (X - lowest) / spread

In [240]:
normalized_funct(data['reading score']).head()

0    0.547945
1    0.452055
2    0.698630
3    0.630137
4    0.684932
Name: reading score, dtype: float64

In [241]:
normalized_funct(data[cols_to_norm]).head()

Unnamed: 0,reading score,writing score
0,0.547945,0.519481
1,0.452055,0.350649
2,0.69863,0.584416
3,0.630137,0.584416
4,0.684932,0.688312


In [242]:
data.head()

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none
0,67,0.547945,0.519481,1000,2022,17.0,1,0,0,0,0,0,1,0,0,0,1,0
1,59,0.452055,0.350649,1002,2022,17.0,1,0,0,0,1,0,0,0,1,0,0,1
2,77,0.69863,0.584416,1003,2022,17.0,1,1,0,0,0,0,1,0,0,0,1,1
3,78,0.630137,0.584416,1004,2022,17.0,1,0,0,0,1,0,0,0,0,0,1,0
4,63,0.684932,0.688312,1005,2022,17.0,0,0,0,1,0,0,1,0,0,0,1,1


In [243]:
# Import the min-max scaler from scikit-learn
from sklearn.preprocessing import MinMaxScaler

In [244]:
# Scale the raw scores taken from `df` (the same columns in `data` were already
# normalized above) and store them next to the hand-written result to compare.
data[['read_minmax', 'write_minmax']] = MinMaxScaler().fit_transform(df[cols_to_norm])

In [245]:
data[['reading score', 'writing score', 'read_minmax', 'write_minmax']].head()

Unnamed: 0,reading score,writing score,read_minmax,write_minmax
0,0.547945,0.519481,0.547945,0.519481
1,0.452055,0.350649,0.452055,0.350649
2,0.69863,0.584416,0.69863,0.584416
3,0.630137,0.584416,0.630137,0.584416
4,0.684932,0.688312,0.684932,0.688312


### Checking whether they are equal

In [246]:
# Round to 5 decimals before comparing — presumably so the exact equality
# checks below are not tripped by tiny floating-point differences between the
# two normalization methods.
data = data.round(5)

In [247]:
data['reading score'].equals(data['read_minmax'])

True

In [248]:
(data['reading score'].values == data['read_minmax'].values).all()

True

### Changing one value to show that it fails in this case

In [249]:
data.loc[0, 'read_minmax'] = 0.8

In [250]:
data.head(1)

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none,read_minmax,write_minmax
0,67,0.54795,0.51948,1000,2022,17.0,1,0,0,0,0,0,1,0,0,0,1,0,0.8,0.51948


In [251]:
data['reading score'].equals(data['read_minmax'])

False

In [252]:
(data['reading score'] == data['read_minmax']).all()

False

In [254]:
# Remove the scikit-learn comparison columns from `data` again.
# This works in place, so running the cell a second time raises a KeyError
# because the columns are already gone.
data.drop(['read_minmax', 'write_minmax'], axis=1, inplace=True)

### Functions 

In [255]:
def deduplicate_data(df):
    """
    Remove fully duplicated rows from a dataframe.

    The rows are dropped IN PLACE (the dataframe passed in is modified) and
    the same dataframe is returned so the result can be chained or assigned.

    Params: 
        df: pandas data frame
    Outputs: 
        df: the same dataframe object, now without duplicated rows
    """

    # drop_duplicates(inplace=True) returns None, so it must not be returned
    # directly; return the modified dataframe instead.
    df.drop_duplicates(inplace=True)

    return df


def remove_irrelevant_data(df):
    """
    Keep only the rows whose 'age' value is strictly greater than 20.

    Params: 
        df: input dataframe; must contain an 'age' column
    Output: 
        reduced dataframe holding just the rows that pass the filter
    """

    # Boolean mask: True for every row to keep
    is_over_20 = df['age'] > 20

    return df[is_over_20]


def fix_structural_errors(df):
    """
    Run the city correction followed by the gender correction.

    Relies on the helpers correct_city and correct_gender, which are not
    defined in this part of the notebook and must exist before calling.

    Params: 
        df: input dataframe
    Outputs:
        the result of correct_gender applied to the output of correct_city
    """

    # City is corrected first; its result feeds the gender correction
    return correct_gender(correct_city(df))


def save_outputs(df, path='name.tsv'):
    """
    Save a dataframe as a tab-separated file without the index column.

    Params: 
        df: input dataframe
        path: destination file; defaults to 'name.tsv' in the current
              working directory
    """

    # index=False keeps the row index out of the file
    df.to_csv(path, sep='\t', index=False)


def clean_data(data):
    """
    Function to clean your data: deduplicate, filter out irrelevant rows,
    fix structural errors, save the result and return it.

    Params:
        data: pandas dataframe with all the info
    Outputs: 
        data_cleaned: pandas dataframe after the cleanup steps
    """

    no_dups = deduplicate_data(data)
    if no_dups is None:
        # Guard for a deduplicate_data that drops duplicates in place and
        # returns nothing: the cleaned rows are then in `data` itself.
        no_dups = data

    irr_data = remove_irrelevant_data(no_dups)

    # Call the fixer on the filtered rows (the function object itself must
    # not be passed on to save_outputs)
    fixed_errors = fix_structural_errors(irr_data)

    save_outputs(fixed_errors)

    return fixed_errors

## 1. Preparing data for model

In [256]:
data.shape

(963, 18)

In [257]:
# Print how many distinct values each column holds
for col, n_unique in data.nunique().items():
    print(col, n_unique)

math score 77
reading score 73
writing score 57
id_student 954
Year 1
Age 1
gender_male 2
race/ethnicity_group B 2
race/ethnicity_group C 2
race/ethnicity_group D 2
race/ethnicity_group E 2
parental level of education_bachelor's degree 2
parental level of education_high school 2
parental level of education_master's degree 2
parental level of education_some college 2
parental level of education_some high school 2
lunch_standard 2
test preparation course_none 2


In [258]:
# 'Year' and 'Age' each hold a single distinct value (see the counts above),
# and 'id_student' is a row identifier, so none of them is kept as a feature.
data.drop(['Year', 'Age', 'id_student'], axis=1, inplace=True)

In [259]:
data.head()

Unnamed: 0,math score,reading score,writing score,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none
0,67,0.54795,0.51948,1,0,0,0,0,0,1,0,0,0,1,0
1,59,0.45205,0.35065,1,0,0,0,1,0,0,0,1,0,0,1
2,77,0.69863,0.58442,1,1,0,0,0,0,1,0,0,0,1,1
3,78,0.63014,0.58442,1,0,0,0,1,0,0,0,0,0,1,0
4,63,0.68493,0.68831,0,0,0,1,0,0,1,0,0,0,1,1


## 2. Build train test set

In [260]:
def get_training_test(df, test_size=0.1, random_state=0):
    """
    Split a dataframe into a training part and a test part.

    Params:
        df: pandas dataframe to split
        test_size: share of the rows that goes to the test part
                   (default 0.1)
        random_state: seed passed to train_test_split so the split is
                      reproducible (default 0)
    Outputs:
        [train, test]: the two parts as returned by train_test_split
    """

    return train_test_split(df, test_size=test_size, random_state=random_state)

In [261]:
def build_train_test(df, target='math score'):
    """
    Split a dataframe and separate the features from the label column.

    Params:
        df: pandas dataframe containing the features and the label column
        target: name of the label column to predict (default 'math score')
    Outputs:
        trainX: feature values of the training rows (array)
        trainY: label values of the training rows (array)
        testX: feature values of the test rows (array)
        testY: label values of the test rows (array)
        feature_names: column labels of the features, in the column order
                       of trainX / testX
    """

    train, test = get_training_test(df)

    # Drop the label once per split and reuse the result for both the
    # feature values and the feature names
    train_features = train.drop([target], axis=1)
    test_features = test.drop([target], axis=1)

    trainY = train[target].values
    testY = test[target].values

    return train_features.values, trainY, test_features.values, testY, \
        train_features.columns

In [262]:
trainX, trainY, testX, testY, feature_names = build_train_test(data)

In [277]:
#load train test split from scikit learn
from sklearn.model_selection import train_test_split

In [278]:
# Note: this splits the raw `df` (11 columns, categorical values still stored
# as strings), not the one-hot encoded `data` that build_train_test used.
train, test = train_test_split(df, test_size=0.1, random_state=0)

In [279]:
train.shape

(866, 11)

In [280]:
test.shape

(97, 11)

#### I will always have a train set and a test set.
#### Each set is split into the features (all the columns used as input) and the labels (the column I want to predict). In this case, the label is the math score column.

In [None]:
# `train` and `test` were split from the raw `df`, whose categorical columns
# are still strings, so they cannot be fed to the model directly. Look up the
# same rows in the encoded and normalized `data` (it shares df's row index)
# so every feature is numeric.
train_encoded = data.loc[train.index]
test_encoded = data.loc[test.index]

# Separate the labels
trainY = train_encoded['math score'].values
testY = test_encoded['math score'].values

# Get all features but the labels
trainX = train_encoded.drop(['math score'], axis=1).values
testX = test_encoded.drop(['math score'], axis=1).values

## 3. Linear regression model

In [265]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [272]:
# Fit a linear regression model on the training split
regr = linear_model.LinearRegression()
regr.fit(trainX, trainY)

# Predict the math score for the held-out test split
y_pred = regr.predict(testX)

# Error metrics on the test split: absolute error in score points and the
# same error expressed as a percentage of the true score
mae = mean_absolute_error(testY, y_pred)
mape_percent = round(mean_absolute_percentage_error(testY, y_pred) * 100, 2)

print("MAE: %.2f" % mae)
print("MAPE: {} %".format(mape_percent))

MAE: 4.41
MAPE: 7.39 %


In [44]:
# These two metrics are not part of the notebook's earlier metrics import, so
# bring them in here; without this the cell fails on a fresh kernel.
from sklearn.metrics import mean_squared_error, r2_score

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(trainX, trainY)

# Make predictions using the testing set
y_pred = regr.predict(testX)

# The coefficients (one per feature, in the column order of trainX)
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(testY, y_pred))
# The mean absolute error
print("Mean absolute error: %.2f" % mean_absolute_error(testY, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(testY, y_pred))

Coefficients: 
 [ 0.51661399  0.31990101 11.60346916 -0.22331577 -0.64778387 -0.41780691
  4.90270954 -0.56951556 -0.02100937 -0.54590945 -1.10842885 -0.95935889
  4.24818723  2.7866446 ]
Mean squared error: 31.33
Mean absolute error: 4.41
Coefficient of determination: 0.86
