# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error

# Defining Helper Functions

In [3]:
def load_df(file, delimiter=';'):
    """Load a delimited text file into a pandas DataFrame.

    Args:
        file: Path or file-like object, passed straight to ``pd.read_csv``.
        delimiter: Field separator. Defaults to ``';'`` so existing
            single-argument callers keep their behaviour.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file, delimiter=delimiter)

In [4]:
def find_categorical_columns(df):
    """Return the names of the columns that hold string values.

    A column counts as categorical when its first non-null value is a
    ``str``. The value is picked by position, so the result does not depend
    on the frame having an index label ``0`` (e.g. after filtering or
    shuffling), and empty or all-null columns are simply skipped.

    Args:
        df: The ``pandas.DataFrame`` to inspect.

    Returns:
        A list of column names, in the frame's column order.
    """
    categorical_columns = []
    for col, values in df.items():
        present = values.dropna()
        # Positional access: `values[0]` would be a label lookup and raise
        # KeyError whenever the index does not contain 0.
        if not present.empty and isinstance(present.iloc[0], str):
            categorical_columns.append(col)
    return categorical_columns
    

In [5]:
def encode_columns(df, columns:list):
    """Return a copy of ``df`` with the given columns label-encoded.

    Args:
        df: Source DataFrame; it is copied first and left untouched.
        columns: Names of the columns to encode.

    Returns:
        A new DataFrame in which every listed column has been replaced by
        the output of ``LabelEncoder.fit_transform`` on that column.
    """
    # NOTE(review): scikit-learn documents LabelEncoder as a target (y)
    # encoder; for input features OrdinalEncoder/OneHotEncoder may be a
    # better fit — confirm before relying on these codes as model inputs.
    encoded = df.copy()
    for name in columns:
        # A fresh encoder per column; each fit only ever sees one column.
        encoded[name] = LabelEncoder().fit_transform(encoded[name])
    return encoded

# Driver Code


Loading file

In [6]:
# Dataset path, resolved relative to the notebook's working directory.
file = 'student-mat.csv'
# load_df parses the file with pandas using ';' as the field delimiter.
df = load_df(file)
# Preview the first rows to sanity-check the parse.
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


Encoding the Data

In [7]:
# Collect the names of the string-valued columns.
categorical_columns = find_categorical_columns(df)
# NOTE: `df` is rebound to the encoded copy here, so the raw frame is no
# longer reachable under this name; later cells read the encoded version.
df = encode_columns(df, categorical_columns)
# Display the encoded frame (pandas truncates the rendering of large frames).
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,18,1,0,0,4,4,0,4,...,4,3,4,1,1,3,6,5,6,6
1,0,0,17,1,0,1,1,1,0,2,...,5,3,3,1,1,3,4,5,5,6
2,0,0,15,1,1,1,1,1,0,2,...,4,3,2,2,3,3,10,7,8,10
3,0,0,15,1,0,1,4,2,1,3,...,3,2,2,1,1,5,2,15,14,15
4,0,0,16,1,0,1,3,3,2,2,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1,1,20,1,1,0,2,2,3,3,...,5,5,4,4,5,4,11,9,9,9
391,1,1,17,1,1,1,3,1,3,3,...,2,4,5,3,4,2,3,14,16,16
392,1,1,21,0,0,1,1,1,2,2,...,5,5,3,3,3,3,3,10,8,7
393,1,1,18,0,1,1,3,2,3,2,...,4,4,1,3,4,5,0,11,12,10


Extracting Input and Target Column

In [8]:
# Features are every column except the last; the target is the last column,
# sliced with `-1:` so it stays a single-column DataFrame (2-D), not a Series.
X, y = df.iloc[:, :-1], df.iloc[:, -1:]

Splitting Dataset

In [9]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42)

Training Model

In [10]:
# Linear regression baseline with default settings, fitted on the training split.
model = LinearRegression()
model.fit(X_train,y_train)

In [15]:
# Learned weights, one per feature column of X. The array is 2-D because y
# was passed as a single-column DataFrame rather than a 1-D Series.
model.coef_ # weight

array([[ 0.16181058,  0.25146378, -0.13645799,  0.01570499, -0.1064061 ,
        -0.10176453,  0.07035222, -0.15296978,  0.0357705 , -0.12866133,
         0.03455575,  0.1535349 ,  0.21791392, -0.13138685, -0.27923361,
         0.77716049,  0.16716981,  0.17836271, -0.55595309, -0.2487881 ,
         0.27387634, -0.15505009, -0.33890381,  0.33054875, -0.04316192,
         0.15519683, -0.18907128,  0.05955517,  0.0378789 ,  0.04014612,
         0.23054961,  0.94327083]])

In [17]:
# Constant offset added to the weighted sum of the features.
model.intercept_  # bias

array([-1.40434233])

Making Predictions

In [13]:
# Predict the target for the held-out rows; used by the MSE cell below, so
# this cell must run first.
predictions = model.predict(X_test)

Evaluating the Model: Mean Squared Error (MSE)

In [12]:
# Mean squared error between the held-out targets and the model's predictions
# (lower is better; the value is in squared units of the target).
print('MSE : ', mean_squared_error(y_test, predictions))

MSE :  4.511360147828821


# Trying Regularisation Methods

In [14]:
from sklearn import linear_model

In [None]:
# Ridge regression on a tiny toy dataset (three 2-feature samples), not on
# the student data.
# TODO: fit on X_train / y_train and compare its test MSE with the
# LinearRegression baseline above.
reg = linear_model.Ridge(alpha=.5)
reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])

# A notebook cell only displays its final expression, so a bare `reg.coef_`
# in the middle of the cell would be silently discarded. Show both fitted
# parameters together as one tuple.
reg.coef_, reg.intercept_