In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, BayesianRidge, ElasticNetCV

In [22]:
# Load the dataset from its CSV export into the `data` DataFrame.
# NOTE: this is an absolute, machine-specific Windows path; it must exist for the cell to run.
csv_path = "C:\\Users\\Zel\\Desktop\\191 Machine Learning\\dataset\\Final Data.csv"
data = pd.read_csv(csv_path, sep=',')
# Show the first rows as a quick sanity check of the load.
data.head()

Unnamed: 0,Gender,Age,Education_Level,Institution_Type,IT_Student,Location,Load-shedding,Financial_Condition,Internet_Type,Network_Type,Class_Duration,Self_Lms,Device,Adaptivity_Level
0,Girl,21 - 25,University,Non Government,No,Yes,2,Mid,Mobile Data,4G,1 - 3,Yes,Mobile,1
1,Girl,16 - 20,College,Government,No,Yes,0,Mid,Wifi,4G,1 - 3,No,Mobile,1
2,Girl,11 - 15,School,Non Government,No,Yes,0,Mid,Mobile Data,4G,1 - 3,No,Mobile,1
3,Boy,11 - 15,School,Non Government,No,Yes,0,Poor,Mobile Data,3G,1 - 3,No,Mobile,0
4,Girl,16 - 20,School,Non Government,No,Yes,0,Poor,Mobile Data,3G,0,No,Mobile,0


In [23]:
# Inspect the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Gender               901 non-null    object
 1   Age                  901 non-null    object
 2   Education_Level      901 non-null    object
 3   Institution_Type     901 non-null    object
 4   IT_Student           901 non-null    object
 5   Location             901 non-null    object
 6   Load-shedding        901 non-null    int64 
 7   Financial_Condition  901 non-null    object
 8   Internet_Type        901 non-null    object
 9   Network_Type         901 non-null    object
 10  Class_Duration       901 non-null    object
 11  Self_Lms             901 non-null    object
 12  Device               901 non-null    object
 13  Adaptivity_Level     901 non-null    int64 
dtypes: int64(2), object(12)
memory usage: 98.7+ KB


In [24]:
# Gather every object-dtype (categorical) column into a separate frame, data_category.
category_columns = data.select_dtypes(include=['object']).columns
data_category = data.loc[:, category_columns]
# Confirm that only object columns were selected.
data_category.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Gender               901 non-null    object
 1   Age                  901 non-null    object
 2   Education_Level      901 non-null    object
 3   Institution_Type     901 non-null    object
 4   IT_Student           901 non-null    object
 5   Location             901 non-null    object
 6   Financial_Condition  901 non-null    object
 7   Internet_Type        901 non-null    object
 8   Network_Type         901 non-null    object
 9   Class_Duration       901 non-null    object
 10  Self_Lms             901 non-null    object
 11  Device               901 non-null    object
dtypes: object(12)
memory usage: 84.6+ KB


In [25]:
# One-hot encode the object-typed variables so that every feature becomes numeric.
# NOTE: `dummies` is not referenced again by the cells visible here.
dummies = pd.get_dummies(data[category_columns], drop_first=True)

# Columns with more than two distinct values get one indicator per level
# (no level is dropped); the new indicators are appended after the untouched columns.
multi_level_columns = [
    column
    for column in data_category.columns
    if data_category[column].nunique() > 2
]
data_category = pd.get_dummies(data_category, columns=multi_level_columns)

# The remaining object columns are encoded with their first level dropped.
data_category = pd.get_dummies(data_category, drop_first=True)

In [26]:
# Verify that the encoded frame now contains only numeric columns.
data_category.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   Age_1 - 5                        901 non-null    uint8
 1   Age_11 - 15                      901 non-null    uint8
 2   Age_16 - 20                      901 non-null    uint8
 3   Age_21 - 25                      901 non-null    uint8
 4   Age_26 - 30                      901 non-null    uint8
 5   Age_6 - 10                       901 non-null    uint8
 6   Education_Level_College          901 non-null    uint8
 7   Education_Level_School           901 non-null    uint8
 8   Education_Level_University       901 non-null    uint8
 9   Financial_Condition_Mid          901 non-null    uint8
 10  Financial_Condition_Poor         901 non-null    uint8
 11  Financial_Condition_Rich         901 non-null    uint8
 12  Network_Type_2G                  901 non-null    u

In [27]:
# Keep only the columns that were numeric to begin with and store them, cast to float64, in data_.
numeric_part = data.drop(columns=category_columns)
data_ = numeric_part.astype('float64')
# Confirm the remaining columns and their dtype.
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Load-shedding     901 non-null    float64
 1   Adaptivity_Level  901 non-null    float64
dtypes: float64(2)
memory usage: 14.2 KB


In [28]:
# Cast every encoded indicator column to float64 so all features share one dtype.
data_category =data_category.astype('float64')

In [29]:
# Confirm that the dtype conversion to float64 was applied to all columns.
data_category.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age_1 - 5                        901 non-null    float64
 1   Age_11 - 15                      901 non-null    float64
 2   Age_16 - 20                      901 non-null    float64
 3   Age_21 - 25                      901 non-null    float64
 4   Age_26 - 30                      901 non-null    float64
 5   Age_6 - 10                       901 non-null    float64
 6   Education_Level_College          901 non-null    float64
 7   Education_Level_School           901 non-null    float64
 8   Education_Level_University       901 non-null    float64
 9   Financial_Condition_Mid          901 non-null    float64
 10  Financial_Condition_Poor         901 non-null    float64
 11  Financial_Condition_Rich         901 non-null    float64
 12  Network_Type_2G       

In [30]:
# Pull the dependent variable (the Adaptivity_Level column) out of data_ as y.
y = data_['Adaptivity_Level']

In [31]:
# Confirm that y is a Series with the expected length and dtype.
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 901 entries, 0 to 900
Series name: Adaptivity_Level
Non-Null Count  Dtype  
--------------  -----  
901 non-null    float64
dtypes: float64(1)
memory usage: 7.2 KB


In [32]:
# Drop the dependent variable so data_ holds only numeric independent variables.
# NOTE: re-running this cell alone raises KeyError, because the column is already gone.
data_ = data_.drop(columns=['Adaptivity_Level'])

In [33]:
# Confirm that Adaptivity_Level is no longer part of data_.
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Load-shedding  901 non-null    float64
dtypes: float64(1)
memory usage: 7.2 KB


In [34]:
# Build the feature matrix X by placing the numeric columns (data_) and the
# one-hot encoded categorical columns (data_category) side by side.
X = pd.concat([data_, data_category], axis=1)

In [35]:
# Confirm that X contains both the numeric and the encoded columns.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 27 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Load-shedding                    901 non-null    float64
 1   Age_1 - 5                        901 non-null    float64
 2   Age_11 - 15                      901 non-null    float64
 3   Age_16 - 20                      901 non-null    float64
 4   Age_21 - 25                      901 non-null    float64
 5   Age_26 - 30                      901 non-null    float64
 6   Age_6 - 10                       901 non-null    float64
 7   Education_Level_College          901 non-null    float64
 8   Education_Level_School           901 non-null    float64
 9   Education_Level_University       901 non-null    float64
 10  Financial_Condition_Mid          901 non-null    float64
 11  Financial_Condition_Poor         901 non-null    float64
 12  Financial_Condition_Ri

In [16]:
# Unused alternative to one-hot encoding (integer category codes), kept for reference only:
#data[category_columns] = data[category_columns].apply(lambda x: x.astype('category'))
#data[category_columns] = data[category_columns].apply(lambda x: x.cat.codes)

In [36]:
# Regularisation strengths shared by the cross-validated models:
# 100 values of 0.5 * 2**e, with e spaced linearly from 3 down to -7.
alpha_grid = 2**np.linspace(3,-7,100)*0.5

# Unregularised baseline.
lm = LinearRegression()
# Penalised models; each one picks its alpha from alpha_grid with 5-fold cross-validation.
lasso = LassoCV(alphas=alpha_grid, cv=5, fit_intercept=True)
ridge = RidgeCV(alphas=alpha_grid, cv=5, fit_intercept=True, scoring='neg_mean_squared_error')
elasticnet = ElasticNetCV(alphas=alpha_grid, cv=5, fit_intercept=True)

In [37]:
# Hold out 20% of the rows as a test set; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1)

In [38]:
# Linear Regression: fit on the training split, predict the held-out split, report test MSE.
lmModelpred = lm.fit(X_train, y_train).predict(X_test)
lm_mse = mean_squared_error(y_test, lmModelpred)
print("The MSE is: " ,lm_mse)

The MSE is:  0.24657395696113124


In [39]:
# Lasso: fit on the training split, predict the held-out split, report test MSE.
lassoModelpred = lasso.fit(X_train, y_train).predict(X_test)
lasso_mse = mean_squared_error(y_test, lassoModelpred)
print("The MSE is: " ,lasso_mse)

The MSE is:  0.2475851449025785


In [40]:
# Ridge regression: fit on the training split, predict the held-out split, report test MSE.
ridgeModelpred = ridge.fit(X_train, y_train).predict(X_test)
ridge_mse = mean_squared_error(y_test, ridgeModelpred)
print("The MSE is: " ,ridge_mse)

The MSE is:  0.2452820604448481


In [43]:
# Elastic-Net: fit on the training split, predict the held-out split, report test MSE.
elasticnetModelpred = elasticnet.fit(X_train, y_train).predict(X_test)
elasticnet_mse = mean_squared_error(y_test, elasticnetModelpred)
print("The MSE is: " ,elasticnet_mse)

The MSE is:  0.24566932367313596


In [44]:
# Display the fitted ridge coefficients labelled with their feature (column) names.
pd.Series(ridge.coef_,index=X.columns)

Load-shedding                     -0.055213
Age_1 - 5                         -0.028047
Age_11 - 15                       -0.032283
Age_16 - 20                       -0.124216
Age_21 - 25                        0.094426
Age_26 - 30                       -0.024088
Age_6 - 10                         0.114208
Education_Level_College           -0.041777
Education_Level_School             0.109210
Education_Level_University        -0.067434
Financial_Condition_Mid           -0.093050
Financial_Condition_Poor          -0.232721
Financial_Condition_Rich           0.325771
Network_Type_2G                   -0.200016
Network_Type_3G                    0.029044
Network_Type_4G                    0.170973
Class_Duration_0                  -0.442348
Class_Duration_1 - 3               0.086224
Class_Duration_3 - 6               0.356124
Device_Computer                   -0.063511
Device_Mobile                      0.092793
Device_Tab                        -0.029282
Gender_Girl                     

In [45]:
# Split the ridge coefficients by sign; coefficients that are exactly zero fall in neither group.
ridge_coeffs = pd.Series(ridge.coef_, index=X.columns)
is_positive = ridge_coeffs > 0
is_negative = ridge_coeffs < 0
# Positive ones are ordered largest first, negative ones most negative first.
positive_coeffs = ridge_coeffs[is_positive].sort_values(ascending=False)
negative_coeffs = ridge_coeffs[is_negative].sort_values(ascending=True)

In [46]:
# Wrap each coefficient Series in a one-column DataFrame whose column is named 'Coefficient'.
positive_df, negative_df = (
    pd.DataFrame({'Coefficient': coeffs})
    for coeffs in (positive_coeffs, negative_coeffs)
)

In [47]:
# Print both coefficient tables, each under its heading, with a blank line between them.
print("Positive Coefficients:", positive_df, sep="\n")
print("\nNegative Coefficients:", negative_df, sep="\n")

Positive Coefficients:
                                 Coefficient
Class_Duration_3 - 6                0.356124
Financial_Condition_Rich            0.325771
Network_Type_4G                     0.170973
Institution_Type_Non Government     0.170348
Self_Lms_Yes                        0.130257
Age_6 - 10                          0.114208
Education_Level_School              0.109210
Age_21 - 25                         0.094426
Device_Mobile                       0.092793
Class_Duration_1 - 3                0.086224
Location_Yes                        0.065835
Network_Type_3G                     0.029044

Negative Coefficients:
                            Coefficient
Class_Duration_0              -0.442348
Financial_Condition_Poor      -0.232721
Network_Type_2G               -0.200016
Age_16 - 20                   -0.124216
Financial_Condition_Mid       -0.093050
Gender_Girl                   -0.067563
Education_Level_University    -0.067434
Device_Computer               -0.063511
Load-she