### Feature Engineering and preprocessing of the data for the Regression Task

This notebook covers the feature engineering and preprocessing steps for my regression task. The Country column was grouped into continents, and ordinal encoding was applied to Academic Level (which has an inherent ranking) as well as to the binary Gender and Affects Academic Performance columns. One-hot encoding (via get_dummies) was used for the remaining categorical features. After completing the feature engineering and preprocessing, the dataset was split into training and testing sets and saved as CSV files to the regression_processed_data folder. The final cells fit a set of untuned baseline regression models and compare their train, cross-validation, and test scores.

In [1]:
## All libraries used

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
sns.set()
import numpy as np
import country_converter as coco
from datetime import datetime
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action= 'ignore')

In [3]:
# Read the raw survey CSV into the working DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact location exists.
raw_csv_path = r'/Users/sot/SDS-CP029-social-sphere/submissions/team-members/Patrick-Edosoma/Data/Raw/Students Social Media Addiction.csv'
df = pd.read_csv(raw_csv_path)

In [4]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7


In [5]:
# Normalise the column names: lower-case them and turn spaces into underscores.
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

In [6]:
# The country column contains the abbreviation "UAE". The author found that
# country_converter does not recognise it and would mark those rows as
# unknown, so the full country name is substituted before the continent lookup.
df['country'] = df['country'].replace('UAE', 'United Arab Emirates')

In [7]:
# Convert the country column to a continent column.
# The converter is invoked once per distinct country rather than once per row;
# the lookup table is then broadcast over the column with Series.map, which
# yields the same values as the row-wise apply with far fewer converter calls.
country_to_continent = {
    country: coco.convert(names=country, to='continent')
    for country in df['country'].unique()
}
df['continent'] = df['country'].map(country_to_continent)

In [8]:
# Remove the country column: it is not used as a model feature
# (its information has already been summarised in `continent`).
df = df.drop(columns='country')

In [9]:
# Ordinal-encode three categorical columns. Each column is paired with the
# explicit category order that defines its integer codes (first entry -> 0).
ordinal_categories = {
    'academic_level': ['High School', 'Undergraduate', 'Graduate'],
    'gender': ['Female', 'Male'],
    'affects_academic_performance': ['No', 'Yes'],
}

categorical_cols = list(ordinal_categories)

# The category lists must be supplied in the same order as the columns.
encoder = OrdinalEncoder(
    categories=[ordinal_categories[col] for col in categorical_cols]
)

df[categorical_cols] = encoder.fit_transform(df[categorical_cols])

In [10]:
# One-hot encode the categorical columns that are still left. The first level
# of each column is dropped and the indicator columns are stored as integers.
df = pd.get_dummies(data=df, dtype=int, drop_first=True)


In [11]:
# Preview the frame after encoding.
df.head()

Unnamed: 0,student_id,age,gender,academic_level,avg_daily_usage_hours,affects_academic_performance,sleep_hours_per_night,mental_health_score,conflicts_over_social_media,addicted_score,...,most_used_platform_VKontakte,most_used_platform_WeChat,most_used_platform_WhatsApp,most_used_platform_YouTube,relationship_status_In Relationship,relationship_status_Single,continent_America,continent_Asia,continent_Europe,continent_Oceania
0,1,19,0.0,1.0,5.2,1.0,6.5,6,3,8,...,0,0,0,0,1,0,0,1,0,0
1,2,22,1.0,2.0,2.1,0.0,7.5,8,0,3,...,0,0,0,0,0,1,0,1,0,0
2,3,20,0.0,1.0,6.0,1.0,5.0,5,4,9,...,0,0,0,0,0,0,1,0,0,0
3,4,18,1.0,0.0,3.0,0.0,7.0,7,1,4,...,0,0,0,1,0,1,0,0,1,0
4,5,21,1.0,2.0,4.5,1.0,6.0,6,2,7,...,0,0,0,0,1,0,1,0,0,0


In [12]:
## Split the encoded data into train/test partitions and write both to disk

# 80% train / 20% test, with a fixed seed
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Destination folder for the processed regression data
output_dir = "/Users/sot/SDS-CP029-social-sphere/submissions/team-members/Patrick-Edosoma/regression_processed_data"

# Create the folder when it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# Write each partition as a CSV without the index column
for file_name, partition in (("train.csv", train_df), ("test.csv", test_df)):
    partition.to_csv(os.path.join(output_dir, file_name), index=False)

In [13]:
# Pairwise correlations between the columns of the encoded frame.
df.corr()

Unnamed: 0,student_id,age,gender,academic_level,avg_daily_usage_hours,affects_academic_performance,sleep_hours_per_night,mental_health_score,conflicts_over_social_media,addicted_score,...,most_used_platform_VKontakte,most_used_platform_WeChat,most_used_platform_WhatsApp,most_used_platform_YouTube,relationship_status_In Relationship,relationship_status_Single,continent_America,continent_Asia,continent_Europe,continent_Oceania
student_id,1.0,0.222306,-0.001087,0.194221,0.267524,0.05378,0.173793,-0.055037,0.173258,0.041637,...,-0.052373,0.172763,0.141071,-0.173932,0.007454,0.125034,0.109344,-0.231217,0.179396,-0.01892
age,0.222306,1.0,0.49471,0.824932,-0.113682,-0.13714,0.125265,0.160278,-0.184482,-0.166396,...,0.126151,0.092137,0.077751,-0.039426,0.145176,-0.10038,0.010351,-0.06452,0.061523,-0.020483
gender,-0.001087,0.49471,1.0,0.58223,-0.073582,-0.024736,0.046946,0.046534,-0.089126,-0.049692,...,0.131777,0.069015,0.171081,0.120122,0.038675,-0.032628,0.011865,0.00695,0.001145,-0.048693
academic_level,0.194221,0.824932,0.58223,1.0,-0.12556,-0.091373,0.201456,0.175512,-0.188919,-0.16772,...,0.134163,0.098247,0.190029,-0.131913,0.17746,-0.087098,0.03306,0.001725,-0.00261,-0.033127
avg_daily_usage_hours,0.267524,-0.113682,-0.073582,-0.12556,1.0,0.661474,-0.790582,-0.801058,0.804582,0.832,...,-0.070034,0.004844,0.356934,-0.080069,0.008008,0.00637,0.336116,0.088889,-0.316473,-0.063055
affects_academic_performance,0.05378,-0.13714,-0.024736,-0.091373,0.661474,1.0,-0.625373,-0.808921,0.83203,0.866049,...,-0.17643,-0.033602,0.214812,-0.085738,-0.178718,0.156066,0.288289,0.125253,-0.306626,-0.104452
sleep_hours_per_night,0.173793,0.125265,0.046946,0.201456,-0.790582,-0.625373,1.0,0.707439,-0.677266,-0.764858,...,0.102961,0.064299,-0.255403,-0.05528,-0.028743,0.106815,-0.266792,-0.150272,0.336204,0.087541
mental_health_score,-0.055037,0.160278,0.046534,0.175512,-0.801058,-0.808921,0.707439,1.0,-0.893572,-0.945051,...,0.09212,0.032007,-0.179939,0.040523,0.053309,-0.028757,-0.233251,-0.141161,0.278366,0.110866
conflicts_over_social_media,0.173258,-0.184482,-0.089126,-0.188919,0.804582,0.83203,-0.677266,-0.893572,1.0,0.933586,...,-0.116794,-0.048719,0.184522,-0.081403,-0.076968,0.058722,0.204951,0.129015,-0.252262,-0.082596
addicted_score,0.041637,-0.166396,-0.049692,-0.16772,0.832,0.866049,-0.764858,-0.945051,0.933586,1.0,...,-0.119215,-0.034416,0.186327,-0.025478,-0.049566,0.014795,0.297544,0.119873,-0.310097,-0.106016


X = df.drop(['addicted_score', 'student_id'], axis = 1)
y = df['addicted_score']

In [14]:
# The target is the addiction score; the features are every other column
# except the student_id identifier.
target_col = 'addicted_score'
y = df[target_col]
X = df.drop(columns=[target_col, 'student_id'])

In [15]:
# 80/20 hold-out split of features and target with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.20,
)

In [16]:

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [17]:

# Fit the min-max scaler on the training features only, then apply that same
# fitted scaling to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [18]:
def train_and_predict(X_train, X_test, y_train, y_test, cv=5, random_state=42):
    """Fit a panel of untuned regressors and tabulate train, CV and test scores.

    Each model is fitted on the training data, scored on the training and
    test data, and additionally cross-validated on the training data.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for held-out evaluation.
    y_train, y_test
        Targets matching ``X_train`` and ``X_test``.
    cv : default 5
        Forwarded to ``cross_validate`` as its ``cv`` argument.
    random_state : default 42
        Seed given to the tree-based and ensemble models so that repeated
        runs produce the same table.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns "Model", "Train R2", "Train CV R2",
        "Test R2", "Train RMSE", "Train CV RMSE", "Test RMSE", "R2 Gap" and
        "RMSE Gap". The gap columns are the absolute train-vs-test difference.
    """
    # Local import: the notebook's import cells only bring in cross_val_score.
    from sklearn.model_selection import cross_validate

    models = {
        "Linear Regression": LinearRegression(),
        "Ridge Regression": Ridge(),
        "Lasso Regression": Lasso(),
        "ElasticNet Regression": ElasticNet(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
        "AdaBoost": AdaBoostRegressor(random_state=random_state),
        "XGBoost": XGBRegressor(random_state=random_state),
        "K-Nearest Neighbors": KNeighborsRegressor()
    }

    results = []
    for model_name, model in models.items():
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_r2 = r2_score(y_train, y_train_pred)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

        # One cross-validation pass yields both metrics, so each fold is
        # fitted once and R2 / RMSE are measured on the same fitted models.
        cv_scores = cross_validate(
            model, X_train, y_train, cv=cv,
            scoring=('r2', 'neg_mean_squared_error'),
        )
        cv_r2 = np.mean(cv_scores['test_r2'])
        # The scorer reports negated MSE; flip the sign before the square root.
        cv_rmse = np.mean(np.sqrt(-cv_scores['test_neg_mean_squared_error']))

        test_r2 = r2_score(y_test, y_test_pred)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

        r2_gap = abs(train_r2 - test_r2)
        rmse_gap = abs(train_rmse - test_rmse)

        results.append([model_name, train_r2, cv_r2, test_r2, train_rmse, cv_rmse, test_rmse, r2_gap, rmse_gap])

    results_df = pd.DataFrame(results, columns=["Model", "Train R2", "Train CV R2", "Test R2", "Train RMSE", "Train CV RMSE", "Test RMSE", "R2 Gap", "RMSE Gap"])
    return results_df

In [20]:
# Score every untuned model on the min-max-scaled features.
results_without_tuning = train_and_predict(
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

In [21]:

# Display the score table for the untuned models.
results_without_tuning

Unnamed: 0,Model,Train R2,Train CV R2,Test R2,Train RMSE,Train CV RMSE,Test RMSE,R2 Gap,RMSE Gap
0,Linear Regression,0.970257,0.966725,0.968186,0.27363,0.286912,0.28215,0.002072,0.00852
1,Ridge Regression,0.969844,0.9668,0.967847,0.275523,0.286909,0.283649,0.001998,0.008126
2,Lasso Regression,0.0,-0.009029,-0.002895,1.586625,1.587945,1.58415,0.002895,0.002475
3,ElasticNet Regression,0.105913,0.097763,0.107181,1.500252,1.501812,1.494687,0.001269,0.005565
4,Decision Tree,1.0,0.971093,0.974491,0.0,0.244757,0.252646,0.025509,0.252646
5,Random Forest,0.997954,0.982821,0.985326,0.071771,0.205664,0.191619,0.012627,0.119848
6,Gradient Boosting,0.991024,0.979122,0.979737,0.150319,0.22704,0.225176,0.011287,0.074856
7,AdaBoost,0.953987,0.943982,0.946007,0.34034,0.361379,0.367567,0.00798,0.027227
8,XGBoost,0.99998,0.98414,0.981084,0.007038,0.197023,0.217561,0.018896,0.210523
9,K-Nearest Neighbors,0.958529,0.917834,0.923701,0.323105,0.448424,0.436946,0.034828,0.113841
