In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
import sys
import os
sys.path.append(os.path.abspath(".."))

from src.utils import save_object
from src.config import PROCESSED_DATA_PATH, MODELS_PATH
from src.data_processing import load_data, clean_data, fit_preprocessor, transform_with_preprocessor

# Read the raw survey export through the project's loader and preview
# the first rows (DataFrame.head defaults to 5 rows).
raw_csv_path = '../data/raw/Students Social Media Addiction.csv'
df = load_data(raw_csv_path)
df.head()

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7


In [3]:
# Run the project's cleaning step on the raw frame; the cell output shows a
# summary of how many records were dropped.
cleaned_df = clean_data(df)

# to_csv does not create missing parent directories, so make sure the
# processed-data folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

# Persist the cleaned frame without the pandas index column.
output_path = os.path.join(PROCESSED_DATA_PATH, 'cleaned_Students Social Media Addiction.csv')
cleaned_df.to_csv(output_path, index=False)

cleaned_df.head(5)

Dropped 0 duplicate or empty records.


Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7


In [4]:
# Split the cleaned frame into predictors and target. Student_ID and Country
# are removed from the predictors together with the target column itself.
target_column = 'Addicted_Score'
excluded_columns = [target_column, 'Student_ID', 'Country']

y = cleaned_df[target_column]
X = cleaned_df.drop(columns=excluded_columns)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=20)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to both splits so the test data does not influence the fit.
preprocessor = fit_preprocessor(X_train)

X_train_processed = transform_with_preprocessor(preprocessor, X_train)
X_test_processed = transform_with_preprocessor(preprocessor, X_test)

# Transformed feature columns removed from both matrices before saving.
# NOTE(review): the rationale for this selection is not recorded in this
# notebook - presumably it comes from an earlier feature-selection step; confirm.
# NOTE(review): the preprocessor saved later still emits these columns, so any
# consumer of preprocessor.joblib presumably has to repeat this drop; verify.
columns_to_drop = [
    'cat__Academic_Level_High School',
    'num__Age',
    'cat__Relationship_Status_Single',
    'cat__Most_Used_Platform_YouTube',
    'cat__Academic_Level_Undergraduate',
    'cat__Most_Used_Platform_LinkedIn',
    'cat__Most_Used_Platform_TikTok',
    'cat__Most_Used_Platform_VKontakte',
    'cat__Most_Used_Platform_WeChat',
    'cat__Most_Used_Platform_Twitter',
    'cat__Most_Used_Platform_Snapchat'
]

feature_names = preprocessor.get_feature_names_out()

# Validate the names up front: a name that the fitted preprocessor did not
# produce would otherwise surface as an opaque
# "IndexError: index 0 is out of bounds" from the lookup below.
# (Presumably the 'cat__' names depend on the categories seen in X_train,
# so a different split or dataset could make one disappear - verify.)
available_features = set(feature_names)
missing_columns = [col for col in columns_to_drop if col not in available_features]
if missing_columns:
    raise ValueError(
        f"Columns to drop are not produced by the preprocessor: {missing_columns}"
    )

# Translate each name into its column position, then remove those columns
# from both matrices so train and test keep an identical layout.
indices_to_drop = [np.where(feature_names == col)[0][0] for col in columns_to_drop]
X_train_processed = np.delete(X_train_processed, indices_to_drop, axis=1)
X_test_processed = np.delete(X_test_processed, indices_to_drop, axis=1)

In [6]:
# Persist the processed splits under PROCESSED_DATA_PATH and the fitted
# preprocessor under MODELS_PATH. Each entry is (object, directory, file name);
# the list order is the order in which the files are written.
artifacts_to_save = [
    (X_train_processed, PROCESSED_DATA_PATH, 'X_train.joblib'),
    (X_test_processed, PROCESSED_DATA_PATH, 'X_test.joblib'),
    (y_train, PROCESSED_DATA_PATH, 'y_train.joblib'),
    (y_test, PROCESSED_DATA_PATH, 'y_test.joblib'),
    (preprocessor, MODELS_PATH, 'preprocessor.joblib'),
]

for artifact, target_dir, file_name in artifacts_to_save:
    save_object(artifact, os.path.join(target_dir, file_name))

Saved object to: c:\Users\micha\OneDrive\Documents\Personal Projects\Temp\data\processed\X_train.joblib
Saved object to: c:\Users\micha\OneDrive\Documents\Personal Projects\Temp\data\processed\X_test.joblib
Saved object to: c:\Users\micha\OneDrive\Documents\Personal Projects\Temp\data\processed\y_train.joblib
Saved object to: c:\Users\micha\OneDrive\Documents\Personal Projects\Temp\data\processed\y_test.joblib
Saved object to: c:\Users\micha\OneDrive\Documents\Personal Projects\Temp\models\preprocessor.joblib
