In [1]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


In [2]:
data = pd.read_csv("Time_Wasters_on_Social_Media.csv")
df = pd.DataFrame(data)

In [3]:
columns = [
    "UserID",
    "Age",
    "Gender",
    "Location",
    "Income",
    "Debt",
    "Owns Property",
    "Demographics",
    "Platform",
    "Total Time Spent",
    "Number of Sessions",
    "Number of Videos Watched",
    "Scroll Rate",
    "Frequency",
    "ProductivityLoss",
    "Satisfaction",
    "Watch Reason",
    "Self Control",
    "Addiction Level",
    "CurrentActivity"
]

colsInterest = df[columns]
#colsInterest.to_csv('rStudioInput.csv', index=False)
colsInterest.set_index("UserID", inplace=True)

In [4]:
def brazil(country):
    if country == "Barzil":
        return "Brazil"
    else:
        return country
    
colsInterest["Location"] = colsInterest["Location"].apply(brazil)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  colsInterest["Location"] = colsInterest["Location"].apply(brazil)


In [5]:
#ProductivityLoss: 1-9
#Satis: 1-9
#SelfControl: 3-10
#Addiction: 0-7

def bin(score: str) -> str:
    if int(score) < 4:
        return ("Low")
    elif int(score) < 8: # changed from 7 Oliver
        return ("Medium")
    else:
        return ("High")

colsInterest["BinnedProdLoss"] = colsInterest["ProductivityLoss"].apply(bin)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  colsInterest["BinnedProdLoss"] = colsInterest["ProductivityLoss"].apply(bin)


In [6]:
columns = [
    "Age",
    "Gender",
    "Country",
    "Income",
    "HasDebt",
    "OwnsProperty",
    "Demographics",
    "Platform",
    "MinutesSpent",
    "NumSessions",
    "NumVideos",
    "ScrollRate",
    "TimeOfDay",
    "ProductivityLoss",
    "Satisfaction",
    "WatchReason",
    "SelfControl",
    "AddictionLevel",
    "CurrentActivity",
    "BinnedProdLoss"
]

colsInterest.columns = columns

In [None]:
# colsInterest.to_csv('classifierInput.csv', index=False)

<span style="color: #FFCCCC;">Build Random Forest Classifier <span style>

Predict Productivity Loss Class given these features

In [8]:
colsInterest.head(5)

Unnamed: 0_level_0,Age,Gender,Country,Income,HasDebt,OwnsProperty,Demographics,Platform,MinutesSpent,NumSessions,NumVideos,ScrollRate,TimeOfDay,ProductivityLoss,Satisfaction,WatchReason,SelfControl,AddictionLevel,CurrentActivity,BinnedProdLoss
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,56,Male,Pakistan,82812,True,True,Rural,Instagram,80,17,22,87,Night,3,7,Procrastination,5,5,Commuting,Low
2,46,Female,Mexico,27999,False,True,Urban,Instagram,228,14,31,46,Afternoon,5,5,Habit,7,3,At school,Medium
3,32,Female,United States,42436,False,True,Rural,Facebook,30,6,7,88,Evening,6,4,Entertainment,8,2,At home,Medium
4,60,Male,Brazil,62963,True,False,Rural,YouTube,101,19,41,93,Night,3,7,Habit,5,5,Commuting,Low
5,25,Male,Pakistan,22096,False,True,Urban,TikTok,136,6,21,4,Morning,8,2,Boredom,10,0,At home,High


In [9]:
# colsInterest.head(5)

colsInterest.isna().sum() # no missing values

Age                 0
Gender              0
Country             0
Income              0
HasDebt             0
OwnsProperty        0
Demographics        0
Platform            0
MinutesSpent        0
NumSessions         0
NumVideos           0
ScrollRate          0
TimeOfDay           0
ProductivityLoss    0
Satisfaction        0
WatchReason         0
SelfControl         0
AddictionLevel      0
CurrentActivity     0
BinnedProdLoss      0
dtype: int64

In [10]:
colsInterest.columns

Index(['Age', 'Gender', 'Country', 'Income', 'HasDebt', 'OwnsProperty',
       'Demographics', 'Platform', 'MinutesSpent', 'NumSessions', 'NumVideos',
       'ScrollRate', 'TimeOfDay', 'ProductivityLoss', 'Satisfaction',
       'WatchReason', 'SelfControl', 'AddictionLevel', 'CurrentActivity',
       'BinnedProdLoss'],
      dtype='object')

In [11]:
# make hasDebt and ownsProperty Boolean

def makeBinary(x):
    if x == True:
        return 1
    return 0

colsInterest['HasDebt'] =  colsInterest['HasDebt'].apply(makeBinary)
colsInterest['OwnsProperty'] = colsInterest['OwnsProperty'].apply(makeBinary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  colsInterest['HasDebt'] =  colsInterest['HasDebt'].apply(makeBinary)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  colsInterest['OwnsProperty'] = colsInterest['OwnsProperty'].apply(makeBinary)


In [12]:
colsInterest.head(5)

Unnamed: 0_level_0,Age,Gender,Country,Income,HasDebt,OwnsProperty,Demographics,Platform,MinutesSpent,NumSessions,NumVideos,ScrollRate,TimeOfDay,ProductivityLoss,Satisfaction,WatchReason,SelfControl,AddictionLevel,CurrentActivity,BinnedProdLoss
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,56,Male,Pakistan,82812,1,1,Rural,Instagram,80,17,22,87,Night,3,7,Procrastination,5,5,Commuting,Low
2,46,Female,Mexico,27999,0,1,Urban,Instagram,228,14,31,46,Afternoon,5,5,Habit,7,3,At school,Medium
3,32,Female,United States,42436,0,1,Rural,Facebook,30,6,7,88,Evening,6,4,Entertainment,8,2,At home,Medium
4,60,Male,Brazil,62963,1,0,Rural,YouTube,101,19,41,93,Night,3,7,Habit,5,5,Commuting,Low
5,25,Male,Pakistan,22096,0,1,Urban,TikTok,136,6,21,4,Morning,8,2,Boredom,10,0,At home,High


In [13]:
all_features = colsInterest[['Age', 'Gender', 'Country', 'Income', 'HasDebt', 'OwnsProperty',
       'Demographics', 'Platform', 'MinutesSpent', 'NumSessions', 'NumVideos',
       'ScrollRate', 'TimeOfDay','Satisfaction', 'WatchReason', 'SelfControl', 'AddictionLevel', 'CurrentActivity' ]]

no_gender = colsInterest[['Age', 'Country', 'Income', 'HasDebt', 'OwnsProperty',
       'Demographics', 'Platform', 'MinutesSpent', 'NumSessions', 'NumVideos',
       'ScrollRate', 'TimeOfDay','Satisfaction', 'WatchReason', 'SelfControl', 'AddictionLevel', 'CurrentActivity' ]]

no_location = colsInterest[['Age', 'Gender', 'Income', 'HasDebt', 'OwnsProperty',
       'Demographics', 'Platform', 'MinutesSpent', 'NumSessions', 'NumVideos',
       'ScrollRate', 'TimeOfDay','Satisfaction', 'WatchReason', 'SelfControl', 'AddictionLevel', 'CurrentActivity' ]]

no_gender_location = colsInterest[['Age', 'Income', 'OwnsProperty', 'HasDebt',
       'Demographics', 'Platform', 'MinutesSpent', 'NumSessions', 'NumVideos',
        'TimeOfDay', 'WatchReason', 'CurrentActivity' ]] 

no_property_debt = colsInterest[['Age', 'Income', 'Platform', 'MinutesSpent', 'NumSessions', 'NumVideos',
       'ScrollRate', 'TimeOfDay','Satisfaction', 'WatchReason', 'SelfControl', 'AddictionLevel', 'CurrentActivity' ]] # no gender, location, demographics either

categorical = ['Gender', 'Country', 'Demographics', 'Platform', 'TimeOfDay',
       'WatchReason', 'CurrentActivity']
categorical_no_gender = categorical[1:]
categorical_no_location = [x for x in categorical if x != 'Country']
categorical_no_gender_location = ['Demographics', 'Platform', 'TimeOfDay',
       'WatchReason', 'CurrentActivity']
categorical_no_property_debt = ['Platform', 'TimeOfDay','WatchReason', 'CurrentActivity']

response = colsInterest['BinnedProdLoss']

In [14]:
def get_train_test_split(features, resp):
    X_train, X_test, y_train, y_test = train_test_split(features, resp, random_state=98)
    return X_train, X_test, y_train, y_test

In [15]:
# Goal is to create Random forest classifier, no preprocessing needed
# Will use GridSearchCv to find best hyperperams

def train_model(cat, X_train, y_train):

    # X_train, X_test, y_train, y_test = train_test_split(features, resp, random_state=98)


    hyperperams = {
        'randomforestclassifier__n_estimators': [100, 200, 300],
        'randomforestclassifier__max_depth': [10, 20, 30, None],
        'randomforestclassifier__min_samples_split': [2, 5, 10],
        # 'randomforestclassifier__min_samples_leaf': [1, 2, 4],
        # 'randomforestclassifier__max_features': ['sqrt', 'log2', None],
        # 'randomforestclassifier__bootstrap': [True, False],
        # 'randomforestclassifier__criterion': ['gini', 'entropy'],
        # 'randomforestclassifier__class_weight': [None, 'balanced'],
        # 'randomforestclassifier__random_state': [42]
    }

    simple_preprocessing = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), cat), (StandardScaler(),['Age', 'Income', 'MinutesSpent', 'NumSessions', 'NumVideos']))
    
    # joblib.dump(simple_preprocessing, 'simple_preprocessing.pkl')
    
    rf = make_pipeline(simple_preprocessing, RandomForestClassifier())


    searcher = GridSearchCV(rf, 
                            param_grid=hyperperams, 
                            scoring = 'accuracy')

    searcher.fit(X_train, y_train)
    joblib.dump(searcher.best_estimator_, 'rf_pipeline.pkl')

    return searcher





In [16]:
def get_results(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

All variables

In [17]:
# X_train, X_test, y_train, y_test = get_train_test_split(all_features, response)
# model = train_model(categorical, X_train, y_train)
# get_results(model, X_test, y_test)

In [18]:
# X_train, X_test, y_train, y_test = get_train_test_split(no_gender, response)
# model_no_gender = train_model(categorical_no_gender, X_train, y_train)
# get_results(model_no_gender, X_test, y_test)

In [19]:
# X_train, X_test, y_train, y_test = get_train_test_split(no_location, response)
# model_no_location = train_model(categorical_no_location, X_train, y_train)
# get_results(model_no_location, X_test, y_test)

In [20]:
X_train, X_test, y_train, y_test = get_train_test_split(no_gender_location, response)
model_no_gender_location = train_model(categorical_no_gender_location, X_train, y_train)
get_results(model_no_gender_location, X_test, y_test)

              precision    recall  f1-score   support

        High       0.78      0.43      0.55        49
         Low       0.94      0.79      0.86        78
      Medium       0.75      0.95      0.84       123

    accuracy                           0.80       250
   macro avg       0.82      0.72      0.75       250
weighted avg       0.81      0.80      0.79       250



In [21]:
(colsInterest['BinnedProdLoss'] == 'High').sum() / colsInterest.shape[0]

0.182

In [22]:
model_no_gender_location.best_params_

{'randomforestclassifier__max_depth': 20,
 'randomforestclassifier__min_samples_split': 5,
 'randomforestclassifier__n_estimators': 100}

In [23]:
model_no_gender_location.predict(X_test)

array(['Medium', 'Low', 'Medium', 'Medium', 'Medium', 'Medium', 'Medium',
       'Medium', 'Medium', 'Medium', 'Low', 'Medium', 'Medium', 'Medium',
       'Medium', 'High', 'Medium', 'High', 'High', 'Low', 'Medium',
       'Medium', 'Medium', 'Low', 'Medium', 'Medium', 'Medium', 'Low',
       'Medium', 'Medium', 'Medium', 'Medium', 'High', 'Medium', 'Medium',
       'Medium', 'Medium', 'Medium', 'Medium', 'Medium', 'Medium', 'Low',
       'Medium', 'Medium', 'Low', 'Low', 'Medium', 'Medium', 'Medium',
       'Low', 'Medium', 'High', 'Medium', 'Medium', 'Medium', 'Low',
       'Low', 'High', 'Medium', 'Medium', 'Low', 'Medium', 'Low', 'Low',
       'Low', 'Medium', 'Medium', 'Medium', 'High', 'Medium', 'Medium',
       'Medium', 'Low', 'High', 'Medium', 'Low', 'Medium', 'Medium',
       'Low', 'Medium', 'Medium', 'Low', 'Medium', 'Low', 'High', 'Low',
       'Medium', 'Medium', 'High', 'High', 'Medium', 'Medium', 'Low',
       'Medium', 'Medium', 'Medium', 'Medium', 'Low', 'High', 'High

<span style="color: #FFCCCC;">Save Model to Disk for Use in API <span style>

In [24]:
input_data = pd.DataFrame(
    [[23, 15000, False, True, "Urban", "TikTok", 50, 800, 800, "Night", "Procrastination", "At home"]],
    columns=["Age", "Income", "OwnsProperty", "HasDebt", "Demographics", "Platform", 
             "MinutesSpent", "NumSessions", "NumVideos", "TimeOfDay",
             "WatchReason", "CurrentActivity"]
)
fun = model_no_gender_location.predict(input_data)[0]

# if fun == 'Low':
#     fun = 'High'
# elif fun == 'High':
#     fun = 'Low'

fun


'Low'

In [25]:
!pip install joblib



In [26]:


# joblib.dump(model_no_gender_location, 'random_forest_model.pkl') # we use the model with the best accuracy and F1-Score

New Model: Lasso Regression: Make future "addiction level" and predict

In [27]:
# new_df = pd.read_csv('newInput.csv')

In [28]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split

# Define numerical and categorical columns
numerical_cols = [
    "Age", "Income",
    "NumSessions", "NumVideos", "MinutesSpent"
]

categorical_cols = [
    "Demographics", "Platform", "TimeOfDay", 
     "WatchReason", "CurrentActivity", "OwnsProperty", "HasDebt"
]

resp = colsInterest['ProductivityLoss']

new_cols = ["Age", "Income", "OwnsProperty", "HasDebt", "Demographics", "Platform", 
             "NumSessions", "NumVideos", "TimeOfDay", "MinutesSpent",
             "WatchReason", "CurrentActivity"]


X_train, X_test, y_train, y_test = train_test_split(colsInterest[new_cols], resp, random_state=98)

# Preprocessing
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop="first"), categorical_cols)
])
# preprocessor = ColumnTransformer([
#     ('num', StandardScaler(), numerical_cols),
#     ('cat', make_pipeline(OneHotEncoder(drop="first", handle_unknown='ignore'), StandardScaler(with_mean=False)), categorical_cols)
# ])


# Ridge regression pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model',  Ridge())  # You can tune alpha
])

param_grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1]
}

grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1
)

# Fit the model

grid_search.fit(X_train, y_train)

# Results
print(f"Best alpha: {grid_search.best_params_['model__alpha']}")
print(f"Best CV R² score: {grid_search.best_score_:.3f}")
print(f"Test R² score: {grid_search.score(X_test, y_test):.3f}")



Best alpha: 0.1
Best CV R² score: 0.366
Test R² score: 0.502


In [29]:
y_pred = grid_search.predict(X_test)
y_pred

array([5.93289726, 3.90233193, 5.51645202, 6.10809292, 5.45393643,
       5.94611338, 5.77872647, 6.07452058, 5.07805711, 5.79721487,
       3.58718979, 5.51299721, 5.96369817, 5.44112158, 5.62915497,
       7.49139739, 5.09734321, 7.12928893, 7.05444647, 3.4591867 ,
       5.75550931, 5.45332808, 6.31003003, 3.28414719, 6.24185668,
       6.21787423, 5.8752832 , 3.10814249, 5.78990048, 6.11802971,
       6.36530406, 5.57386739, 7.22950885, 5.29411737, 5.30288188,
       5.25540964, 5.75451014, 5.70974383, 5.42253278, 6.04816217,
       4.98975311, 3.99968738, 5.833782  , 5.41223971, 3.89109116,
       3.36051999, 4.92534632, 5.04201204, 5.94759151, 3.63325576,
       4.92882209, 7.38845038, 5.40644498, 5.5300847 , 5.63045507,
       3.35788047, 3.43369674, 7.18001794, 5.84721834, 5.95483694,
       3.11570841, 5.34096515, 3.35170485, 3.34876693, 3.26395556,
       6.37506579, 6.40629882, 4.97894757, 7.66577133, 5.64379746,
       6.16766895, 6.1711072 , 3.26659985, 7.23949587, 5.49540

In [30]:
from sklearn.metrics import root_mean_squared_error, r2_score
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.3f}")

RMSE: 1.46
R²: 0.502


In [31]:
input_data = pd.DataFrame(
    [[23, 15000, False, True, "Urban", "TikTok", 1, 20, 100, "Morning", "Procrastination", "At home"]],
    columns=["Age", "Income", "OwnsProperty", "HasDebt", "Demographics", "Platform", 
             "MinutesSpent", "NumSessions", "NumVideos", "TimeOfDay", "WatchReason", "CurrentActivity" ])

grid_search.predict(input_data)[0]

7.715224877790152

In [32]:
best_model = grid_search.best_params_
best_model

{'model__alpha': 0.1}

In [33]:
# joblib.dump(grid_search, 'lasso_model.pkl')

In [34]:
model = grid_search.best_estimator_.named_steps['model']
preprocessor = grid_search.best_estimator_.named_steps['preprocess']
feature_names = preprocessor.get_feature_names_out()

# Print non-zero coefficients
for name, coef in zip(feature_names, model.coef_):
    if coef != 0:
        print(f"{name}: {coef:.4f}")

num__Age: -0.0703
num__Income: -0.0420
num__NumSessions: 0.1318
num__NumVideos: -0.0123
num__MinutesSpent: 0.0020
cat__Demographics_Urban: 0.1014
cat__Platform_Instagram: 0.0827
cat__Platform_TikTok: 0.0847
cat__Platform_YouTube: 0.1100
cat__TimeOfDay_Evening: 0.4915
cat__TimeOfDay_Morning: 2.0312
cat__TimeOfDay_Night: -1.9992
cat__WatchReason_Entertainment: -0.0047
cat__WatchReason_Habit: -0.1108
cat__WatchReason_Procrastination: -0.2108
cat__CurrentActivity_At school: 0.2368
cat__CurrentActivity_At work: 0.0222
cat__CurrentActivity_Commuting: 0.2989
cat__OwnsProperty_1: 0.0509
cat__HasDebt_1: 0.4314


**RidgeClassifier : This was the final model we used** 

In [35]:
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])
X_train, X_test, y_train, y_test = train_test_split(colsInterest[new_cols], colsInterest['BinnedProdLoss'], random_state=98)

y_train_encoded = encoder.fit_transform(y_train.to_frame()).ravel()
y_test_encoded = encoder.transform(y_test.to_frame()).ravel()





In [36]:
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from mord import OrdinalRidge

# Use your preprocessor from before
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', RidgeClassifier()) 
])

pipeline.fit(X_train, y_train_encoded)

y_pred_encoded = pipeline.predict(X_test)

y_pred_labels = encoder.inverse_transform(y_pred_encoded.reshape(-1, 1)).ravel()
y_test_labels = encoder.inverse_transform(y_test_encoded.reshape(-1, 1)).ravel()

# Show precision, recall, f1 per class
print(classification_report(y_test_labels, y_pred_labels))


              precision    recall  f1-score   support

        High       0.79      0.45      0.57        49
         Low       0.94      0.79      0.86        78
      Medium       0.75      0.95      0.84       123

    accuracy                           0.80       250
   macro avg       0.83      0.73      0.76       250
weighted avg       0.82      0.80      0.79       250



In [76]:
input_data = pd.DataFrame(
    [[23, 15000, False, True, "Urban", "TikTok", 0, 0, 0, "Morning", "Procrastination", "At work"]],
    columns=["Age", "Income", "OwnsProperty", "HasDebt", "Demographics", "Platform", 
             "MinutesSpent", "NumSessions", "NumVideos", "TimeOfDay", "WatchReason", "CurrentActivity" ])
pipeline.predict(input_data)

array([2.])

In [38]:
# joblib.dump(pipeline, 'ridge_classifier.pkl')