In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


In [2]:
data = pd.read_csv("Time_Wasters_on_Social_Media.csv")
df = pd.DataFrame(data)

In [3]:
columns = [
    "UserID",
    "Age",
    "Gender",
    "Location",
    "Income",
    "Debt",
    "Owns Property",
    "Demographics",
    "Platform",
    "Total Time Spent",
    "Number of Sessions",
    "Number of Videos Watched",
    "Scroll Rate",
    "Frequency",
    "ProductivityLoss",
    "Satisfaction",
    "Watch Reason",
    "Self Control",
    "Addiction Level",
    "CurrentActivity"
]

colsInterest = df[columns]
#colsInterest.to_csv('rStudioInput.csv', index=False)
colsInterest.set_index("UserID", inplace=True)

In [4]:
def brazil(country):
    if country == "Barzil":
        return "Brazil"
    else:
        return country
    
colsInterest["Location"] = colsInterest["Location"].apply(brazil)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  colsInterest["Location"] = colsInterest["Location"].apply(brazil)


In [5]:
#ProductivityLoss: 1-9 (0 <= Low <= 3, 4 <= Med <= 6, 7 <= High <= 10)
#Satis: 1-9
#SelfControl: 3-10
#Addiction: 0-7

def bin(score: str) -> str:
    if int(score) < 4:
        return ("Low")
    elif int(score) < 8: # changed from 7 Oliver
        return ("Medium")
    else:
        return ("High")

colsInterest["BinnedProdLoss"] = colsInterest["ProductivityLoss"].apply(bin)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  colsInterest["BinnedProdLoss"] = colsInterest["ProductivityLoss"].apply(bin)


In [6]:
columns = [
    "Age",
    "Gender",
    "Country",
    "Income",
    "HasDebt",
    "OwnsProperty",
    "Demographics",
    "Platform",
    "MinutesSpent",
    "NumSessions",
    "NumVideos",
    "ScrollRate",
    "TimeOfDay",
    "ProductivityLoss",
    "Satisfaction",
    "WatchReason",
    "SelfControl",
    "AddictionLevel",
    "CurrentActivity",
    "BinnedProdLoss"
]

colsInterest.columns = columns

In [7]:
colsInterest.to_csv('classifierInput.csv', index=False)

<span style="color: #FFCCCC;">Build Random Forest Classifier <span style>

Predict Productivity Loss Class given these features

In [8]:
colsInterest.head(5)

Unnamed: 0_level_0,Age,Gender,Country,Income,HasDebt,OwnsProperty,Demographics,Platform,MinutesSpent,NumSessions,NumVideos,ScrollRate,TimeOfDay,ProductivityLoss,Satisfaction,WatchReason,SelfControl,AddictionLevel,CurrentActivity,BinnedProdLoss
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,56,Male,Pakistan,82812,True,True,Rural,Instagram,80,17,22,87,Night,3,7,Procrastination,5,5,Commuting,Low
2,46,Female,Mexico,27999,False,True,Urban,Instagram,228,14,31,46,Afternoon,5,5,Habit,7,3,At school,Medium
3,32,Female,United States,42436,False,True,Rural,Facebook,30,6,7,88,Evening,6,4,Entertainment,8,2,At home,Medium
4,60,Male,Brazil,62963,True,False,Rural,YouTube,101,19,41,93,Night,3,7,Habit,5,5,Commuting,Low
5,25,Male,Pakistan,22096,False,True,Urban,TikTok,136,6,21,4,Morning,8,2,Boredom,10,0,At home,High


In [9]:
# colsInterest.head(5)

colsInterest.isna().sum() # no missing values

Age                 0
Gender              0
Country             0
Income              0
HasDebt             0
OwnsProperty        0
Demographics        0
Platform            0
MinutesSpent        0
NumSessions         0
NumVideos           0
ScrollRate          0
TimeOfDay           0
ProductivityLoss    0
Satisfaction        0
WatchReason         0
SelfControl         0
AddictionLevel      0
CurrentActivity     0
BinnedProdLoss      0
dtype: int64

In [10]:
colsInterest.columns

Index(['Age', 'Gender', 'Country', 'Income', 'HasDebt', 'OwnsProperty',
       'Demographics', 'Platform', 'MinutesSpent', 'NumSessions', 'NumVideos',
       'ScrollRate', 'TimeOfDay', 'ProductivityLoss', 'Satisfaction',
       'WatchReason', 'SelfControl', 'AddictionLevel', 'CurrentActivity',
       'BinnedProdLoss'],
      dtype='object')

In [11]:
# make hasDebt and ownsProperty Boolean

def makeBinary(x):
    if x == True:
        return 1
    return 0

colsInterest['HasDebt'] =  colsInterest['HasDebt'].apply(makeBinary)
colsInterest['OwnsProperty'] = colsInterest['OwnsProperty'].apply(makeBinary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  colsInterest['HasDebt'] =  colsInterest['HasDebt'].apply(makeBinary)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  colsInterest['OwnsProperty'] = colsInterest['OwnsProperty'].apply(makeBinary)


In [12]:
colsInterest.head(5)

Unnamed: 0_level_0,Age,Gender,Country,Income,HasDebt,OwnsProperty,Demographics,Platform,MinutesSpent,NumSessions,NumVideos,ScrollRate,TimeOfDay,ProductivityLoss,Satisfaction,WatchReason,SelfControl,AddictionLevel,CurrentActivity,BinnedProdLoss
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,56,Male,Pakistan,82812,1,1,Rural,Instagram,80,17,22,87,Night,3,7,Procrastination,5,5,Commuting,Low
2,46,Female,Mexico,27999,0,1,Urban,Instagram,228,14,31,46,Afternoon,5,5,Habit,7,3,At school,Medium
3,32,Female,United States,42436,0,1,Rural,Facebook,30,6,7,88,Evening,6,4,Entertainment,8,2,At home,Medium
4,60,Male,Brazil,62963,1,0,Rural,YouTube,101,19,41,93,Night,3,7,Habit,5,5,Commuting,Low
5,25,Male,Pakistan,22096,0,1,Urban,TikTok,136,6,21,4,Morning,8,2,Boredom,10,0,At home,High


In [13]:
# Goal is to create Random forest classifier, no preprocessing needed
# Will use GridSearchCv to find best hyperperams


X_train, X_test, y_train, y_test = train_test_split(colsInterest[['Age', 'Gender', 'Country', 'Income', 'HasDebt', 'OwnsProperty',
       'Demographics', 'Platform', 'MinutesSpent', 'NumSessions', 'NumVideos',
       'ScrollRate', 'TimeOfDay','Satisfaction', 'WatchReason', 'SelfControl', 'AddictionLevel', 'CurrentActivity' ]], 
        colsInterest['BinnedProdLoss'], random_state=98)


hyperperams = {
    'randomforestclassifier__n_estimators': [100, 200, 300],
    'randomforestclassifier__max_depth': [10, 20, 30, None],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    # 'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    # 'randomforestclassifier__max_features': ['sqrt', 'log2', None],
    # 'randomforestclassifier__bootstrap': [True, False],
    # 'randomforestclassifier__criterion': ['gini', 'entropy'],
    # 'randomforestclassifier__class_weight': [None, 'balanced'],
    # 'randomforestclassifier__random_state': [42]
}

simple_preprocessing = make_column_transformer(
    (OneHotEncoder(drop='first', handle_unknown='ignore'),
        ['Gender', 'Country', 'Demographics', 'Platform', 'TimeOfDay',
       'WatchReason', 'CurrentActivity']))
rf = make_pipeline(simple_preprocessing, RandomForestClassifier())


searcher = GridSearchCV(rf, 
                        param_grid=hyperperams, 
                        scoring = 'accuracy')

searcher.fit(X_train, y_train)





In [14]:
searcher.best_params_

{'randomforestclassifier__max_depth': 30,
 'randomforestclassifier__min_samples_split': 10,
 'randomforestclassifier__n_estimators': 200}

In [15]:
y_pred = searcher.predict(X_test)

In [16]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8

In [17]:
f1 = f1_score(y_test, y_pred, average= 'macro')
f1

0.7423056859676578

In [18]:
from sklearn.metrics import classification_report

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        High       0.83      0.39      0.53        49
         Low       0.94      0.79      0.86        78
      Medium       0.74      0.97      0.84       123

    accuracy                           0.80       250
   macro avg       0.83      0.72      0.74       250
weighted avg       0.82      0.80      0.78       250



In [20]:
(colsInterest['BinnedProdLoss'] == 'High').sum() / colsInterest.shape[0]

0.182

<span style="color: #FFCCCC;">Save Model to Disk for Use in API <span style>

In [21]:
!pip install joblib



In [22]:
import joblib

joblib.dump(searcher, 'random_forest_model.pkl')

['random_forest_model.pkl']