In [1]:
import numpy as np
import os
import pandas as pd
pd.options.mode.chained_assignment = None

In [2]:
currentDir = os.getcwd()
parentDir = os.path.dirname(currentDir)
DATA_FOLDER = parentDir + "/data"
MODEL_FOLDER = parentDir + "/model"

In [3]:
data = pd.read_csv(DATA_FOLDER + "/Time_Wasters_on_Social_Media.csv")
df = pd.DataFrame(data)
df.set_index("UserID", inplace=True)

In [4]:
rColumns = [
    "Age",
    "Gender",
    "Location",
    "Income",
    "Debt",
    "Owns Property",
    "Profession",
    "Demographics",
    "Platform",
    "Total Time Spent",
    "Number of Sessions",
    "Video Category",
    "Video Length",
    "Time Spent On Video",
    "Number of Videos Watched",
    "Scroll Rate",
    "Frequency",
    "ProductivityLoss",
    "Satisfaction",
    "Watch Reason",
    "DeviceType",
    "OS",
    "Watch Time",
    "Self Control",
    "Addiction Level",
    "CurrentActivity",
    "ConnectionType"
]

cleanDf = df[rColumns]
cleanDf.to_csv(DATA_FOLDER + "/rStudioInput.csv", index=False)

In [None]:
dropCols = [
    "Gender",
    "Location",
    "Scroll Rate",
    "Watch Time",
    "Satisfaction",
    "Self Control",
    "Addiction Level"
]
for dropCol in dropCols:
    rColumns.remove(dropCol)
cleanDf = cleanDf[rColumns]

finalCols = [
    "Age",
    "Income",
    "HasDebt",
    "OwnsProperty",
    "Profession",
    "Demographics",
    "Platform",
    "MinutesSpent",
    "NumSessions",
    "Genre",
    "VideoLength",
    "VideoTime",
    "NumVideos",
    "TimeOfDay",
    "ProductivityLoss",
    "WatchReason",
    "Device",
    "OS",
    "CurrentActivity",
    "Connection"
]

cleanDf.columns = finalCols

In [6]:
#ProductivityLoss: 1-9 (0 <= Low <= 3, 4 <= Med <= 6, 7 <= High <= 10)
#Satis: 1-9
#SelfControl: 3-10
#Addiction: 0-7

def binProdLoss(score: str) -> str:
    if int(score) < 4:
        return "Low"
    elif int(score) < 7:
        return "Medium"
    else:
        return "High"

cleanDf["BinnedProdLoss"] = cleanDf["ProductivityLoss"].apply(binProdLoss)

In [7]:
def cleanActivity(activity: str) -> str:
    if "At" in activity:
        activity = activity.split(" ")[1].capitalize()
    return activity

cleanDf["CurrentActivity"] = cleanDf["CurrentActivity"].apply(cleanActivity)

In [8]:
cleanDf["Profession"][cleanDf["Profession"] == "Labor/Worker"] = "Labor"
cleanDf["Profession"][cleanDf["Profession"] == "Waiting staff"] = "WaitStaff"
cleanDf["Profession"][cleanDf["Profession"] == "driver"] = "Driver"

cleanDf["Genre"][cleanDf["Genre"] == "Jokes/Memes"] = "Memes"
cleanDf["Genre"][cleanDf["Genre"] == "Life Hacks"] = "LifeHacks"

cleanDf["Connection"][cleanDf["Connection"] == "Mobile Data"] = "MobileData"
cleanDf["Connection"][cleanDf["Connection"] == "Wi-Fi"] = "WiFi"

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  cleanDf["Profession"][cleanDf["Profession"] == "Labor/Worker"] = "Labor"
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to

In [9]:
cleanDf.to_csv(MODEL_FOLDER + "/classifierInput.csv", index=False)