In [1]:
import json
import pandas as pd

In [133]:
with open('filtered_data.json', 'r') as file:
    data = json.load(file)

In [165]:
df = pd.DataFrame(data)

In [156]:
len(df)

10101

In [167]:
df = df.drop('id', axis=1)
df.head()

Unnamed: 0,TaskTitle,ListTitle
0,rearrange closet,home
1,meeting tasks,school-work
2,taste of home,shopping
3,bring book in,default list
4,sociology paper,today


In [158]:
#Chech how many tags are in the dataset
variety = df['ListTitle'].value_counts()
print(len(variety))
print(variety.head(10))

293
ListTitle
default list    5393
to do           1367
work            1252
today            343
home             167
family           157
to do list       143
school           140
groceries        108
house             86
Name: count, dtype: int64


In [166]:
#Group tags
home_tags = ['family', 'chores', 'home', 'house projects', 'house', 'errands', 'projects', 'health', 'home improvement', 'car', 'house to do', 'household', 'life', 'around the house', 'home to do', 'new house', 'workout', 'house chores', 'makeup', 'morning routine', 'bedroom', 'home improvements', 'house stuff', 'cleaning', 'life admin', 'admin']
shopping_tags = ['shopping', 'groceries', 'packing', 'packing list', 'home depot', 'camping', 'camping list', 'to pack', 'trip', 'grocery', 'shopping', 'bunnings', 'to bring', 'bring', 'grocery list', 'food shopping', 'stuff to bring', 'trip packing list', 'trip list', 'target', 'camping checklist', 'ikea', 'things to pack', 'camping food', 'camping trip', 'camping equipment', 'makeup', 'bring from home', 'camping to do', 'gifts', 'walmart']
school_and_work = ['school-work', 'school', 'homework', 'social studies', 'school work', 'physics', 'marketing', 'school to do', 'learning', 'uni', 'assignments', 'to read', 'teaching', 'university', 'science', 'home work', 'tests', 'read', 'work', 'career', 'work to do']
to_tag = pd.DataFrame({"TaskTitle": [],
                      "ListTitle": []})

for i in range(len(df)):
    if df['ListTitle'][i] in home_tags:
        df['ListTitle'][i] = 'home'
    elif df['ListTitle'][i] in shopping_tags:
        df['ListTitle'][i] = 'shopping'
    elif df['ListTitle'][i] in school_and_work:
        df['ListTitle'][i] = 'school-work'
    else:
        untagged = df.iloc[[i]]
        to_tag = pd.concat([to_tag, untagged])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ListTitle'][i] = 'home'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ListTitle'][i] = 'school-work'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ListTitle'][i] = 'shopping'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ListTitle'][i] = 'home'
A value is trying to be set on a copy of a sl

In [159]:
#Lists of keywords
home_words = ['clean ', 'fix ']
shopping_words = ['buy ', 'get ', 'order ', 'shop']
studywork_words = ['email', 'test', 'assignment', 'study', 'train']
to_remove = ['call ', 'meet ', 'talk ']

In [168]:
#Guess a tag based on a keyword
for i in studywork_words:
    condition = (df['TaskTitle'].str.contains(i)) & (~df['ListTitle'].isin(['home', 'shopping']))
    df.loc[condition, 'ListTitle'] = 'school-work'

In [169]:
for i in home_words:
    condition = (df['TaskTitle'].str.contains(i)) & (~df['ListTitle'].isin(['school-work', 'shopping']))
    df.loc[condition, 'ListTitle'] = 'home'

In [170]:
for i in shopping_words:
    condition = (df['TaskTitle'].str.contains(i)) & (~df['ListTitle'].isin(['home', 'school-work']))
    df.loc[condition, 'ListTitle'] = 'shopping'

In [171]:
#Remove useless names
for i in to_remove:
    df = df[~df['TaskTitle'].str.contains(i)]

In [173]:
#Remove all untagged values
tags = ['home', 'school-work', 'shopping']
df = df[df['ListTitle'].isin(tags)]

In [186]:
len(df)

3599

In [175]:
#Saving a json file
with open('3cat_df.json', 'w', encoding='utf-8') as json_file:
    json.dump(df.to_dict(orient='records'), json_file, ensure_ascii=False, indent=4)

In [180]:
#imports for further preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

In [177]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

In [178]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

In [179]:
df['TaskTitle'] = df['TaskTitle'].apply(remove_stop_words)
df['TaskTitle'] = df['TaskTitle'].apply(lemmatize_text)

In [184]:
#Turning labels into numbers
#0 - home, 1 - school-work, 2 - shopping
label_encoder = LabelEncoder()
df['ListTitle'] = label_encoder.fit_transform(df['ListTitle'])
df.head()

Unnamed: 0,TaskTitle,ListTitle
1568,capex update,1
5171,buy lens cleaner,2
9883,jc prep,1
8361,pick shirt dry cleaner,0
7130,apply parking permit,1


In [185]:
#shuffling the df
df = df.sample(frac=1)
df.head()

Unnamed: 0,TaskTitle,ListTitle
8800,clean room bathroom,0
211,clean denture,0
4956,get salt water softener,2
1760,fix car cover,0
862,buy deo stick,2


In [187]:
#Saving a json file
with open('clean_df.json', 'w', encoding='utf-8') as json_file:
    json.dump(df.to_dict(orient='records'), json_file, ensure_ascii=False, indent=4)