In [None]:
import pandas as pd

# Feature Engineering Example

Price per weight

In [None]:
# Toy catalogue: one row per item with its price and its weight.
data = pd.DataFrame(
    {
        "item_id": ["A1", "B1", "C2", "D3"],
        "price ($)": [95.4, 31.7, 52.5, 21.9],
        "weight (kg)": [12.2, 4.2, 9.3, 6.4],
    }
)
data

In [None]:
# Derived feature: price divided by weight, i.e. dollars per kilogram.
harga, berat = data["price ($)"], data["weight (kg)"]
data["price per weight ($/kg)"] = harga / berat
data

Datetime

In [None]:
# Ten sample timestamps, stored as text first and then parsed into datetimes.
data = pd.DataFrame(
    {
        "time": [
            '2021-08-19 13:58:08',
            '2021-10-13 10:40:05',
            '2021-03-22 09:59:51',
            '2021-01-18 16:04:28',
            '2021-10-03 00:31:52',
            '2021-04-24 16:39:15',
            '2021-04-14 06:05:49',
            '2021-01-18 12:04:23',
            '2021-05-12 23:47:47',
            '2021-02-10 18:01:14',
        ]
    }
)
# An explicit format avoids per-row format inference.
data["time"] = pd.to_datetime(data["time"], format='%Y-%m-%d %H:%M:%S')

In [None]:
# Calendar components taken from the parsed timestamp.
# Note: the "date" column holds the day of the month (dt.day).
data["year"], data["month"], data["date"] = (
    data["time"].dt.year,
    data["time"].dt.month,
    data["time"].dt.day,
)

# Whole days elapsed since the reference date 2020-03-02
# (presumably the first reported COVID-19 case the column name refers to).
selisih = data["time"] - pd.to_datetime("2020-03-02")
data["days_since_first_covid"] = selisih.dt.days

data

In [None]:
# Sanity check of the encoding: pandas numbers weekdays Monday=0 ... Sunday=6,
# and 2022-06-11 was a Saturday, so this prints 5. It is printed explicitly
# because only the last expression of a cell is displayed automatically.
print(pd.to_datetime("2022-06-11").dayofweek)

data["day_of_week"] = data["time"].dt.dayofweek
# Saturday (5) and Sunday (6) are the only values >= 5; the comparison already
# yields a boolean column, so no per-row Python loop is needed.
data["is_weekend"] = data["day_of_week"] >= 5

data

Body Mass Index

In [None]:
# Sample body measurements: weight in kilograms and height in centimetres.
data = pd.DataFrame(
    {
        "weight (kg)": [
            73.84, 68.78, 74.11, 71.73,
            69.88, 67.25, 68.78, 68.34, 67.01,
        ],
        "height (cm)": [
            201.89, 162.31, 212.74, 220.04,
            206.34, 152.21, 183.92, 167.97, 175.92,
        ],
    }
)
data

In [None]:
# BMI = weight [kg] / (height [m]) ** 2; heights are stored in cm, hence the / 100.
tinggi_m = data["height (cm)"] / 100
data["BMI"] = data["weight (kg)"] / tinggi_m**2
data

In [None]:
def get_bmi_class(bmi):
    """Map a BMI value to one of three labels.

    Returns "kurus" for bmi <= 18.4, "normal" for bmi <= 25, and "gemuk"
    otherwise. A NaN input fails both comparisons and therefore also
    yields "gemuk".
    """
    # Upper bounds are inclusive and checked in ascending order.
    for batas_atas, label in ((18.4, "kurus"), (25, "normal")):
        if bmi <= batas_atas:
            return label
    return "gemuk"

# Label every row; the function is passed directly, no wrapping lambda needed.
data["class_BMI"] = data["BMI"].apply(get_bmi_class)
data

# Case: Worker Stress Dataset

## Preparation

In [None]:
# Sources:
# https://medium.com/@ksnugroho/dasar-text-preprocessing-dengan-python-a4fa52608ffe
# https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk
# https://stackoverflow.com/questions/64719706/cleaning-twitter-data-pandas-python

import pandas as pd
import numpy as np

import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides pandas / scikit-learn deprecation and
# chained-assignment warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Download the dataset here:
# https://drive.google.com/file/d/1lI_keXUs1NwFrjL4wfQsQ3YeJGkYja7m/view?usp=sharing
# The path is relative to the notebook's working directory, so the file must be
# saved as data/data_sesi26.csv before this cell is run.
df = pd.read_csv('data/data_sesi26.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# List the available column names.
df.columns

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
df.describe()

In [None]:
from collections import Counter
# Frequency of every distinct value in each categorical column, one Counter per line.
for kolom in ("jenis_kelamin", "pekerjaan", "is_menikah", "is_merokok", "pendidikan"):
    print(Counter(df[kolom]))

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Column groups used by the imputation step: numeric vs. categorical features.
fitur_numerikal = [
    "umur",
    "gaji",
    "berat",
    "tinggi",
]
fitur_kategorikal = [
    "jenis_kelamin",
    "pekerjaan",
    "is_menikah",
    "is_merokok",
    "pendidikan",
]

In [None]:
# Inspect the age distribution
df["umur"].plot(kind="hist")

In [None]:
# Remove people below the legal age: keep only rows with umur strictly greater
# than 18. Rows whose umur is missing are dropped as well, because NaN > 18
# evaluates to False.
# NOTE(review): confirm whether 18-year-olds should be kept (>= 18).
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice of the original (the usual source of
# SettingWithCopyWarning, which the global warnings filter would hide).
df = df[df["umur"] > 18].copy()

In [None]:
# Fill missing values: numeric columns get their column mean,
# categorical columns get the placeholder label "Unknown".
# NOTE(review): the means are computed on the whole frame, before the train/test
# split done later in the notebook, so held-out rows influence the imputed values.
rata_rata_numerik = df[fitur_numerikal].mean()
df[fitur_numerikal] = df[fitur_numerikal].fillna(rata_rata_numerik)

label_kosong = "Unknown"
df[fitur_kategorikal] = df[fitur_kategorikal].fillna(label_kosong)

In [None]:
# Re-count missing values per column after the fill step.
df.isnull().sum()

## Feature Engineering

In [None]:
# Preview the cleaned frame before deriving new features.
df.head()

In [None]:
# Salary relative to age.
df["kualitas_gaji"] = df["gaji"] / df["umur"]

# Body mass index.
# Assumes berat is in kg and tinggi in cm (the / 100 converts to metres) — TODO confirm.
tinggi_meter = df["tinggi"] / 100
df["bmi"] = df["berat"] / tinggi_meter**2

In [None]:
# Bucket each BMI value: below 17 -> "kurus", below 25 -> "normal", otherwise "gemuk".
# NOTE(review): these cut-offs (< 17, < 25) differ from get_bmi_class defined
# earlier in this notebook (<= 18.4, <= 25) — confirm which is intended.
kat_bmi = [
    "kurus" if bmi < 17 else "normal" if bmi < 25 else "gemuk"
    for bmi in df["bmi"].values
]
df['kategori_bmi'] = kat_bmi

In [None]:
# Show only the three engineered columns.
df[["kualitas_gaji", "bmi", "kategori_bmi"]]

In [None]:
# Preview the frame including the newly added feature columns.
df.head()

In [None]:
# Count the occurrences of each value of the target column ever_stress.
Counter(df["ever_stress"])

In [None]:
# Separate the target (ever_stress) from the predictor columns.
y = df["ever_stress"]
X = df.drop(columns=["ever_stress"])

In [None]:
# One-hot encode the non-numeric columns, then replace any remaining NaN with 0.
X = pd.get_dummies(X).fillna(0)

In [None]:
# Preview the encoded feature matrix.
X.head()

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. stratify=y keeps the class proportions
# of y in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
    stratify=y,
)

In [None]:
# Class distribution of the training labels.
y_train.value_counts()

## Imbalanced Dataset Solution

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Random under-sampling of the training split only. sampling_strategy=1 requests
# a 1:1 minority/majority ratio after resampling (binary target assumed).
# random_state is fixed to the same seed as the train/test split so the sampled
# rows are identical after a kernel restart.
undersampling = RandomUnderSampler(sampling_strategy=1, random_state=123)
X_under, y_under = undersampling.fit_resample(X_train,y_train)

In [None]:
# Class counts after under-sampling.
Counter(y_under)

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Random over-sampling of the training split only. sampling_strategy=1 requests
# a 1:1 minority/majority ratio after resampling (binary target assumed).
# random_state is fixed to the same seed as the train/test split so the
# duplicated rows are identical after a kernel restart.
oversampling = RandomOverSampler(sampling_strategy=1, random_state=123)
X_over, y_over = oversampling.fit_resample(X_train,y_train)

In [None]:
# Class counts after over-sampling.
Counter(y_over)

In [None]:
from imblearn.over_sampling import SMOTE
# SMOTE over-sampling of the training split only. sampling_strategy=1 requests
# a 1:1 minority/majority ratio after resampling (binary target assumed).
# random_state is fixed to the same seed as the train/test split so the
# synthetic rows are identical after a kernel restart.
# NOTE(review): SMOTE interpolates between rows, so one-hot columns of synthetic
# rows may hold fractional values — SMOTENC may be a better fit; confirm.
smote = SMOTE(sampling_strategy=1, random_state=123)
X_smote, y_smote = smote.fit_resample(X_train,y_train)

In [None]:
# Class counts after SMOTE.
Counter(y_smote)

In [None]:
# Show the last 10 rows of the SMOTE-resampled features
# (presumably where the synthetic rows are appended — TODO confirm).
X_smote.tail(10)

# Handling Text Data

## Cleaning

In [None]:
import nltk
# Resources used below: the stopword lists and the Punkt tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt models from the separate 'punkt_tab'
# resource; without it word_tokenize / sent_tokenize raise LookupError there.
nltk.download('punkt_tab')

In [None]:
# Lowercase (case folding)
kalimat = (
    "Berikut ini adalah 5 negara dengan pendidikan terbaik di dunia adalah "
    "Korea Selatan, Jepang, Singapura, Hong Kong, dan Finlandia."
)
lower_case = kalimat.lower()
print(lower_case)

In [None]:
# Remove symbols
import string
kalimat = "Ini &adalah [contoh] kalimat? {dengan} tanda. baca?!!"
# Translation table that deletes every ASCII punctuation character.
penghapus_simbol = str.maketrans("", "", string.punctuation)
hasil = kalimat.translate(penghapus_simbol)
print(string.punctuation)
print(hasil)

In [None]:
# Remove numbers
# https://regexr.com/
import re # regular expression module
kalimat = "i have 32 shoes and 51 shirts"
# pattern.sub(replacement, string): delete every run of digits
hasil = re.compile(r"\d+").sub("", kalimat)
print(hasil)
# Splitting on whitespace and re-joining collapses the doubled spaces left behind.
kata = hasil.split()
print(kata)
print(" ".join(kata))

In [None]:
# Translate numbers into words
import inflect
kalimat = "i have 3 shoes and 5 shirts"
p = inflect.engine()
# Spell out every purely numeric token; all other tokens pass through unchanged.
hasil = " ".join(
    p.number_to_words(kata) if kata.isnumeric() else kata
    for kata in kalimat.split()
)
print(hasil)

In [None]:
# Remove mentions and hashtags
kalimat = "halo @kawanku apa kabar @Teman? #pertemanan #TemanLama #Angkatan2013"
# \w+ also covers underscores and non-ASCII letters, so handles such as
# @user_name or tags such as #hari_ini are removed completely instead of being
# cut off at the first character outside [A-Za-z0-9].
tanpa_hashtag = re.sub(r"#\w+", "", kalimat)
tanpa_mention = re.sub(r"@\w+", "", kalimat)
print(tanpa_hashtag) # without hashtags
print(tanpa_mention) # without mentions

In [None]:
# Remove URLs
kalimat = "bisa cek disini http://www.google.com gan"
# Match a token that starts with http://, https:// or "www." and runs up to the
# next whitespace. The previous pattern also contained an "@" alternative (which
# silently removed mentions and e-mail addresses), "http?://" (which matches
# "htt://") and a bare "www" prefix (which removed any word starting with www).
hasil = re.sub(r"(?:https?://|www\.)\S+", "", kalimat)
print(hasil)

In [None]:
# Remove leading and trailing whitespace
kalimat = " \t ini kalimat contoh\t \n"
hasil = kalimat.lstrip().rstrip()
print(hasil)

In [None]:
# split() without arguments splits on runs of whitespace, so repeated spaces
# do not produce empty tokens.
"bisa  cek   disini      gan".split()

In [None]:
# Split on whitespace and re-join with single spaces to collapse repeated spaces.
" ".join("bisa  cek   disini  gan".split())

## Tokenizing

In [None]:
# Naive tokenization: split on whitespace only, so punctuation stays attached
# to the neighbouring word ("transaksi," and "online.").
kalimat = "Andi kerap melakukan transaksi, rutin secara daring atau online."
pisah = kalimat.split(None)
print(pisah)

In [None]:
# import word_tokenize from the nltk module
from nltk.tokenize import word_tokenize

# Same sentence as in the whitespace-split example, for comparison.
kalimat = "Andi kerap melakukan transaksi, rutin secara daring atau online."

# Tokenize the sentence with NLTK and show the resulting tokens.
tokens = word_tokenize(kalimat)
print(tokens)

In [None]:
# import sent_tokenize from the nltk module
from nltk.tokenize import sent_tokenize
# Adjacent string literals are concatenated by the parser. Unlike a backslash
# continuation inside the literal, this does not embed the next line's
# indentation into the text.
kalimat = (
    "Andi kerap melakukan transaksi rutin secara daring atau online. "
    "Menurut Andi belanja online lebih praktis & murah."
)

# Split the paragraph into sentences using the name imported above.
tokens = sent_tokenize(kalimat)
print(tokens)

In [None]:
from nltk.corpus import stopwords

# Clean the text first
kalimat = "Andi kerap melakukan transaksi rutin secara daring atau online. Menurut Andi belanja online lebih praktis & murah."
# Strip punctuation, then lowercase
tanpa_simbol = kalimat.translate(str.maketrans('','',string.punctuation))
kalimat = tanpa_simbol.lower()

# Tokenizer
tokens = word_tokenize(kalimat)

# A set gives constant-time membership checks against the Indonesian stopword list.
listStopword = set(stopwords.words('indonesian'))
# Partition the tokens, keeping their original order:
# `stops` holds the stopwords found, `removed` holds the tokens that remain.
stops = [t for t in tokens if t in listStopword]
removed = [t for t in tokens if t not in listStopword]
print(removed)
print(stops)


In [None]:
# list(listStopword)[100:120]

In [None]:
!pip install Sastrawi

In [None]:
# Stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Build the Sastrawi stemmer once; it is reused by the following cell.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

kalimat = "Andi kerap melakukan transaksi rutin secara daring atau online. Menurut Andi belanja online lebih praktis & murah."
# Stem the whole sentence in one call and print the result.
hasil = stemmer.stem(kalimat)
print(hasil)

In [None]:
# Several affixed forms built on "dengar", passed through the same stemmer
# to compare the stems it produces.
kalimat = "mendengarkan didengarkan mendengar dengarkan didengar"
hasil = stemmer.stem(kalimat)
print(hasil)

## Vectorization (Bag of Words and TF-IDF)

In [None]:
# Three short example documents used for the vectorization demos.
dt = pd.DataFrame(
    {
        "text": [
            "We love dogs! :*",
            "We hate dogs and knitting :(",
            "Knitting is our hobby and our passion :)",
        ]
    }
)

In [None]:
# Display the example documents.
dt

In [None]:
def cleaning_tokenizing(txt):
    """Lowercase ``txt``, delete ASCII punctuation, and tokenize the result.

    Args:
        txt: The raw text to process (a ``str``).

    Returns:
        The value returned by ``nltk.tokenize.word_tokenize`` for the
        cleaned text.
    """
    # Lowercase first, then drop every character listed in string.punctuation.
    tanpa_simbol = txt.lower().translate(str.maketrans("", "", string.punctuation))
    return nltk.tokenize.word_tokenize(tanpa_simbol)
# Apply the cleaner to every document; the result is displayed, not stored.
dt["text"].apply(cleaning_tokenizing)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag of words: learn the vocabulary from the raw text column and count
# term occurrences per document.
cv = CountVectorizer()
text_counts = cv.fit_transform(dt["text"])
# Convert the result to a dense table with one column per vocabulary term.
pd.DataFrame(text_counts.toarray(), columns = cv.get_feature_names_out())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF on the same documents. Note that `cv` and `text_counts` are rebound
# here: from this cell on they hold the TF-IDF vectorizer and its weights,
# not the raw counts.
cv = TfidfVectorizer()
text_counts= cv.fit_transform(dt["text"])
# Convert the result to a dense table with one column per vocabulary term.
pd.DataFrame(text_counts.toarray(), columns = cv.get_feature_names_out())