# Import Libraries

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import scipy
import sklearn
import nltk
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Import feature selection Libraries
from sklearn.feature_selection import SelectKBest, chi2, f_regression, mutual_info_classif

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict
import math
%matplotlib inline
import seaborn as sns

import nltk
# Download the NLTK 'stopwords', 'punkt' and 'wordnet' packages
# (a no-op when they are already up to date, as the log output shows).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# English stopword list, stored as a set.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


-- Run Function  --

In [16]:
# Import and export data function
def import_data(folder, fileName):
    """Load ``<folder><fileName>.csv`` as a DataFrame.

    ``folder`` is concatenated as-is (no separator is inserted), and the first
    CSV column is used as the index.
    """
    csv_path = ''.join((folder, fileName, '.csv'))
    frame = pd.read_csv(csv_path, index_col=0)
    return frame

def export_data(df, namaFile):
    """Write ``df`` to ``<namaFile>.csv`` using pandas' default CSV settings."""
    target = namaFile + '.csv'
    df.to_csv(target)

In [17]:
# Dataset folder location
folder = 'C:/Users/ASUS/Documents/Learn Data Science/all_dataset/'

# Load the original (not yet cleaned) dataset
df_ori = import_data(folder, 'df_ori')
df_ori = df_ori.drop_duplicates() # drop duplicate rows

# Split the dataset by label (1: depression; 0: non-depression)
df_ori_dep = df_ori[df_ori.label == 1]
df_ori_nonDep = df_ori[df_ori.label == 0]

In [18]:
# Show the total number of rows and the number of rows for each label
print("Banyak data original:", len(df_ori))
print("Banyak data depresi:", len(df_ori_dep))
print("Banyak data non-depresi:", len(df_ori_nonDep))

Banyak data original: 654
Banyak data depresi: 343
Banyak data non-depresi: 311


In [19]:
# Preview the dataset: display 8 randomly sampled rows
df_ori.sample(8)

Unnamed: 0,text,label
347,Like if I ever see an image of someone with t...,1
154,I put a refill in on Monday. It’s Thursday an...,1
104,Diagnosed with Bipolar in 2011. Had some setba...,0
291,My momâs drawing of my fiancÃ© and I. We hav...,0
532,Minnesota has seen the least amount of sun in ...,0
57,I literally dread going back to school after ...,1
534,Finally removed some people that made me deepl...,0
388,How does alcohol affect your PTSD?,1


# Preprocessing Data

Fungsi preprocessing

In [20]:
# Convert all text to lowercase
def text_to_lowerCase(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

# Expand contractions
# Lower-case contraction -> expansion table used by expand_contractions.
_CONTRACTIONS = {
    "don't": "do not",
    "didn't": "did not",
    "doesn't": "does not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",

    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",

    "can't": "can not",
    "can't've": "cannot have",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",

    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "i'd": "i would",
    "i'd've": "i would have",

    "y'all": "you all",
    "y'all're": "you all are",
    "you're": "you are",
    "you've": "you have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you'd": "you would",
    "you'd've": "you would have",

    "we're": "we are",  # was 'we re' (typo)
    "we've": "we have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we'd": "we would",
    "we'd've": "we would have",

    "they're": "they are",
    "they've": "they have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they'd": "they would",
    "they'd've": "they would have",

    "he's": "he is",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",

    "she's": "she is",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",

    "it's": "it is",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",

    "that's": "that is",
    "that'd": "that would",
    "that'd've": "that would have",

    "there's": "there is",
    "there'd": "there would",
    "there'd've": "there would have",

    "what's": "what is",
    "what're": "what are",
    "what'd": "what would",
    "what've": "what have",

    # These four previously reused the "what..." patterns and never matched.
    "when's": "when is",
    "where's": "where is",
    "who's": "who is",
    "who'll": "who will",

    "will've": "will have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",

    "how'd": "how did",
    "how'd'y": "how do you",  # pattern was the malformed r"\bhow'd'yb"
    "how'll": "how will",
    "how's": "how is",

    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",

    "'cause": "because",
    "let's": "let us",
    "o'clock": "of the clock",
}

# One alternation, longest contraction first, so "hadn't've" is tried before
# "hadn't". `(?<!\w)` replaces the leading `\b` because `\b` can never match
# in front of the apostrophe that starts "'cause".
_CONTRACTION_RE = re.compile(
    r"(?<!\w)(?:"
    + "|".join(re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True))
    + r")\b"
)


def expand_contractions(data):
    """Expand English contractions in ``data`` and normalise its whitespace.

    Matching is case-sensitive and expects lower-cased text (``cleaning``
    lower-cases before calling this function).

    Parameters
    ----------
    data : str
        Input text.

    Returns
    -------
    str
        Text with every known contraction replaced by its expansion and all
        whitespace runs collapsed to single spaces (leading/trailing
        whitespace removed).
    """
    # Typographic apostrophes (U+2019) are treated like ASCII ones so that
    # "it’s" is expanded the same way as "it's".
    data = data.replace("\u2019", "'")

    # Repeat until nothing changes so that chains such as
    # "it'll've" -> "it will've" -> "it will have" are fully expanded.
    # Every replacement removes at least one apostrophe, so this terminates.
    replaced = 1
    while replaced:
        data, replaced = _CONTRACTION_RE.subn(
            lambda match: _CONTRACTIONS[match.group(0)], data
        )

    return " ".join(data.split())

# Remove URLs
def remove_urls(text):
    """Delete every ``http://``/``https://`` or ``www.`` token (up to the next whitespace)."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Remove numbers
def remove_numbers(text):
    """Return ``text`` with every run of digits removed.

    The pattern is a raw string: ``'\\d'`` in a normal string literal is an
    invalid escape sequence (a warning on current Python versions).
    """
    return re.sub(r'\d+', '', text)

# Remove special characters
def remove_special_characters(text):
    """Return ``text`` keeping only ASCII letters, digits and whitespace.

    The pattern is a raw string: ``'\\s'`` in a normal string literal is an
    invalid escape sequence (a warning on current Python versions).
    """
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Remove extra whitespaces
def remove_extraspaces(text):
    """Collapse each run of consecutive space characters into a single space."""
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

# Tokenize text
def tokenized_text(text):
    """Split ``text`` on whitespace and return the list of tokens."""
    tokens = text.split(sep=None)
    return tokens

# Combine all preprocessing functions
def cleaning(text):
    """Run ``text`` through every preprocessing step, in order, and return the result.

    Order: lowercase -> expand contractions -> remove URLs -> remove numbers
    -> remove special characters -> collapse repeated spaces.
    """
    pipeline = (
        text_to_lowerCase,
        expand_contractions,
        remove_urls,
        remove_numbers,
        remove_special_characters,
        remove_extraspaces,
    )
    for step in pipeline:
        text = step(text)
    return text

# Remove stopwords from the dataset's text column and count the remaining words
def del_sw(dataset, stop_words):
    """Return a copy of ``dataset`` with stopwords removed from ``text``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``text`` column of whitespace-separated strings.
    stop_words : iterable of str
        Words to drop (exact, case-sensitive match on each token).

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``text`` holds the remaining tokens joined by
        single spaces and ``wc_count`` holds the number of remaining tokens.
        The input frame is left untouched, so calling this on one dataset can
        no longer alter another variable that refers to the same frame.
    """
    # Build the lookup set once instead of converting to a list for every token.
    blocked = frozenset(stop_words)

    kept_tokens = [
        [word for word in raw.split() if word not in blocked]
        for raw in dataset.text
    ]

    result = dataset.copy()
    # Plain lists are assigned positionally, as in the original implementation.
    result['text'] = [' '.join(tokens) for tokens in kept_tokens]
    result['wc_count'] = [len(tokens) for tokens in kept_tokens]

    return result

<h3>4 Jenis dataset</h3>
<br>
<ul>
    <li>Dataset tanpa preprocessing + tanpa stopword removal: <b>df_ori</b> </li>
    <li>Dataset tanpa preprocessing + dengan stopword removal: <b>df_pre0</b> </li>
    <li>Dataset dengan preprocessing + tanpa stopword removal: <b>df_pre1</b> </li>
    <li>Dataset dengan preprocessing + dengan stopword removal: <b>df_pre2</b> </li>
</ul>

In [21]:
# Copy df_ori into the new datasets that will be preprocessed
df_pre0 = df_ori.copy()
df_pre1 = df_ori.copy()

# df_pre0: original text with stopwords removed (no cleaning)
# df_pre1: text cleaned with cleaning(), stopwords kept
df_pre0 = del_sw(df_pre0, stop_words)
df_pre1['text'] = df_ori.text.apply(cleaning)

# Drop duplicate rows from each dataset.
# df_pre0 must be de-duplicated from itself: the previous
# `df_pre0 = df_pre1.drop_duplicates()` replaced it with the cleaned df_pre1 data.
df_ori = df_ori.drop_duplicates()
df_pre0 = df_pre0.drop_duplicates()
df_pre1 = df_pre1.drop_duplicates()

Buat df_pre2 sebagai dataset baru dengan preprocessing dan dengan stopword removal

In [22]:
# Build df_pre2 by removing stopwords from a copy of df_pre1.
# The copy (not df_pre1 itself) is passed so that df_pre1 always keeps its
# stopwords; previously df_pre1 was passed directly and the copy was discarded.
df_pre2 = del_sw(df_pre1.copy(), stop_words)

Ambil index dari df_pre2 untuk dicocokkan pada semua dataframe

In [23]:
# Take the index of every df_pre2 row, excluding rows with fewer than 3 words
idx_all = df_pre2[df_pre2.wc_count >= 3].index

Cocokkan keempat dataset berdasarkan index dari df_pre2

In [24]:
# Align all four dataframes on idx_all.
# Keep only the labels that exist in every frame, so .loc cannot raise a
# KeyError if de-duplication dropped different rows in different frames.
idx_all = idx_all[
    idx_all.isin(df_ori.index)
    & idx_all.isin(df_pre0.index)
    & idx_all.isin(df_pre1.index)
]

df_ori = df_ori.loc[idx_all]
df_pre0 = df_pre0.loc[idx_all]  # was df_pre1.loc[...], which overwrote df_pre0 with df_pre1
df_pre1 = df_pre1.loc[idx_all]
df_pre2 = df_pre2.loc[idx_all]

-- Periksa panjang keempat dataset --

In [25]:
# Make sure all four datasets have the same length
print("Apakah semua panjang semua dataset sama?", len(df_ori) == len(df_pre0) == len(df_pre1) == len(df_pre2))

# Make sure all four datasets have the same index
print("Apakah semua index dataset sama?", list(df_ori.index) == list(df_pre0.index) == list(df_pre1.index) == list(df_pre2.index)) 

Apakah semua panjang semua dataset sama? True
Apakah semua index dataset sama? True


In [26]:
# Reset the index of all four datasets to 0..n-1 (the old index is discarded)
df_ori = df_ori.reset_index(drop=True)
df_pre0 = df_pre0.reset_index(drop=True)
df_pre1 = df_pre1.reset_index(drop=True)
df_pre2 = df_pre2.reset_index(drop=True)

-- Export semua dataset --

In [27]:
# Export all datasets
# export_data(df_ori, 'df_ori')
# export_data(df_pre0, 'df_pre0')
# export_data(df_pre1, 'df_pre1')
# export_data(df_pre2, 'df_pre2')

# Create new dataset with stemming and lemmatization

In [28]:
# Module-level Porter stemmer and WordNet lemmatizer instances used by stem_lemm
stemmer = PorterStemmer()
lemmatizer= WordNetLemmatizer()

-- Buat fungsi yang mengembalikan dataset yang telah di-stemming dan di-lemmatize --

In [29]:
def stem_lemm(df):
    """Build stemmed and lemmatized versions of ``df``.

    Each text is split on whitespace; every token is passed through the
    module-level ``stemmer`` (first frame) or ``lemmatizer`` (second frame)
    and the tokens are re-joined with single spaces.

    Returns a two-element list ``[stemmed_frame, lemmatized_frame]``; each
    frame has the transformed ``text`` column plus the ``label`` values of
    ``df``.
    """
    token_lists = df.text.apply(lambda sentence: sentence.split())
    labels = list(df.label)

    frames = []
    for normalise in (stemmer.stem, lemmatizer.lemmatize):
        # Transform every token, then join the tokens back into one string per row
        joined = token_lists.apply(
            lambda words: ' '.join([normalise(word) for word in words])
        )
        frame = pd.DataFrame(joined)
        frame['label'] = labels
        frames.append(frame)

    return frames

In [30]:
# Stem and lemmatize each preprocessed dataset (df_pre1 & df_pre2)
pre1_group = stem_lemm(df_pre1)
pre2_group = stem_lemm(df_pre2)  # was stem_lemm(df_pre1), which duplicated pre1_group

[0] = data yang telah di-stemmed
<br>
[1] = data yang telah di-lemmatized

-- Export keempat dataset yang telah di-stemming dan di-lemmatize --

In [31]:
# Export all_dataframes
# export_data(pre1_group[0], 'pre1_stemmed')
# export_data(pre1_group[1], 'pre1_lemma')

# export_data(pre2_group[0], 'pre2_stemmed')
# export_data(pre2_group[1], 'pre2_lemma')

# Split into train and test with ratio 70:30

In [32]:
# Dataset folder location
folder = 'C:/Users/ASUS/Documents/Learn Data Science/all_dataset/'

# Load the four dataset variants
df_ori = pd.read_csv(folder+'/df_ori.csv', index_col=0)
df_pre0 = pd.read_csv(folder+'/df_pre0.csv', index_col=0)  # was reading df_pre1.csv
df_pre1 = pd.read_csv(folder+'/df_pre1.csv', index_col=0)
df_pre2 = pd.read_csv(folder+'/df_pre2.csv', index_col=0)

# Load the stemmed and lemmatized version of each preprocessed dataset
pre1_stemmed = pd.read_csv(folder+'/pre1_stemmed.csv', index_col=0)
pre1_lemma = pd.read_csv(folder+'/pre1_lemma.csv', index_col=0)

pre2_stemmed = pd.read_csv(folder+'/pre2_stemmed.csv', index_col=0)
pre2_lemma = pd.read_csv(folder+'/pre2_lemma.csv', index_col=0)

Split dataset df_ori menjadi train dan test sesuai ratio

In [33]:
# Import libraries
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict
import math

In [34]:
# Take the values of the text and label columns
X = df_ori['text'].values
y = df_ori['label'].values

# Split the values into train and test sets (30% test, fixed random_state for reproducibility)
X_train_ori, X_test_ori, y_train_ori, y_test_ori = train_test_split(X, y, test_size=0.3, random_state=42)

In [35]:
# Slice df_ori into two datasets (train & test) based on the split texts.
# NOTE(review): rows are matched by text value, so if two rows share the same
# text they are selected for both train and test. Consider splitting on the
# index instead of matching on text.
train_ori = df_ori[df_ori.text.isin(X_train_ori)]
test_ori = df_ori[df_ori.text.isin(X_test_ori)]

# Index labels of the rows that ended up in each split
idx_train = list(train_ori.index)
idx_test = list(test_ori.index)

# Split the other datasets on the same rows.
# idx_train / idx_test hold index *labels*, so rows are selected with .loc
# (label-based). The previous .iloc (positional) lookup was only correct
# while every frame's index happened to be exactly 0..n-1.
train_pre0 = df_pre0.loc[idx_train]
test_pre0 = df_pre0.loc[idx_test]

train_pre1 = df_pre1.loc[idx_train]
test_pre1 = df_pre1.loc[idx_test]

train_pre2 = df_pre2.loc[idx_train]
test_pre2 = df_pre2.loc[idx_test]

train_pre1_stemmed = pre1_stemmed.loc[idx_train]
test_pre1_stemmed = pre1_stemmed.loc[idx_test]

train_pre1_lemma = pre1_lemma.loc[idx_train]
test_pre1_lemma = pre1_lemma.loc[idx_test]

train_pre2_stemmed = pre2_stemmed.loc[idx_train]
test_pre2_stemmed = pre2_stemmed.loc[idx_test]

train_pre2_lemma = pre2_lemma.loc[idx_train]
test_pre2_lemma = pre2_lemma.loc[idx_test]

-- Export semua dataset hasil split --

In [36]:
# Export all train and test datasets
# export_data(train_ori, 'train_ori')
# export_data(test_ori, 'test_ori')

# export_data(train_pre0, 'train_pre0')
# export_data(test_pre0, 'test_pre0')

# export_data(train_pre1, 'train_pre1')
# export_data(test_pre1, 'test_pre1')

# export_data(train_pre1_stemmed, 'train_pre1_stemmed')
# export_data(test_pre1_stemmed, 'test_pre1_stemmed')

# export_data(train_pre1_lemma, 'train_pre1_lemma')
# export_data(test_pre1_lemma, 'test_pre1_lemma')

# export_data(train_pre2, 'train_pre2')
# export_data(test_pre2, 'test_pre2')

# export_data(train_pre2_stemmed, 'train_pre2_stemmed')
# export_data(test_pre2_stemmed, 'test_pre2_stemmed')

# export_data(train_pre2_lemma, 'train_pre2_lemma')
# export_data(test_pre2_lemma, 'test_pre2_lemma')

In [37]:
# Load the dummy data: a headerless, semicolon-delimited CSV whose two columns
# are named 'text' and 'label' on read, then display it.
loc = 'D:/College Stuff/Implementasi TA/'
dummy_data = pd.read_csv(loc+'data_dummy2.csv', header=None, delimiter=';', names=['text', 'label'])
dummy_data

Unnamed: 0,text,label
0,"Just wanted to say I am here for anybody, plea...",1
1,I'm here because nobody else gives a shit. Dia...,1
2,My old man is so happy about completing this c...,0
3,After years of struggling with depression and ...,0
4,I’ve been struggling w my mental health lately...,0
