# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import scipy
import sklearn
import math
%matplotlib inline
import seaborn as sns
from collections import Counter, defaultdict

# Text Library
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resource packages named below; this runs every time the
# cell is executed (NLTK reports "already up-to-date" when they are present).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# English stop-word list, stored as a set.
stop_words = set(stopwords.words('english'))

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Splitting Data
from sklearn.model_selection import train_test_split

# Import library for the TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import feature selection Libraries
from sklearn.feature_selection import SelectKBest, chi2, f_regression, mutual_info_classif

# Algorithm
from sklearn.naive_bayes import MultinomialNB

# Evaluation
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Buat semua fungsi yang akan digunakan

In [2]:
# Load a dataset from a CSV file
def import_data(folder, namaFile):
    """Read ``<folder><namaFile>.csv`` into a DataFrame.

    The three pieces are concatenated as-is (no path separator is inserted),
    so ``folder`` must already end with one. The first CSV column is used as
    the DataFrame index.

    Parameters
    ----------
    folder : str
        Directory prefix, including its trailing separator.
    namaFile : str
        File name without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
    """
    csv_path = ''.join((folder, namaFile, '.csv'))
    return pd.read_csv(csv_path, index_col=0)
# Write a dataset to a CSV file
def export_data(dataset, namaFile):
    """Save ``dataset`` as ``<namaFile>.csv`` (index column included).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to write.
    namaFile : str
        Target path without the ``.csv`` extension.

    Returns
    -------
    None
        The return value of ``DataFrame.to_csv`` when a path is given.
    """
    target_path = ''.join((namaFile, '.csv'))
    return dataset.to_csv(target_path)

In [3]:
# Extract word features from a dataset; the output is a DataFrame of words and their frequencies
def ext_feature(dataset):
    """Count how often each whitespace-separated token occurs in ``dataset.text``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``text`` column. Rows whose text is missing (NaN/None)
        are skipped instead of raising.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``freq`` (total occurrences over all rows),
        sorted alphabetically by ``word`` with a fresh 0..n-1 index.
    """
    texts = dataset['text'].dropna()
    words = [w for doc in texts for w in str(doc).split()]

    # dtype=object keeps an empty corpus from producing a float Series.
    counts = pd.Series(words, dtype=object).value_counts()

    # Name the columns explicitly: the label of the value_counts() result
    # differs between pandas versions (0 before 2.0, 'count' from 2.0 on),
    # so renaming by the old default label is not reliable.
    words_df = counts.rename_axis('word').reset_index(name='freq')

    return words_df.sort_values(by='word').reset_index(drop=True)

# Build the duplicate-free vocabulary of both word tables
def get_all_w(dep_word, nonDep_word):
    """Return the sorted union of the ``word`` columns of two frames.

    Parameters
    ----------
    dep_word, nonDep_word : pandas.DataFrame
        Each must expose a ``word`` column.

    Returns
    -------
    pandas.DataFrame
        Single column ``word`` holding every distinct word once, sorted
        ascending, with a fresh 0..n-1 index.
    """
    vocabulary = set(dep_word.word)
    vocabulary.update(nonDep_word.word)

    ordered = pd.DataFrame(list(vocabulary), columns=['word']).sort_values(by='word')
    return ordered.reset_index(drop=True)

# Input: dataset; Output: DataFrame of words with their frequency in each label
def ext_final_ft(dataset):
    """Build a per-label word-frequency table for ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``text`` and ``label`` columns; rows with ``label == 1``
        and ``label == 0`` are counted separately.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word (sorted), with columns ``word``,
        ``freq_dep`` (occurrences in label-1 texts) and ``freq_nonDep``
        (occurrences in label-0 texts). A word absent from one label gets 0.
        Note these are total token occurrences as counted by ``ext_feature``,
        not numbers of documents.
    """
    # Word counts for each label separately
    data_dep = ext_feature(dataset[dataset.label == 1])
    data_nonDep = ext_feature(dataset[dataset.label == 0])

    # Combined vocabulary of both labels
    all_words = get_all_w(data_dep, data_nonDep)

    # Look the counts up by word rather than assigning them positionally, so
    # the result does not depend on both tables sharing the same sort order.
    dep_freq = data_dep.set_index('word')['freq']
    nonDep_freq = data_nonDep.set_index('word')['freq']

    all_words['freq_dep'] = all_words['word'].map(dep_freq).fillna(0).astype('int64')
    all_words['freq_nonDep'] = all_words['word'].map(nonDep_freq).fillna(0).astype('int64')

    return all_words

Import Semua Dataset

In [4]:
# Location of the dataset folder
folder = 'C:/Users/ASUS/Documents/Learn Data Science/all_dataset/'
# Load the four full dataset variants
df_ori = import_data(folder, 'df_ori')
df_pre0 = import_data(folder, 'df_pre0')
df_pre1 = import_data(folder, 'df_pre1')
df_pre2 = import_data(folder, 'df_pre2')

# Location of the train/test splits of each dataset variant
ext_f = folder+'train_test/'


def _load_split(variant):
    """Load the (train, test) pair stored under ``<ext_f><variant>/``.

    Files are expected at ``<variant>/train_<variant>.csv`` and
    ``<variant>/test_<variant>.csv``. ``ext_f`` already ends with a
    separator, so none is prepended here (the earlier calls produced a
    doubled ``//`` in every path).
    """
    return (import_data(ext_f, '{0}/train_{0}'.format(variant)),
            import_data(ext_f, '{0}/test_{0}'.format(variant)))


train_ori, test_ori = _load_split('ori')
train_pre0, test_pre0 = _load_split('pre0')
train_pre1, test_pre1 = _load_split('pre1')
train_pre2, test_pre2 = _load_split('pre2')

# pre1: stemmed and lemmatized variants
train_pre1_stemmed, test_pre1_stemmed = _load_split('pre1_stemmed')
train_pre1_lemma, test_pre1_lemma = _load_split('pre1_lemma')

# pre2: stemmed and lemmatized variants
train_pre2_stemmed, test_pre2_stemmed = _load_split('pre2_stemmed')
train_pre2_lemma, test_pre2_lemma = _load_split('pre2_lemma')

In [5]:
# Show one randomly chosen text from the label-0 rows of train_ori
train_ori.loc[train_ori['label'] == 0, 'text'].sample(frac=0.5).head(1).tolist()

["I'm a final Semester MBA student who just got placed in his first job with an annual 600,000 USD package at a firm specializing in Artificial Intelligence."]

# Ekstrak semua fitur

-- Ekstrak semua fitur kata dari train&test masing2 dataset --

In [6]:
# Per-label word-frequency tables for the train and test split of every
# dataset variant; each pair is computed train first, then test.

# df_ori
wrd_train_ori, wrd_test_ori = map(ext_final_ft, (train_ori, test_ori))

# df_pre0
wrd_train_pre0, wrd_test_pre0 = map(ext_final_ft, (train_pre0, test_pre0))

# df_pre1
wrd_train_pre1, wrd_test_pre1 = map(ext_final_ft, (train_pre1, test_pre1))

# df_pre2
wrd_train_pre2, wrd_test_pre2 = map(ext_final_ft, (train_pre2, test_pre2))

#========= stemmed and lemma ============#

# df_pre1_stemmed
wrd_train_pre1_stemmed, wrd_test_pre1_stemmed = map(
    ext_final_ft, (train_pre1_stemmed, test_pre1_stemmed))

# df_pre1_lemma
wrd_train_pre1_lemma, wrd_test_pre1_lemma = map(
    ext_final_ft, (train_pre1_lemma, test_pre1_lemma))

# df_pre2_stemmed
wrd_train_pre2_stemmed, wrd_test_pre2_stemmed = map(
    ext_final_ft, (train_pre2_stemmed, test_pre2_stemmed))

# df_pre2_lemma
wrd_train_pre2_lemma, wrd_test_pre2_lemma = map(
    ext_final_ft, (train_pre2_lemma, test_pre2_lemma))

# Export semua fitur kata menjadi file csv

In [7]:
# File-name prefix for the exported word-frequency tables. It is a relative
# prefix, so export_data would write into the current working directory.
m_name = 'word_'

# NOTE: every export below is disabled (commented out); uncomment the lines
# for the tables that should be written. There is no export line for the
# pre0 tables.

# df_ori
# export_data(wrd_train_ori, m_name+'train_ori')
# export_data(wrd_test_ori, m_name+'test_ori')

# df_pre1
# export_data(wrd_train_pre1, m_name+'train_pre1')
# export_data(wrd_test_pre1, m_name+'test_pre1')

# df_pre2
# export_data(wrd_train_pre2, m_name+'train_pre2')
# export_data(wrd_test_pre2, m_name+'test_pre2')

# ========= stemmed and lemma ============ #

# df_pre1_stemmed
# export_data(wrd_train_pre1_stemmed, m_name+'train_pre1_stemmed')
# export_data(wrd_test_pre1_stemmed, m_name+'test_pre1_stemmed')

# df_pre1_lemma
# export_data(wrd_train_pre1_lemma, m_name+'train_pre1_lemma')
# export_data(wrd_test_pre1_lemma, m_name+'test_pre1_lemma')

# df_pre2_stemmed
# export_data(wrd_train_pre2_stemmed, m_name+'train_pre2_stemmed')
# export_data(wrd_test_pre2_stemmed, m_name+'test_pre2_stemmed')

# df_pre2_lemma
# export_data(wrd_train_pre2_lemma, m_name+'train_pre2_lemma')
# export_data(wrd_test_pre2_lemma, m_name+'test_pre2_lemma')

# Ekstrak fitur tfidf dari masing masing dataset

In [8]:
# Build a table of TF-IDF features from a dataset
def conv_vector(dataset):
    """Fit a TF-IDF vectorizer on ``dataset['text']`` and list its vocabulary.

    The vectorizer is fitted on every row, but the ``tfidf`` column holds the
    weights of the FIRST document only (row 0 of the TF-IDF matrix). Terms
    that do not occur in that document therefore show 0.0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary term with columns ``feature`` (the term) and
        ``tfidf`` (its weight in the first document), indexed 0..n-1.
    """
    X_data = dataset['text'].values

    # Keep the text exactly as given: no lower-casing, no stop-word removal,
    # no document-frequency pruning.
    tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, lowercase=False, stop_words=None)
    tfidf_data = tfidf_vectorizer.fit_transform(X_data)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on older releases.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    # Dense 1-D weights of the first document (avoids the legacy np.matrix
    # that .todense() returns).
    first_doc_weights = tfidf_data[0].toarray().ravel()

    return pd.DataFrame({'feature': list(feature_names), 'tfidf': first_doc_weights})

 -- Ekstrak fitur tfidf untuk masing masing train dan test dataset --

In [9]:
def _tfidf_overall_and_per_label(train):
    """Return conv_vector() of the whole frame, its label-1 rows and its label-0 rows."""
    return (
        conv_vector(train),
        conv_vector(train[train.label == 1]),
        conv_vector(train[train.label == 0]),
    )


# tfidf feature from train_ori
tidf_train_ori, tidf_train_ori_dep, tidf_train_ori_nonDep = \
    _tfidf_overall_and_per_label(train_ori)

# tfidf feature from train_pre1
tidf_train_pre1, tidf_train_pre1_dep, tidf_train_pre1_nonDep = \
    _tfidf_overall_and_per_label(train_pre1)

# tfidf feature from train_pre2
tidf_train_pre2, tidf_train_pre2_dep, tidf_train_pre2_nonDep = \
    _tfidf_overall_and_per_label(train_pre2)


#========= stemmed and lemma ============#

# tfidf feature from train_pre1_stemmed
tidf_train_pre1_stemmed, tidf_train_pre1_stemmed_dep, tidf_train_pre1_stemmed_nonDep = \
    _tfidf_overall_and_per_label(train_pre1_stemmed)

# tfidf feature from train_pre1_lemma
tidf_train_pre1_lemma, tidf_train_pre1_lemma_dep, tidf_train_pre1_lemma_nonDep = \
    _tfidf_overall_and_per_label(train_pre1_lemma)

# tfidf feature from train_pre2_stemmed
tidf_train_pre2_stemmed, tidf_train_pre2_stemmed_dep, tidf_train_pre2_stemmed_nonDep = \
    _tfidf_overall_and_per_label(train_pre2_stemmed)

# tfidf feature from train_pre2_lemma
tidf_train_pre2_lemma, tidf_train_pre2_lemma_dep, tidf_train_pre2_lemma_nonDep = \
    _tfidf_overall_and_per_label(train_pre2_lemma)

In [12]:
# Shape of the TF-IDF feature table built from train_pre2:
# (number of vocabulary terms, 2 columns = feature / tfidf).
tidf_train_pre2.shape

(3376, 2)

In [10]:
# Preview five randomly chosen rows of the TF-IDF feature table for train_pre1
tidf_train_pre1.sample(5)

Unnamed: 0,feature,tfidf
1106,fed,0.0
510,chatting,0.0
65,affects,0.0
1305,gpa,0.0
3340,wanting,0.0


# Export all tfidf features to csv file

In [11]:
# Path prefix (folder plus the 'fitur_' file-name stem) for the exported
# TF-IDF feature tables.
m_name = 'C:/Users/ASUS/Documents/Learn Data Science/all_dataset/tfidf_features/fitur_'

# NOTE: every export below is disabled (commented out); uncomment the lines
# for the tables that should be written.

# tfidf feature from train_ori
# export_data(tidf_train_ori, m_name+'ori')
# export_data(tidf_train_ori_dep, m_name+'ori_dep')
# export_data(tidf_train_ori_nonDep, m_name+'ori_nonDep')

# tfidf feature from train_pre1
# export_data(tidf_train_pre1, m_name+'pre1')
# export_data(tidf_train_pre1_dep, m_name+'pre1_dep')
# export_data(tidf_train_pre1_nonDep, m_name+'pre1_nonDep')

# tfidf feature from train_pre2
# export_data(tidf_train_pre2, m_name+'pre2')
# export_data(tidf_train_pre2_dep, m_name+'pre2_dep')
# export_data(tidf_train_pre2_nonDep, m_name+'pre2_nonDep')

# ========= stemmed and lemma ============ #

# tfidf feature from train_pre1_stemmed
# export_data(tidf_train_pre1_stemmed, m_name+'pre1_stemmed')
# export_data(tidf_train_pre1_stemmed_dep, m_name+'pre1_stemmed_dep')
# export_data(tidf_train_pre1_stemmed_nonDep, m_name+'pre1_stemmed_nonDep')

# tfidf feature from train_pre1_lemma
# export_data(tidf_train_pre1_lemma, m_name+'pre1_lemma')
# export_data(tidf_train_pre1_lemma_dep, m_name+'pre1_lemma_dep')
# export_data(tidf_train_pre1_lemma_nonDep, m_name+'pre1_lemma_nonDep')

# tfidf feature from train_pre2_stemmed
# export_data(tidf_train_pre2_stemmed, m_name+'pre2_stemmed')
# export_data(tidf_train_pre2_stemmed_dep, m_name+'pre2_stemmed_dep')
# export_data(tidf_train_pre2_stemmed_nonDep, m_name+'pre2_stemmed_nonDep')

# tfidf feature from train_pre2_lemma
# export_data(tidf_train_pre2_lemma, m_name+'pre2_lemma')
# export_data(tidf_train_pre2_lemma_dep, m_name+'pre2_lemma_dep')
# export_data(tidf_train_pre2_lemma_nonDep, m_name+'pre2_lemma_nonDep')