# Import Libraries

In [1]:
import math
import time
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn

%matplotlib inline

# Text Library
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
# Splitting Data
from sklearn.model_selection import train_test_split

# Import library for the TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import feature selection Libraries
from sklearn.feature_selection import SelectKBest, chi2, f_regression, mutual_info_classif

# Algorithm
from sklearn.naive_bayes import MultinomialNB

# Evaluation

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


-- Run Functions --

In [2]:
# Import dataset
def import_data(folder, fileName):
    """Load ``folder + fileName + '.csv'`` as a DataFrame.

    The first CSV column is used as the DataFrame index.
    """
    csv_path = folder + fileName + '.csv'
    frame = pd.read_csv(csv_path, index_col=0)
    return frame
# Export dataset
def export_data(dataset, fileName):
    """Write ``dataset`` to ``fileName + '.csv'`` and return whatever ``to_csv`` returns."""
    target_path = fileName + '.csv'
    outcome = dataset.to_csv(target_path)
    return outcome

In [3]:
# classify data based on train and test with alpha value
def classify_data(train_data, test_data, alpha_value=0):
    """Fit TF-IDF + Multinomial Naive Bayes on the train split and predict the test split.

    Parameters
    ----------
    train_data, test_data : DataFrame
        Must expose a ``text`` column (documents) and a ``label`` column (targets).
    alpha_value : number, default 0
        Smoothing parameter handed to ``MultinomialNB``. The value ``0`` is a
        sentinel meaning "use the library default" (``MultinomialNB()`` is built
        without an explicit alpha); it does NOT disable smoothing.

    Returns
    -------
    list
        ``[X_test, y_test, predicted_labels, [wall_seconds, cpu_seconds]]`` where
        the timings cover model fitting plus prediction (vectorisation excluded).
    """
    # Separate text and label data for train and test
    X_train = train_data['text'].values
    y_train = train_data['label'].values

    X_test = test_data['text'].values
    y_test = test_data['label'].values

    # The vocabulary is learned from the training texts only; the test texts are
    # projected onto it, so words unseen during training are ignored.
    tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, lowercase=False, stop_words=None)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test = tfidf_vectorizer.transform(X_test)

    # alpha_value == 0 -> library default smoothing, otherwise the requested alpha
    if alpha_value == 0:
        mnb = MultinomialNB()
    else:
        mnb = MultinomialNB(alpha=alpha_value)

    # time.clock() was removed in Python 3.8; process_time() is the CPU-time
    # replacement, while time.time() keeps measuring wall-clock time.
    time_s, clock_s = time.time(), time.process_time()
    mnbTfidf = mnb.fit(tfidf_train, y_train)
    predictTfidf = mnbTfidf.predict(tfidf_test)
    time_run = time.time() - time_s
    clock_run = time.process_time() - clock_s

    # return array containing test data, predicted labels and the timings
    return [X_test, y_test, predictTfidf, [time_run, clock_run]]

In [4]:
# Get right and wrong classified results data
def classified_results(X_test, y_test, predictLabels):
    """Split the test samples into correctly and incorrectly predicted ones.

    Returns ``[dfRight, dfWrong]``; both frames have a ``text`` column and a
    ``label`` column holding the true label of each sample.
    """
    hits = {'text': [], 'label': []}
    misses = {'text': [], 'label': []}

    # Walk over the predictions and route each sample to the matching bucket
    for position, predicted in enumerate(predictLabels):
        actual = y_test[position]
        bucket = misses if predicted != actual else hits
        bucket['text'].append(X_test[position])
        bucket['label'].append(actual)

    # Correctly classified frame first, misclassified frame second
    return [pd.DataFrame(hits), pd.DataFrame(misses)]

# Compute accuracy, F1 score and recall of the predicted labels
def find_evaluation(y_test, predictTfidf):
    """Return ``[accuracy, f1, recall]`` for the predictions against ``y_test``.

    The scikit-learn scorers are called with their default arguments.
    """
    scorers = (accuracy_score, f1_score, recall_score)
    return [scorer(y_test, predictTfidf) for scorer in scorers]

# Import Semua Data

In [5]:
# Root directory that holds every train/test split
folder = 'C:/Users/ASUS/Documents/Learn Data Science/train_test/'

# df_pre1 split
train_pre1, test_pre1 = (
    import_data(folder, 'pre1/train_pre1'),
    import_data(folder, 'pre1/test_pre1'),
)

# df_pre2 split
train_pre2, test_pre2 = (
    import_data(folder, 'pre2/train_pre2'),
    import_data(folder, 'pre2/test_pre2'),
)

# === Stemmed and lemmatised variants ===

# df_pre1_stemmed split
train_pre1_stemmed, test_pre1_stemmed = (
    import_data(folder, 'pre1_stemmed/train_pre1_stemmed'),
    import_data(folder, 'pre1_stemmed/test_pre1_stemmed'),
)

# df_pre1_lemma split
train_pre1_lemma, test_pre1_lemma = (
    import_data(folder, 'pre1_lemma/train_pre1_lemma'),
    import_data(folder, 'pre1_lemma/test_pre1_lemma'),
)

# df_pre2_stemmed split
train_pre2_stemmed, test_pre2_stemmed = (
    import_data(folder, 'pre2_stemmed/train_pre2_stemmed'),
    import_data(folder, 'pre2_stemmed/test_pre2_stemmed'),
)

# df_pre2_lemma split
train_pre2_lemma, test_pre2_lemma = (
    import_data(folder, 'pre2_lemma/train_pre2_lemma'),
    import_data(folder, 'pre2_lemma/test_pre2_lemma'),
)

# Classify each dataset and Find Accuracy

<h4>Variable info:</h4><br>
1. predList_ = Get predicted data<br>
2. eval_ = Get evaluation scores [Accuracy, F1 Score, Recall] of data

train_test_list = a list of lists that contain train and test data<br>
train-> index 0; test-> index 1;<br>

In [6]:
# rs_pre1 = classified_results(predList_pre1[0], predList_pre1[1], predList_pre1[2])
# rs_pre2 = classified_results(predList_pre2[0], predList_pre2[1], predList_pre2[2])
# rs_pre1_stemmed = classified_results(predList_pre1_stemmed[0], predList_pre1_stemmed[1], predList_pre1_stemmed[2])
# rs_pre1_lemma = classified_results(predList_pre1_lemma[0], predList_pre1_lemma[1], predList_pre1_lemma[2])
# rs_pre2_stemmed = classified_results(predList_pre2_stemmed[0], predList_pre2_stemmed[1], predList_pre2_stemmed[2])
# rs_pre2_lemma = classified_results(predList_pre2_lemma[0], predList_pre2_lemma[1], predList_pre2_lemma[2])

In [7]:
# Holders for the per-dataset percentage scores (overwritten at the end of this cell)
acc_list, f1score_list, recall_list = [], [], []

# Baseline runs: alpha_value=0 makes classify_data build MultinomialNB() without an explicit alpha.
# Every predList_* is [X_test, y_test, predictions, timings]; items 1 and 2 feed find_evaluation.

# df_pre1
predList_pre1 = classify_data(train_pre1, test_pre1, alpha_value=0)
eval_pre1 = find_evaluation(*predList_pre1[1:3])

# df_pre2
predList_pre2 = classify_data(train_pre2, test_pre2, alpha_value=0)
eval_pre2 = find_evaluation(*predList_pre2[1:3])

# ==== Stemmed and lemmatised variants ====
# df_pre1_stemmed
predList_pre1_stemmed = classify_data(train_pre1_stemmed, test_pre1_stemmed, alpha_value=0)
eval_pre1_stemmed = find_evaluation(*predList_pre1_stemmed[1:3])

# df_pre1_lemma
predList_pre1_lemma = classify_data(train_pre1_lemma, test_pre1_lemma, alpha_value=0)
eval_pre1_lemma = find_evaluation(*predList_pre1_lemma[1:3])

# df_pre2_stemmed
predList_pre2_stemmed = classify_data(train_pre2_stemmed, test_pre2_stemmed, alpha_value=0)
eval_pre2_stemmed = find_evaluation(*predList_pre2_stemmed[1:3])

# df_pre2_lemma
predList_pre2_lemma = classify_data(train_pre2_lemma, test_pre2_lemma, alpha_value=0)
eval_pre2_lemma = find_evaluation(*predList_pre2_lemma[1:3])

# Collect the evaluations in the same order as the dataset names printed later
all_eval = [
    eval_pre1,
    eval_pre2,
    eval_pre1_stemmed,
    eval_pre1_lemma,
    eval_pre2_stemmed,
    eval_pre2_lemma,
]

# Slot 0 = accuracy, slot 1 = F1 score, slot 2 = recall; each expressed as a percentage
acc_list, f1score_list, recall_list = (
    [round(scores[slot] * 100, 2) for scores in all_eval]
    for slot in range(3)
)

In [8]:
# Output accuracy, F1 score and recall for each dataset.
# nameList follows the same order as all_eval, so index i refers to the same dataset in every list.
nameList = ['_pre1', '_pre2', '_pre1_stemmed', '_pre1_lemma', '_pre2_stemmed', '_pre2_lemma']
print("Model Accuracy from each datasets\n")
for i in range (len(all_eval)):
    print('Accuracy'+nameList[i],':',acc_list[i])
print("\n")
print("Model F1_Score from each datasets\n")
for i in range (len(all_eval)):
    # These rows show F1 values, so they are labelled F1_Score (they used to be labelled Recall)
    print('F1_Score'+nameList[i],':',f1score_list[i])
print("\n")
print("Model Recall from each datasets\n")
for i in range (len(all_eval)):
    print('Recall'+nameList[i],':',recall_list[i])

Model Accuracy from each datasets

Accuracy_pre1 : 73.1
Accuracy_pre2 : 82.23
Accuracy_pre1_stemmed : 76.65
Accuracy_pre1_lemma : 74.11
Accuracy_pre2_stemmed : 84.26
Accuracy_pre2_lemma : 82.74


Model F1_Score from each datasets

Recall_pre1 : 80.0
Recall_pre2 : 85.83
Recall_pre1_stemmed : 82.17
Recall_pre1_lemma : 80.61
Recall_pre2_stemmed : 87.14
Recall_pre2_lemma : 86.07


Model Recall from each datasets

Recall_pre1 : 100.0
Recall_pre2 : 100.0
Recall_pre1_stemmed : 100.0
Recall_pre1_lemma : 100.0
Recall_pre2_stemmed : 99.06
Recall_pre2_lemma : 99.06


In [57]:
# Export classified results into csv file
# rs_lists = [
#     rs_pre1, rs_pre2, 
#     rs_pre1_stemmed, rs_pre1_lemma,
#     rs_pre2_stemmed, rs_pre2_lemma,
# ]

# wc_name= 'wc_'
# for i in range(len(rs_lists)):
#     export_data(rs_lists[i][1], wc_name+listNama[i])

# Find accuracy by tuning smoothing parameter

In [59]:
# Candidate smoothing values for the alpha sweep: powers of ten from 1e-4 to 1e4.
# Each value is later passed to classify_data as alpha_value.
alpha_arr = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

-- Get accuracy and wrong classified of each dataset -- 

In [62]:
# Per-alpha result holders for df_pre2: accuracy, F1 score, recall and the misclassified rows
acc_arr_pre2, f1score_arr_pre2, recall_arr_pre2, wr_classified_pre2 = [], [], [], []

# Sweep every candidate alpha; position k in each holder corresponds to alpha_arr[k].
# Note that predList_pre2 / eval_pre2 are rebound on every iteration.
for item in alpha_arr:
    predList_pre2 = classify_data(train_pre2, test_pre2, item)
    rs_pre2 = classified_results(*predList_pre2[:3])
    eval_pre2 = find_evaluation(*predList_pre2[1:3])

    # eval_pre2 is [accuracy, f1, recall]; rs_pre2[1] is the misclassified frame
    acc_arr_pre2.append(eval_pre2[0])
    f1score_arr_pre2.append(eval_pre2[1])
    recall_arr_pre2.append(eval_pre2[2])
    wr_classified_pre2.append(rs_pre2[1])

In [64]:
# Show every metric of the alpha sweep as a percentage rounded to two decimals
print("Accuracy pada data dengan tuning parameter alpha:", list(map(lambda score: round(score*100, 2), acc_arr_pre2)))
print("F1 Score pada data dengan tuning parameter alpha:", list(map(lambda score: round(score*100, 2), f1score_arr_pre2)))
print("Recall pada data dengan tuning parameter alpha:", list(map(lambda score: round(score*100, 2), recall_arr_pre2)))

Accuracy pada data dengan tuning parameter alpha: [80.2, 81.73, 82.74, 84.77, 82.23, 73.1, 53.81, 53.81, 53.81]
F1 Score pada data dengan tuning parameter alpha: [84.46, 85.48, 86.07, 87.39, 85.83, 80.0, 69.97, 69.97, 69.97]
Recall pada data dengan tuning parameter alpha: [100.0, 100.0, 99.06, 98.11, 100.0, 100.0, 100.0, 100.0, 100.0]


-- Output lists accuracy of df_pre1 & df_pre2 with respective alpha --

In [14]:
# Position of the (first) highest accuracy in the sweep; it doubles as an index into alpha_arr
max_idx_pre2 = max(range(len(acc_arr_pre2)), key=acc_arr_pre2.__getitem__)

# Report the alpha that produced that accuracy
print("Alpha dengan akurasi maksimum :", alpha_arr[max_idx_pre2])

Alpha dengan akurasi maksimum : 0.1


-- Show how many wrong classified data from each alpha value --

In [15]:
# Number of misclassified df_pre2 samples for every alpha of the sweep
print("\nBanyak masing masing wrong_classified pada pre2:")
for i in range(len(wr_classified_pre2)):
    print("Alpha {} : {}".format(alpha_arr[i], len(wr_classified_pre2[i])))


Banyak masing masing wrong_classified pada pre2:
Alpha 0.0001 : 39
Alpha 0.001 : 36
Alpha 0.01 : 34
Alpha 0.1 : 30
Alpha 1 : 35
Alpha 10 : 53
Alpha 100 : 91
Alpha 1000 : 91
Alpha 10000 : 91


In [16]:
# Export wrong classified data from each dataset into csv file
# for i in range(len(wr_classified_pre1)):
#     export_data(wr_classified_pre1[i], 'wc_pre1_'+str(i))
#     export_data(wr_classified_pre2[i], 'wc_pre2_'+str(i))

# Analisis Alpha dan Misclassified

-- Run Functions --

In [17]:
# Extract words feature from dataset
def ext_feature(dataset):
    """Count how often each whitespace-separated word occurs in ``dataset.text``.

    Parameters
    ----------
    dataset : DataFrame
        Must expose a ``text`` column whose values support ``.split()``.

    Returns
    -------
    DataFrame
        Columns ``word`` and ``freq``, one row per distinct word, sorted by
        ``word`` with a fresh 0..n-1 index. An empty input yields an empty
        frame that still has both columns.
    """
    # Counting directly avoids relying on the column name that value_counts()
    # gives its result: pandas >= 2.0 names it 'count' instead of 0, so the
    # former rename(columns={0: 'freq'}) never produced a 'freq' column there.
    word_counts = Counter(
        word
        for text in dataset.text
        for word in text.split()
    )
    # Words are unique keys, so sorting the (word, count) pairs orders by word
    return pd.DataFrame(sorted(word_counts.items()), columns=['word', 'freq'])

# Create dataframe containing words from each data labels without duplicate words
def get_all_w(dep_word, nonDep_word):
    """Return a one-column frame (``word``) with the distinct words of both inputs, sorted.

    Both arguments must expose a ``word`` column; the result has a fresh 0..n-1 index.
    """
    vocabulary = set(dep_word.word).union(nonDep_word.word)
    unsorted_frame = pd.DataFrame(list(vocabulary), columns=['word'])
    return unsorted_frame.sort_values(by='word').reset_index(drop=True)

# Create dataframe containing words feature with frequency on each labels
def ext_final_ft(dataset):
    """Build the vocabulary of ``dataset`` with per-label word frequencies.

    Rows with ``label == 1`` feed ``freq_dep`` and rows with ``label == 0`` feed
    ``freq_nonDep``; a word missing from one label gets 0 in that column.
    Returns a frame with columns ``word``, ``freq_dep`` and ``freq_nonDep``.
    """
    data_dep = ext_feature(dataset[dataset.label == 1])
    data_nonDep = ext_feature(dataset[dataset.label == 0])

    all_words = get_all_w(data_dep, data_nonDep)

    # Look each vocabulary word up in the per-label counts; absent words count as 0
    for column, per_label in (('freq_dep', data_dep), ('freq_nonDep', data_nonDep)):
        lookup = dict(zip(per_label.word, per_label.freq))
        all_words[column] = all_words.word.map(lookup).fillna(0).astype('int64')

    return all_words

-- Show info of wrong classified data --

<h4>Variable info:</h4><br>
1. wr_to_right: wrong classified data into right classified after change of alpha<br>
2. still_wrong: wrong classified data still in wrong classified after change of alpha<br>
3. new_wrong: newly wrong classified data after change of alpha

In [18]:
# Compare the misclassified df_pre2 samples of alpha=1 against those of alpha=0.1.
# The positions are looked up in alpha_arr so the labels below cannot drift from the data.
idx_alpha_1 = alpha_arr.index(1)
idx_alpha_01 = alpha_arr.index(0.1)

print("========= Keterangan pada df_pre2 =========")
print("Banyak wr_classified dengan alpha=1:", len(wr_classified_pre2[idx_alpha_1]))
print("Banyak wr_classified dengan alpha=0.1:", len(wr_classified_pre2[idx_alpha_01]))

# Materialise the texts once; the sets give constant-time membership tests instead of
# rebuilding and scanning a list for every element of the comprehensions.
wrong_texts_alpha_1 = list(wr_classified_pre2[idx_alpha_1].text)
wrong_texts_alpha_01 = list(wr_classified_pre2[idx_alpha_01].text)
wrong_set_alpha_1 = set(wrong_texts_alpha_1)
wrong_set_alpha_01 = set(wrong_texts_alpha_01)

# wrong at alpha=1 but no longer wrong at alpha=0.1
wr_to_right_pre2 = [x for x in wrong_texts_alpha_1 if x not in wrong_set_alpha_01]
# wrong under both alphas
still_wrong_pre2 = [x for x in wrong_texts_alpha_1 if x in wrong_set_alpha_01]
# wrong only at alpha=0.1
new_wrong_pre2 = [x for x in wrong_texts_alpha_01 if x not in wrong_set_alpha_1]

print("\nBanyak data yang menjadi benar:", len(wr_to_right_pre2))
print("Banyak data yang tetap salah:", len(still_wrong_pre2))
print("Banyak data salah yang baru muncul:", len(new_wrong_pre2))

Banyak wr_classified dengan alpha=1: 35
Banyak wr_classified dengan alpha=0.1: 30

Banyak data yang menjadi benar: 11
Banyak data yang tetap salah: 24
Banyak data salah yang baru muncul: 6


In [19]:
# Dataset location (presumably the exported TF-IDF feature tables, judging by the path).
# NOTE: this rebinds `folder`, which earlier pointed at the train/test directory, so
# re-running the dataset-loading cell after this one would read from this path instead.
folder = 'D:/College Stuff/Implementasi TA/new_dataset/tfidf_features/'

# Import the feature table of df_pre2 (only pre2 is loaded here)
fitur_pre2 = import_data(folder, 'pre2/fitur_pre2')

-- Get out-of-vocab (oov) from each test data --

In [20]:
# Extract the vocabulary (with per-label frequencies) of the pre2 test data
voc_test_pre2 = ext_final_ft(test_pre2)

# Out-of-vocab words: test words that are not listed in fitur_pre2.feature.
# The feature column is turned into a set once instead of being rebuilt as a
# list and scanned linearly for every test word.
known_features_pre2 = set(fitur_pre2.feature)
oov_pre2 = [x for x in list(voc_test_pre2.word) if x not in known_features_pre2]

# Show info of oov from the test dataset
# df_pre2-info
print("\nBanyak out-of-vocab dari data test_pre2:", len(oov_pre2))
# The printed value is a fraction in [0, 1], not a value scaled to 100
print("Percentage of oov of the overall vocab:", len(oov_pre2) / len(voc_test_pre2))


Banyak out-of-vocab dari data test_pre2: 815
Percentage of oov of the overall vocab: 0.38048552754435105


In [21]:
# Create a dataframe of the out-of-vocab words with their per-label frequencies.
# oov_pre2 is rebound here from a list of words to a DataFrame; accepting both forms
# keeps the cell idempotent, so running it twice gives the same frame instead of
# calling isin() with a DataFrame.
oov_words_pre2 = oov_pre2.word if isinstance(oov_pre2, pd.DataFrame) else oov_pre2
oov_pre2 = voc_test_pre2[voc_test_pre2.word.isin(oov_words_pre2)].reset_index(drop=True)
oov_pre2.head(3)

Unnamed: 0,word,freq_dep,freq_nonDep
0,abilities,0,1
1,abortion,1,0
2,abt,0,1


Print oov info for each label of test data

In [22]:
# Count the out-of-vocab words that occur at least once in each label's test data
print("\nBanyak oov_pre2 yang muncul pada data depresi",len(oov_pre2.loc[oov_pre2.freq_dep.gt(0)]))
print("Banyak oov_pre2 yang muncul pada data non-depresi",len(oov_pre2.loc[oov_pre2.freq_nonDep.gt(0)]))


Banyak oov_pre2 yang muncul pada data depresi 524
Banyak oov_pre2 yang muncul pada data non-depresi 312


# Check the wrong_classification before change of alpha that contains oov

In [22]:
# Function to extract oov from dataset
def extract_oov(dataset, oov):
    """Annotate every row of ``dataset`` with the out-of-vocab words found in its text.

    Parameters
    ----------
    dataset : DataFrame
        Must expose a ``text`` column; each value is converted with ``str()``
        and split on whitespace.
    oov : DataFrame
        Must expose a ``word`` column listing the out-of-vocab words.

    Returns
    -------
    DataFrame
        The SAME ``dataset`` object, modified in place with two added (or
        overwritten) columns: ``oov`` (list of matching tokens, in text order,
        duplicates kept) and ``oov_count`` (length of that list).
    """
    # Build the lookup once: a set gives constant-time membership, whereas the
    # previous list was rebuilt for every row and scanned for every token.
    oov_words = set(oov.word)

    tok_dataset = dataset.text.apply(lambda x: str(x).split())
    oov_dataset = tok_dataset.apply(lambda tokens: [w for w in tokens if w in oov_words])

    dataset['oov'] = list(oov_dataset)
    dataset['oov_count'] = dataset.oov.apply(len)
    return dataset

-- Show info of how many wrong classified that contains oov -- 

In [23]:
# Annotate the misclassified frames of both alphas with their out-of-vocab words
wr_classified_pre2[4] = extract_oov(wr_classified_pre2[4], oov_pre2)  # alpha=1
wr_classified_pre2[3] = extract_oov(wr_classified_pre2[3], oov_pre2)  # alpha = 0.1

# How many misclassified samples at alpha=1 contain at least one out-of-vocab word
print(
    "banyak data yg terklasifikasi salah pada pre2 dengan oov:",
    len(wr_classified_pre2[4].loc[wr_classified_pre2[4].oov_count.gt(0)]),
    'dari',
    len(wr_classified_pre2[4]),
)

banyak data yg terklasifikasi salah pada pre2 dengan oov: 30 dari 35


In [24]:
# Annotate the misclassified frames of both alphas with their out-of-vocab words
# (extract_oov overwrites the 'oov' / 'oov_count' columns, so repeating it is harmless)
wr_classified_pre2[4] = extract_oov(wr_classified_pre2[4], oov_pre2) # alpha=1
wr_classified_pre2[3] = extract_oov(wr_classified_pre2[3], oov_pre2) # alpha = 0.1

# How many misclassified samples at alpha=0.1 contain at least one out-of-vocab word.
# The data is df_pre2 at alpha=0.1, so the label says pre2 (it used to say pre1).
print("banyak data yg terklasifikasi salah pada pre2 (alpha=0.1) dengan oov:", \
      len(wr_classified_pre2[3][wr_classified_pre2[3].oov_count > 0]), 'dari', len(wr_classified_pre2[3]))

banyak data yg terklasifikasi salah pada pre1 dengan oov: 27 dari 30
