In [31]:
import numpy as np
import pandas as pd
import sklearn 
import nltk
from nltk.stem import RSLPStemmer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score

In [32]:
def Stemming(sentence, stemmer=None):
    """Lower-case and stem every word of an iterable of words.

    Parameters
    ----------
    sentence : iterable of str
        The words to stem, one string per word.
    stemmer : object with a ``stem(str) -> str`` method, optional
        Stemmer to use. When omitted a fresh ``RSLPStemmer`` is created,
        which is what this function always did before. Pass an existing
        instance to avoid rebuilding the stemmer when calling this in a loop.

    Returns
    -------
    numpy.ndarray
        1-D string array holding the stem of each word, in input order.
        An empty input yields an empty string array.
    """
    if stemmer is None:
        stemmer = RSLPStemmer()
    phrase = [stemmer.stem(word.lower()) for word in sentence]
    # dtype=str keeps the result a string array even when `phrase` is empty
    # (np.array([]) alone would silently produce a float64 array).
    return np.array(phrase, dtype=str)

In [33]:
# Pick which store's classification table to load.
storeList = ["magazineluiza", "ricardoeletro"]
store = storeList[0]
dataset = pd.read_csv(f'data/{store}/clf.csv')

# Every column except the last is used as a feature; the last column is the label.
# The builtin `int` replaces the `np.int` alias, which NumPy deprecated in 1.20
# and removed in 1.24 (where `np.int` raises AttributeError).
X = dataset.values[:, :-1].astype(int)
y = dataset.values[:, -1].astype(int)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)
dataset.head()

Unnamed: 0,aaa,acelerometro,acesso,acessorios,acionador,action,active,adf,agt,agua,...,xtrax,zakl,zbkl,zckl,zdkl,zeblaze,zekl,zenfone,zoom,y
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0


In [34]:
# Baseline: gradient boosting trained on the current X_train / y_train split.
# random_state is pinned (same seed as the train/test split) so that re-running
# the notebook reproduces the same model and the scores stay comparable.
clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [35]:
# Score the held-out predictions: F1, accuracy, then the confusion matrix.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Acc: {acc}, F1: {f1}")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

Acc: 0.9642857142857143, F1: 0.9659090909090909
[[77  3]
 [ 3 85]]


## Stemming

In [36]:
# Stem every feature column name (the last column, the label, is excluded)
# and keep the distinct stems; np.unique returns them sorted.
new_col = np.unique(Stemming(dataset.columns[:-1]))

# One zero-filled column per distinct stem, one row per sample in X.
n_rows, n_stems = X.shape[0], new_col.shape[0]
new_x = np.zeros((n_rows, n_stems))
print(X.shape, new_x.shape)

(838, 737) (838, 686)


In [37]:
# Merge the original feature columns into one column per stem.

# Stem all original column names in a single call instead of once per column.
stemmed_cols = Stemming(dataset.columns[:-1])

# Position of each distinct stem inside new_col, for O(1) lookup.
stem_position = {stem: idx for idx, stem in enumerate(new_col)}

# Start from zeros on every run: the accumulation below uses `+=`, so without
# this reset re-executing the cell would add every column a second time.
new_x = np.zeros((X.shape[0], new_col.shape[0]))

for old_idx, st_word in enumerate(stemmed_cols):
    new_idx = stem_position[st_word]
    # Columns whose names share a stem are summed into the same new column.
    new_x[:, new_idx] += X[:, old_idx]

In [38]:
# Re-split using the stem-merged feature matrix, with the same test_size and
# random_state as the earlier split on X.
X_train, X_test, y_train, y_test = train_test_split(
    new_x,
    y,
    test_size=0.20,
    random_state=42,
)

In [39]:
# Same model configuration as before, now trained on the current
# X_train / y_train split. random_state is pinned (same seed as the
# train/test split) so that re-running the notebook reproduces the same
# model and the scores stay comparable.
clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [40]:
# Score the held-out predictions: F1, accuracy, then the confusion matrix.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Acc: {acc}, F1: {f1}")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

Acc: 0.9583333333333334, F1: 0.96045197740113
[[76  4]
 [ 3 85]]
