In [1]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sqlite3 import Error
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
import os
import pickle

In [2]:
data_path = '../../dataset'

labels = []
tokens = []

titles = []

for i, f in enumerate(os.listdir(data_path)):
    full_path = f'{ data_path }/{ f }'
    titles.append(f.split('.')[0])
    
    with open(full_path, 'r', encoding='utf8', errors='ignore') as infile:
        for line in infile:
            tokens.append(line.strip())
            labels.append(i)

df = pd.DataFrame(list(zip(tokens, labels)), columns=['sent', 'label'])

In [3]:
df.sample(5)

Unnamed: 0,sent,label
974,Web developers assess the needs of users for i...,3
2158,"intelligent, motivated critical thinker, any a...",7
1647,protect a natural environment and raise awaren...,5
1372,IP or ingress protection,4
1104,Freelance developers enjoy higher wages and mo...,3


In [4]:
titles

['biotechnology',
 'business_economics',
 'chemical_engineering',
 'computer_science',
 'electrical_engineering',
 'geography',
 'hospitality_&_tourism',
 'law_school',
 'medical_school',
 'visual_design']

In [5]:
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

df['cleaned'] = df['sent'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in stop_words]).lower())

In [6]:
df.sample(5)

Unnamed: 0,sent,label,cleaned
1213,"In cases of this description, PFC systems that...",4,in case descript pfc system retrofit detun oft...
2689,"At its most basic level, visual communication ...",9,at basic level visual commun design formerli g...
1384,"Also referred to as load flow, power flow is t...",4,also refer load flow power flow analysi appar ...
2899,"Photography & artwork: resolution, DPI, PPI, b...",9,photographi artwork resolut dpi ppi bleed trim...
1175,"Even harmonics (2nd, 4th, 6th, etc.) as a rule...",4,even harmon nd th th etc rule occur due sudden...


In [7]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(df['cleaned']).toarray()
final_features.shape

(2934, 2678)

In [8]:
x = df['cleaned']
y = df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [21]:
v = 2
cl = 'logistic'

pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=1700)),
                     ('clf', LogisticRegression())])

model = pipeline.fit(x_train, y_train)
with open(f'{ cl }_model_v{ v }_c{ len(titles) }_e{ int(len(labels)/len(titles))}.pickle', 'wb') as f:
    pickle.dump(model, f)

ytest = np.array(y_test)

In [22]:
print(classification_report(y_test, model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.72      0.80      0.75        54
           1       0.86      0.85      0.86        74
           2       0.95      0.89      0.92        70
           3       0.95      0.81      0.87        90
           4       0.92      0.95      0.94        62
           5       0.77      0.93      0.84        82
           6       0.84      0.90      0.87        71
           7       0.93      0.82      0.87        66
           8       0.79      0.75      0.77        81
           9       0.89      0.90      0.90        84

    accuracy                           0.86       734
   macro avg       0.86      0.86      0.86       734
weighted avg       0.87      0.86      0.86       734



In [23]:
confusion_matrix(y_test, model.predict(x_test))


array([[43,  0,  2,  0,  0,  1,  0,  1,  5,  2],
       [ 2, 63,  0,  0,  0,  3,  4,  1,  0,  1],
       [ 2,  0, 62,  0,  1,  1,  0,  0,  3,  1],
       [ 3,  2,  0, 73,  2,  7,  1,  1,  0,  1],
       [ 0,  0,  1,  1, 59,  1,  0,  0,  0,  0],
       [ 0,  2,  0,  1,  1, 76,  0,  0,  0,  2],
       [ 0,  2,  0,  1,  1,  0, 64,  0,  2,  1],
       [ 0,  3,  0,  0,  0,  1,  3, 54,  4,  1],
       [ 9,  1,  0,  1,  0,  5,  3,  1, 61,  0],
       [ 1,  0,  0,  0,  0,  4,  1,  0,  2, 76]], dtype=int64)

In [24]:
s = "president jokowi is my idol. I like to learn about government, politician, and law"

In [25]:
data = [s]
prep_data = []

for d in data:
    d = [stemmer.stem(dt).lower() for dt in d.split(' ')]
    d = [dt for dt in d if dt not in stop_words]
    prep_data.append(' '.join(d))

In [26]:
prep_data

['presid putin idol. like learn government, politician, law']

In [27]:
res = model.predict(prep_data)
lab_res = [titles[r] for r in res]

for r in res:
    print(titles[r])

law_school


In [28]:
model['clf'].class_log_prior_

AttributeError: 'LogisticRegression' object has no attribute 'class_log_prior_'

In [62]:
model['clf'].n_features_

1700

In [17]:
res = model.predict_proba(prep_data)

print(res.round(2))

[[0.04 0.08 0.03 0.1  0.05 0.08 0.05 0.44 0.08 0.06]]
