In [38]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sqlite3 import Error
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
import os
import pickle

In [5]:
data_path = '../../dataset'

labels = []
tokens = []

titles = []

for i, f in enumerate(os.listdir(data_path)):
    full_path = f'{ data_path }/{ f }'
    titles.append(f.split('.')[0])
    
    with open(full_path, 'r', encoding='utf8', errors='ignore') as infile:
        for line in infile:
            tokens.append(line.strip())
            labels.append(i)

df = pd.DataFrame(list(zip(tokens, labels)), columns=['sent', 'label'])

In [6]:
df.sample(5)

Unnamed: 0,sent,label
1185,Geographers examine qualitative and quantitati...,5
405,Every day we hear about economically related p...,1
944,The core is manufactured from a lamination of ...,4
1492,he domestic marine tourism market offers signi...,6
1383,Travel agents provide consulting services to c...,6


In [7]:
titles

['biotechnology',
 'business_economics',
 'chemical_engineering',
 'computer_science',
 'electrical_engineering',
 'geography',
 'hospitality_&_tourism',
 'law_school',
 'medical_school',
 'visual_design']

In [8]:
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

df['cleaned'] = df['sent'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in stop_words]).lower())

In [9]:
df.sample(5)

Unnamed: 0,sent,label,cleaned
1709,"When people think of lawyers, many envision so...",7,when peopl think lawyer mani envis someon argu...
1783,Attorneys have stood at the center of society ...,7,attorney stood center societi centuri
860,"The sine component of the frequency f, known a...",4,the sine compon frequenc f known fundament com...
1357,The study of hospitality management covers imp...,6,the studi hospit manag cover import knowledg s...
1020,"High currents will cause lines to sag, reducin...",4,high current caus line sag reduc ground cleara...


In [10]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(df['cleaned']).toarray()
final_features.shape

(2303, 2241)

In [11]:
x = df['cleaned']
y = df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [52]:
v = 1
cl = 'mnb'

pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=1700)),
                     ('clf', MultinomialNB())])

model = pipeline.fit(x_train, y_train)
with open(f'{ cl }_model_v{ v }_c{ len(titles) }_e{ int(len(labels)/len(titles))}.pickle', 'wb') as f:
    pickle.dump(model, f)

ytest = np.array(y_test)

In [49]:
print(classification_report(y_test, model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83        58
           1       1.00      0.82      0.90        49
           2       0.98      0.81      0.89        53
           3       0.91      0.89      0.90        47
           4       0.90      0.92      0.91        71
           5       0.79      0.94      0.86        52
           6       0.87      0.92      0.89        64
           7       0.93      0.90      0.91        70
           8       0.80      0.93      0.86        60
           9       0.94      0.87      0.90        52

    accuracy                           0.89       576
   macro avg       0.89      0.88      0.89       576
weighted avg       0.89      0.89      0.89       576



In [50]:
confusion_matrix(y_test, model.predict(x_test))


array([[48,  0,  0,  2,  3,  0,  1,  1,  3,  0],
       [ 1, 40,  0,  0,  0,  0,  2,  4,  2,  0],
       [ 2,  0, 43,  0,  1,  4,  1,  0,  2,  0],
       [ 0,  0,  0, 42,  2,  0,  1,  0,  1,  1],
       [ 1,  0,  0,  0, 65,  2,  0,  0,  2,  1],
       [ 1,  0,  0,  1,  0, 49,  1,  0,  0,  0],
       [ 2,  0,  0,  0,  0,  1, 59,  0,  2,  0],
       [ 0,  0,  0,  0,  1,  3,  1, 63,  1,  1],
       [ 0,  0,  0,  0,  0,  2,  2,  0, 56,  0],
       [ 3,  0,  1,  1,  0,  1,  0,  0,  1, 45]], dtype=int64)

In [42]:
s = "president putin is my idol. I like to learn about government, politician, and law"

In [43]:
data = [s]
prep_data = []

for d in data:
    d = [stemmer.stem(dt).lower() for dt in d.split(' ')]
    d = [dt for dt in d if dt not in stop_words]
    prep_data.append(' '.join(d))

In [44]:
prep_data

['presid putin is my idol. i like to learn about government, politician, and law']

In [55]:
res = model.predict(prep_data)
lab_res = [titles[r] for r in res]

for r in res:
    print(titles[r])

law_school


In [58]:
model['clf'].class_log_prior_

array([-2.19664571, -2.43686124, -2.46370849, -2.42370316, -2.26675527,
       -2.16587405, -2.2283944 , -2.26118423, -2.19664571, -2.45019477])

In [62]:
model['clf'].n_features_

1700

In [63]:
res = model.predict_proba(prep_data)

print(res.round(2))

[[0.05 0.07 0.02 0.1  0.05 0.09 0.04 0.46 0.08 0.05]]
