In [1]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sqlite3 import Error
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import os
import pickle

In [2]:
data_path = '../../dataset'

labels = []
tokens = []

titles = []

for i, f in enumerate(os.listdir(data_path)):
    full_path = f'{ data_path }/{ f }'
    titles.append(f.split('.')[0])
    
    with open(full_path, 'r', encoding='utf8', errors='ignore') as infile:
        for line in infile:
            tokens.append(line.strip())
            labels.append(i)

df = pd.DataFrame(list(zip(tokens, labels)), columns=['sent', 'label'])

In [3]:
df.sample(5)

Unnamed: 0,sent,label
1574,Did you know that the travel and tourism indus...,6
628,The formula weight of an ionic compound is cal...,2
237,microorganism out there that can do the job. W...,0
39,New Tissue Clearing Methods Offer a Window int...,0
346,With gold moving sideways and cryptocurrencies...,1


In [4]:
titles

['biotechnology',
 'business_economics',
 'chemical_engineering',
 'computer_science',
 'electrical_engineering',
 'geography',
 'hospitality_&_tourism',
 'law_school',
 'medical_school',
 'visual_design']

In [5]:
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

df['cleaned'] = df['sent'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in stop_words]).lower())

In [6]:
df.sample(5)

Unnamed: 0,sent,label,cleaned
405,Every day we hear about economically related p...,1,everi day hear econom relat problem global scale
96,so antibody labeling will take two days longer...,0,antibodi label take two day longer whole mous ...
1001,The sooner you realize that harmonics problems...,4,the sooner realiz harmon problem rise better
635,A chemical equation describes what happens in ...,2,a chemic equat describ happen chemic reaction
269,Promotions and advertising managers are respon...,1,promot advertis manag respons implement market...


In [7]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(df['cleaned']).toarray()
final_features.shape

(2504, 2413)

In [8]:
x = df['cleaned']
y = df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [18]:
v = 6
cl = 'mnb'

pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=1700)),
                     ('clf', MultinomialNB())])

model = pipeline.fit(x_train, y_train)
with open(f'{ cl }_model_v{ v }_c{ len(titles) }_e{ int(len(labels)/len(titles))}.pickle', 'wb') as f:
    pickle.dump(model, f)

ytest = np.array(y_test)

In [19]:
print(classification_report(y_test, model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.93      0.86      0.90        65
           1       0.94      0.90      0.92        67
           2       0.86      0.86      0.86        59
           3       0.86      0.90      0.88        60
           4       0.86      0.91      0.89        56
           5       0.86      0.90      0.88        60
           6       0.85      0.85      0.85        65
           7       0.90      0.87      0.89        63
           8       0.92      0.93      0.93        60
           9       0.92      0.92      0.92        71

    accuracy                           0.89       626
   macro avg       0.89      0.89      0.89       626
weighted avg       0.89      0.89      0.89       626



In [20]:
confusion_matrix(y_test, model.predict(x_test))


array([[56,  1,  3,  2,  0,  1,  0,  1,  1,  0],
       [ 0, 60,  0,  1,  1,  1,  4,  0,  0,  0],
       [ 1,  0, 51,  1,  3,  2,  0,  0,  0,  1],
       [ 0,  1,  1, 54,  2,  0,  1,  1,  0,  0],
       [ 1,  0,  1,  1, 51,  0,  0,  0,  0,  2],
       [ 1,  1,  1,  0,  0, 54,  1,  0,  0,  2],
       [ 0,  0,  1,  2,  2,  1, 55,  2,  1,  1],
       [ 0,  0,  0,  1,  0,  4,  1, 55,  2,  0],
       [ 0,  0,  1,  0,  0,  0,  3,  0, 56,  0],
       [ 1,  1,  0,  1,  0,  0,  0,  2,  1, 65]], dtype=int64)

In [21]:
s = "I like to learn about biology, human bodies and cure people"

In [31]:
data = [s]
prep_data = []

for d in data:
    d = [stemmer.stem(dt).lower() for dt in d.split(' ')]
    d = [dt for dt in d if dt not in stop_words]
    prep_data.append(' '.join(d))

In [32]:
prep_data

['like learn biology, human bodi cure peopl']

In [33]:
res = model.predict(prep_data)
lab_res = [titles[r] for r in res]

for r in res:
    print(titles[r])

medical_school


In [34]:
res = model.predict_proba(prep_data)

print(res)

[[0.05289794 0.05144183 0.03617351 0.06150873 0.04020889 0.14514107
  0.07093944 0.09272159 0.39733042 0.05163658]]
