In [23]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sqlite3 import Error
from sklearn.ensemble import RandomForestClassifier
import os
import pickle

In [79]:
data_path = '../../dataset'

labels = []
tokens = []

titles = []

for i, f in enumerate(os.listdir(data_path)):
    full_path = f'{ data_path }/{ f }'
    titles.append(f.split('.')[0])
    
    with open(full_path, 'r', encoding='utf8', errors='ignore') as infile:
        for line in infile:
            tokens.append(line.strip())
            labels.append(i)

df = pd.DataFrame(list(zip(tokens, labels)), columns=['sent', 'label'])

In [80]:
df.sample(5)

Unnamed: 0,sent,label
183,"As a result, the term market economy refers to...",0
170,"The word economy is Greek and means ""household...",0
80,focused on attracting new business and new cli...,0
551,"Once you’ve got the basic IT skills down, look...",2
56,develop effectual reasoning.,0


In [81]:
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

df['cleaned'] = df['sent'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in stop_words]).lower())

In [82]:
df.sample(5)

Unnamed: 0,sent,label,cleaned
656,I guide emerging designers to develop their vi...,3,i guid emerg design develop visual intuit i de...
223,The nucleus of an atom contains protons and ne...,1,the nucleu atom contain proton neutron
248,"Liquids, solids, and gasses all may be mixed t...",1,liquid solid gass may mix form colloid dispers
29,include several introductory financial managem...,0,includ sever introductori financi manag cours ...
129,A dividend income stock will usually have a hi...,0,a dividend incom stock usual higher dividend y...


In [83]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(df['cleaned']).toarray()
final_features.shape

(801, 886)

In [84]:
x = df['cleaned']
y = df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [85]:
pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=600)),
                     ('clf', RandomForestClassifier())])

model = pipeline.fit(x_train, y_train)
with open('RandomForest.pickle', 'wb') as f:
    pickle.dump(model, f)

ytest = np.array(y_test)

In [86]:
classification_report(y_test, model.predict(x_test))

'              precision    recall  f1-score   support\n\n           0       0.66      0.98      0.79        47\n           1       0.98      0.78      0.87        58\n           2       0.93      0.91      0.92        44\n           3       0.93      0.75      0.83        52\n\n    accuracy                           0.85       201\n   macro avg       0.87      0.85      0.85       201\nweighted avg       0.88      0.85      0.85       201\n'

In [87]:
confusion_matrix(y_test, model.predict(x_test))


array([[46,  0,  1,  0],
       [ 9, 45,  1,  3],
       [ 4,  0, 40,  0],
       [11,  1,  1, 39]], dtype=int64)

In [120]:
# s = ['I like to program and solve problems. I always excel at mathematics and problem solving. I would like to learn critical thinking.']
s = ['I love chemical reactions that we do in laboratory and classes. I also want to learn about nuclear theories. Sometimes i add to much volume of electron in my solution']

In [121]:
res = model.predict(s)
lab_res = [titles[r] for r in res]

for r in res:
    print(titles[r])

chemical_engineering
