In [6]:
import pandas as pd
import math
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter("ignore")
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import string

In [2]:
#Tokenizer with added lemmatization and stemming options
class My_Tokenizer(object):
    """Callable tokenizer with optional lemmatization or stemming.

    Calling an instance on a string strips ASCII punctuation, splits the text
    with NLTK's ``word_tokenize``, lower-cases every token, drops English
    stop words and finally normalises the surviving tokens.

    Parameters
    ----------
    lemma : bool, default False
        Lemmatize each kept token with ``WordNetLemmatizer``.
    stem : bool, default False
        Stem each kept token with ``PorterStemmer``. Ignored when ``lemma``
        is also true, because lemmatization is checked first.
    """

    def __init__(self, lemma=False, stem=False):
        self.lemma = lemma
        self.stem = stem
        self.Lemmatizer = WordNetLemmatizer()
        self.Stemmer = PorterStemmer()
        # str.translate table that deletes every character in string.punctuation.
        self.translate_table = {ord(char): None for char in string.punctuation}
        # A frozenset makes the per-token stop-word test O(1); a list is
        # scanned linearly for every single token of every document.
        self.stopwords = frozenset(stopwords.words('english'))

    def __call__(self, sentence):
        """Return the list of normalised, non-stop-word tokens of ``sentence``."""
        # NOTE(review): punctuation is removed before the stop-word lookup, so
        # a stop word that contains an apostrophe can never match its stripped
        # form in the text — confirm whether that is intended.
        sentence = sentence.translate(self.translate_table)

        # Pick the normaliser once per call instead of once per token.
        if self.lemma:
            normalise = self.Lemmatizer.lemmatize
        elif self.stem:
            normalise = self.Stemmer.stem
        else:
            normalise = None

        tokens = []
        for word in word_tokenize(sentence):
            lowered = word.lower()
            if lowered in self.stopwords:
                continue
            tokens.append(lowered if normalise is None else normalise(lowered))
        return tokens

In [3]:
class glove():
    """Vectorizer that embeds a document as the sum of its word vectors.

    It exposes ``fit`` / ``transform`` / ``fit_transform`` so it can be used
    in place of the TF-IDF vectorizers. There is nothing to learn from the
    training data, so ``fit`` is a no-op.

    Parameters
    ----------
    path : str, default 'data.pickle'
        Pickle file holding the word-vector lookup.
        Presumably a ``{word: vector}`` mapping — verify against the file.
    dim : int, default 300
        Length of each word vector, and therefore of each document vector.
    """

    def __init__(self, path='data.pickle', dim=300) -> None:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load embedding files that come from a trusted source.
        with open(path, 'rb') as f:
            self.glove = pickle.load(f)
        self.dim = dim
        self.tokenizer = My_Tokenizer()

    def fit(self, data=None, y=None):
        """Do nothing and return ``self``; the word vectors are pre-trained."""
        return self

    def fit_transform(self, data, y=None):
        """Equivalent to ``transform(data)``; ``y`` is accepted and ignored."""
        return self.fit(data, y).transform(data)

    def transform(self, data):
        """Embed every document in ``data``.

        Each document is tokenized with ``self.tokenizer``; the vectors of the
        tokens found in the lookup are summed. Unknown tokens are skipped, so
        a document with no known token maps to the zero vector.

        Returns
        -------
        numpy.ndarray of shape (n_documents, dim)
        """
        results = []
        for document in data:
            vector = np.zeros(self.dim)
            for word in self.tokenizer(document):
                if word in self.glove:
                    vector += self.glove[word]
            results.append(vector)
        # The reshape keeps the output 2-D even when ``data`` is empty.
        return np.array(results).reshape(len(results), self.dim)



In [10]:
# Read the labelled corpus; column "X" feeds the tokenizer and column "Y" holds the class labels.
dataset = pd.read_csv("./dataframes/dataset.csv")

# Pull both columns out as NumPy arrays for scikit-learn.
X, Y = (dataset[column].to_numpy() for column in ("X", "Y"))

In [12]:


# One seed for every stochastic step, so the printed accuracy is reproducible
# after Restart & Run All.
RANDOM_STATE = 42

#split into train and test dataset (15% held out for evaluation)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=RANDOM_STATE, shuffle=True)

#Preprocessing methods
# method1-3: TF-IDF over plain, stemmed and lemmatized tokens respectively.
method1 = TfidfVectorizer(tokenizer=My_Tokenizer(),stop_words="english", lowercase=True, strip_accents="unicode") 
method2 = TfidfVectorizer(tokenizer=My_Tokenizer(stem=True),stop_words="english", lowercase=True, strip_accents="unicode")
method3 = TfidfVectorizer(tokenizer=My_Tokenizer(lemma=True),stop_words="english", lowercase=True, strip_accents="unicode")
# method4: summed pre-trained word vectors.
method4 = glove()
dataset_methods = [method1, method2, method3]
#print(dataset_methods)

# The experiment below uses the GloVe embedding; swap in method1-3 to compare.
vectorizer = method4
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Without random_state the weight initialisation and batch shuffling differ
# on every run, so the score would not be repeatable.
model = MLPClassifier(hidden_layer_sizes=(5,3), random_state=RANDOM_STATE)

model.fit(X_train,Y_train)

# score() returns the mean accuracy on the held-out set.
print(model.score(X_test, Y_test))



MLPClassifier(hidden_layer_sizes=(5, 3))
0.9927398528480241


In [None]:
#prediction count
unique, counts = np.unique(model.predict(X_test), return_counts=True)
print(unique)
print(counts)
#true count
unique, counts = np.unique(Y_test, return_counts=True)
print(unique)
print(counts)


In [19]:
# Class balance of the full dataset: distinct labels on the first line, their counts on the second.
unique, counts = np.unique(Y, return_counts=True)
print(unique, counts, sep="\n")

['productive' 'unproductive']
[214435 196023]


In [None]:
#prediction count
unique, counts = np.unique(model.predict(X_test), return_counts=True)
print(unique)
print(counts)
#true count
unique, counts = np.unique(Y_test, return_counts=True)
print(unique)
print(counts)


In [58]:
# Smoke test: embed one hand-written paragraph and print the label the model predicts for it.
value = vectorizer.transform([
    "In condensed matter physics, a Bose–Einstein condensate (BEC) is a state of matter that is typically formed when a gas of bosons at very low densities is cooled to temperatures very close to absolute zero (−273.15 °C or −459.67 °F). "
    "Under such conditions, a large fraction of bosons occupy the lowest quantum state, at which microscopic quantum mechanical phenomena, particularly wavefunction interference, become apparent macroscopically."
])
print(model.predict(value))

['productive']


In [56]:
# save model
# The context manager closes the file handle even if pickling fails;
# passing a bare open(...) to pickle.dump leaves it open.
with open("model.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

# load model
# SECURITY: only unpickle files you created yourself; pickle.load can run arbitrary code.
#with open("model.pickle", "rb") as model_file:
#    loaded_model = pickle.load(model_file)