In [10]:
import pandas as pd
import math
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter("ignore")

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import string

In [11]:
#Tokenizer with added lemmatization and stemming options
class My_Tokenizer(object):
    def __init__(self, lemma=False, stem=False):
        self.lemma = lemma
        self.stem = stem
        self.Lemmatizer = WordNetLemmatizer()
        self.Stemmer = PorterStemmer()
        self.translate_table = dict((ord(char), None) for char in string.punctuation)  
    def __call__(self, sentence):
        #remove punctuation
        sentence = sentence.translate(self.translate_table)
        if self.lemma:
            return [self.Lemmatizer.lemmatize(t) for t in word_tokenize(sentence)]
        elif self.stem:
            return [self.Stemmer.stem(t) for t in word_tokenize(sentence)]
        else:
            return [t for t in word_tokenize(sentence)]

In [12]:
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

glove_loaded = load_glove_model("glove.6B.300d.txt")

Loading Glove Model
400000 words loaded!


In [14]:
import pickle
with open('data.pickle', 'wb') as f:
    pickle.dump(glove_loaded, f, pickle.HIGHEST_PROTOCOL)
f.close()

In [25]:
with open('data.pickle', 'r') as f:
    glove_loaded = f


In [28]:



class glove():
    def __init__(self) -> None:
        #self.glove = glove_loaded
        with open('data.pickle', 'r') as f:
            self.glove = pickle.load(f)
        self.tokenizer = My_Tokenizer()
    def fit():
        pass
    def fit_transform(self, data):
        return self.transform(data)
    def transform(self, data):
        results = []
        for document in data:
            document = self.tokenizer(document)
            vector = np.zeros(300)
            for word in document:
                if word in self.glove:
                    vector += self.glove[word]
            results.append(vector)
        results = np.array(results)
        return results



In [29]:


dataset = pd.read_csv("./dataframes/dataset.csv")

X = dataset["X"].to_numpy()
Y = dataset["Y"].to_numpy()

#split into train and test dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42, shuffle=True)

#Preprocessing methods
method1 = TfidfVectorizer(tokenizer=My_Tokenizer(),stop_words="english", lowercase=True, strip_accents="unicode") 
method2 = TfidfVectorizer(tokenizer=My_Tokenizer(stem=True),stop_words="english", lowercase=True, strip_accents="unicode")
method3 = TfidfVectorizer(tokenizer=My_Tokenizer(lemma=True),stop_words="english", lowercase=True, strip_accents="unicode")
method4 = glove()
dataset_methods = [method1, method2, method3]
#print(dataset_methods)
vectorizer = method4
print(Y_train)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

model = KNN(n_neighbors=5)
model = MLPClassifier(hidden_layer_sizes=(5,3))

print(model.fit(X_train,Y_train))

print(sum(model.predict(X_test) == Y_test)/len(Y_test))



['unproductive' 'unproductive' 'unproductive' ... 'unproductive'
 'unproductive' 'unproductive']


ValueError: I/O operation on closed file.

In [None]:
print(sum(model.predict(X_test) == Y_test)/len(Y_test))


In [None]:
#prediction count
unique, counts = np.unique(model.predict(X_test), return_counts=True)
print(unique)
print(counts)
#true count
unique, counts = np.unique(Y_test, return_counts=True)
print(unique)
print(counts)


In [79]:
value = vectorizer.transform(["today lecture will be on C++ arrays"])
print(model.predict(value))

['productive']
