In [53]:
import pandas as pd
import numpy as np
from collections import defaultdict
from nltk.tag import pos_tag
from nltk.tag.perceptron import AveragedPerceptron
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
np.random.seed(420)

In [54]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /Users/dpapp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/dpapp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/dpapp/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [55]:
Corpus = pd.read_csv(r"input/603_num.tsv", sep='\t')

Corpus.head()

Unnamed: 0,name,prefix,features,hours,cluster
0,Linear Regression,DS,Linear RegressionCorrelationRegression Analysi...,0.75,1
1,Statistical Analysis,DS,Statistical AnalysisDescriptive StatisticsInfe...,1.5,1
2,Logarithms,DS,LogarithmsExponential FunctionsLogarithmic Fun...,2.0,1
3,Arithmetics,DS,ArithmeticOperationsAdditionSubtractionMultipl...,0.5,1
4,Euclidean distance,DS,Euclidean DistanceDistance FormulaPythagorean ...,2.25,1


In [60]:
# Corpus['text'].dropna(inplace=True)

Corpus['features'] = [entry.lower() for entry in Corpus['features']]

Corpus['tokens']= [word_tokenize(entry) for entry in Corpus['features']]

tag_map = defaultdict(lambda : wn.NOUN)

tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index, entry in enumerate(Corpus['tokens']):
    final_words = []
    word_lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_final = word_lemmatized.lemmatize(word, tag_map[tag[0]])
            final_words.append(word_final)
    Corpus.loc[index, 'text_final'] = str(final_words)

In [63]:
Corpus['label'] = ''
Corpus.head()

Unnamed: 0,name,prefix,features,hours,cluster,tokens,text_final,label
0,Linear Regression,DS,linear regressioncorrelationregression analysi...,0.75,1,"[linear, regressioncorrelationregression, anal...","['linear', 'regressioncorrelationregression', ...",
1,Statistical Analysis,DS,statistical analysisdescriptive statisticsinfe...,1.5,1,"[statistical, analysisdescriptive, statisticsi...","['statistical', 'analysisdescriptive', 'statis...",
2,Logarithms,DS,logarithmsexponential functionslogarithmic fun...,2.0,1,"[logarithmsexponential, functionslogarithmic, ...","['logarithmsexponential', 'functionslogarithmi...",
3,Arithmetics,DS,arithmeticoperationsadditionsubtractionmultipl...,0.5,1,[arithmeticoperationsadditionsubtractionmultip...,['arithmeticoperationsadditionsubtractionmulti...,
4,Euclidean distance,DS,euclidean distancedistance formulapythagorean ...,2.25,1,"[euclidean, distancedistance, formulapythagore...","['euclidean', 'distancedistance', 'formulapyth...",


In [64]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'], Corpus['label'], test_size=0.3)

Encoder = LabelEncoder()

Train_Y = Encoder.fit_transform(Train_Y)

Test_Y = Encoder.fit_transform(Test_Y)

Tfidf_vect = TfidfVectorizer(max_features=5000)

Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)

Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [65]:
Encoder = LabelEncoder()

Train_Y = Encoder.fit_transform(Train_Y)

Test_Y = Encoder.fit_transform(Test_Y)

In [66]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [67]:
print(Tfidf_vect.vocabulary_)

{'linear': 432, 'regressioncorrelationregression': 700, 'analysisleast': 50, 'squarescoefficient': 801, 'determinationregression': 212, 'lineindependent': 434, 'variabledependent': 910, 'variablepredictionoutliersresidualsconfidence': 911, 'intervalhypothesis': 403, 'testinganovastatistical': 848, 'inferencemultiple': 386, 'regressionmodel': 701, 'selectionregularizationridge': 752, 'regressionthis': 703, 'module': 522, 'design': 209, 'introduce': 404, 'student': 817, 'regression': 699, 'powerful': 609, 'statistical': 807, 'technique': 838, 'use': 901, 'model': 500, 'relationship': 707, 'two': 886, 'variable': 909, 'interactive': 398, 'lesson': 426, 'activity': 10, 'learn': 420, 'build': 109, 'interpret': 400, 'result': 723, 'make': 450, 'prediction': 613, 'draw': 239, 'conclusion': 159, 'data': 192, 'cover': 182, 'assumption': 87, 'limitation': 430, 'well': 930, 'validate': 906, 'improve': 380, 'also': 40, 'different': 222, 'type': 887, 'include': 381, 'simple': 770, 'multiple': 527, 

In [68]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  100.0


In [71]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='poly', degree=3, gamma='auto', max_iter=100)
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

ValueError: The number of classes has to be greater than one; got 1 class