# Step 1: Load a dataset 

In [1]:
# Load the 20 Newsgrous data set - a collection of 12k documents grouped among 20 different categories.
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [2]:
# Find information about data set
dir(twenty_train)
print(twenty_train.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [3]:
# Let's use the sklearn's feature CountVectorizer (Example)
#Check the full documentation here: 
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['This is a new document', 'another document presented at FMI', 'FMI is teaching information retrieval']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names()) # ALl vocabulary words found in corpus. 
#CHeck doc, you can set your own requested words
print(X.toarray())

['another', 'at', 'document', 'fmi', 'information', 'is', 'new', 'presented', 'retrieval', 'teaching', 'this']
[[0 0 1 0 0 1 1 0 0 0 1]
 [1 1 1 1 0 0 0 1 0 0 0]
 [0 0 0 1 1 1 0 0 1 1 0]]


In [4]:
# Let's go back to extracting features from the real text
# This will construct the vocabulary and document-term matrix
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

feature_names = count_vect.get_feature_names() # maps from feature index to string (word)
#print(feature_names)

inv_feature_names = { value:i for i,value in enumerate(feature_names) } 
#print(feature_names)
#print(inv_feature_names)

print(X_train_counts.shape) # Matrix of numDocuments X numFeatures (words)
#print(X_train_counts[0])

(11314, 130107)


In [5]:
# Matrix X_train contains the raw term frequencies. We can manually compute the normalized tf by: raw_score(word) / total length of document
# We can manually find TF-IDF by totalling columns where a word is != 0
# But the code is already written for us !
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape
#print(X_train_tfidf[0])

(11314, 130107)

In [6]:
print(twenty_train.target)

[7 4 4 ... 3 1 8]


# Step 2: Run Naive Bayes

In [10]:
# Train first
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target) # Each row is a training entry, the corresponding index of twenty_train is category


In [11]:
# Let's check the performance
import numpy as np

# Load test data
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

# Convert the test data through the same pipeline as above
count_vect = CountVectorizer(vocabulary = inv_feature_names) # IMportant thing - we have to use the same features (vocabulary words as before)
X_test_counts = count_vect.fit_transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)

print(X_test_tfidf.shape)

# Predict data
predicted = clf.predict(X_test_tfidf) # Will get an array of classifications
res = np.mean(predicted == twenty_test.target) # Check how many succeded in average
print("Predicted accuracy: {0}%".format(res*100.0))


(7532, 130107)
Predicted accuracy: 77.07116303770579%


# Step 3: Support Vector Machines (SVM)

In [63]:
# More about it here - https://scikit-learn.org/stable/modules/svm.html
from sklearn.pipeline import Pipeline  # Independent of SVM could be used for aboves too
from sklearn.linear_model import SGDClassifier

# Define a pipeline to write less code
svm_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), 
                    ('svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])


svm_clf.fit(twenty_train.data, twenty_train.target)
svm_predicted = svm_clf.predict(twenty_test.data)
res = np.mean(svm_predicted == twenty_test.target)
print("Predicted accuracy: {0}%".format(res*100.0))




Predicted accuracy: 82.38183749336166%


# Step 4: Optimizing with Grid search 
### Searches for parameter inside models (you could implement something similar for homework 2)


In [65]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range' : [(1,1), (1,2)], # Uni or bi gram ?
                'tfidf__use_idf' : (True, False), # Either to use idf or not
                'svm__alpha':(1e-2, 1e-3)}

gridSearch_svm = GridSearchCV(svm_clf, parameters, n_jobs=-1)
gridSearch_svm = gridSearch_svm.fit(twenty_train.data, twenty_train.target)


SyntaxError: invalid syntax (<ipython-input-65-d6316f08c20c>, line 9)

In [66]:
print("Best score: {0}".format( gridSearch_svm.best_score_))
print("Best params: {0}".format(gridSearch_svm.best_params_))

Best score: 0.8979140887396146
Best params: {'svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


# Homework 3 : Implement Naive bayes classifier yourself, similar to the one in the book. Use the same data set and make it close in terms of accuracy.

In [7]:
# Write your code here if you do it in python
import numpy as np

def trainMultinomialNB(trainMatrix, classOfDocument):
    documentsOfClass = []
    prior = []
    T = []
    nrOfClasses = 0
    N, nrOfWords = trainMatrix.shape
    # aflam numarul de clase
    for val in classOfDocument:
        if val > nrOfClasses:
            nrOfClasses = val
    nrOfClasses += 1
    for _ in range(nrOfClasses):
        documentsOfClass.append([])
    T = np.zeros((nrOfClasses, nrOfWords))
    condprob = np.zeros((nrOfClasses, nrOfWords))
    # aflam documentele care apartin unei clase
    for idx, val in enumerate(classOfDocument):
        documentsOfClass[val].append(idx)
    for c in range(nrOfClasses):
        Nc = len(documentsOfClass[c])
        prior.append(Nc/N)
        # calculam Tct
        for doc in documentsOfClass[c]:
            T[c] += trainMatrix[doc]
        sumOfClass = sum(T[c]) + nrOfWords
        # calculam condprob
        for word in range(nrOfWords):
            condprob[c,word] = np.log10((T[c,word] + 1)/sumOfClass)
    return prior, condprob

prior, condprob = trainMultinomialNB(X_train_tfidf, twenty_train.target)

In [8]:
def applyMultinomialNB(prior, condprob, doc):
    nrOfClasses, nrOfWords = condprob.shape
    N, _ = doc.shape
    arrayOfScores = []
    for idxOfDoc in range(N):
        score = []
        for c in range(nrOfClasses):
            score.append(np.log10(prior[c]))
            score[c] += condprob[c] * np.transpose(doc[idxOfDoc])
            #for word in range(nrOfWords):
            #    score[c] += condprob[c,word] * doc[0, word]
        maxim = score[0]
        idxOfMax = 0
        for c in range(nrOfClasses):
            if score[c] > maxim:
                maxim = score[c]
                idxOfMax = c
        arrayOfScores.append(idxOfMax)
    return arrayOfScores

In [9]:
# Load test data
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

# Convert the test data through the same pipeline as above
count_vect = CountVectorizer(vocabulary = inv_feature_names) # IMportant thing - we have to use the same features (vocabulary words as before)
X_test_counts = count_vect.fit_transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)


arrayOfScores = applyMultinomialNB(prior, condprob, X_test_tfidf)

#print(arrayOfScores)
#print(twenty_test.target)
res2 = np.mean(arrayOfScores == twenty_test.target) # Check how many succeded in average
print("Predicted accuracy: {0}%".format(res2*100.0))

Predicted accuracy: 77.07116303770579%


# Stemmer and stop words tutorial

In [72]:
import nltk
nltk.download()

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer): # derivation from base class, need to rewrite a few functions
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
    
# Define a pipeline to write less code
svm_clf = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                    ('svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])


svm_clf.fit(twenty_train.data, twenty_train.target)
svm_predicted = svm_clf.predict(twenty_test.data)
res = np.mean(svm_predicted == twenty_test.target)
print("Predicted accuracy: {0}%".format(res*100.0))

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml




Predicted accuracy: 81.94370685077004%
