# NLP Tutorial

### Elena Kochkina

NESTA HackSTIR

22.10.2019

# Part II. Text classification

## Imports

In [None]:
import re
import nltk
nltk.download('punkt')
nltk.download('reuters')
from nltk.corpus import reuters
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import gensim
import numpy
import warnings
warnings.filterwarnings("ignore")

## Reuters dataset

In [None]:
# Display the category names reported by the NLTK Reuters corpus reader.
reuters.categories()

Build the training and test sets from the Reuters dataset.

In [None]:
# Only a handful of topics are used here.
# To use every topic instead: categories = reuters.categories()
categories = ['wheat','gold','ship','coffee','grain']

documents_train, labels_train = [], []
documents_test, labels_test = [], []

for cat in categories:
  cat_fileids = reuters.fileids(cat)
  print (cat)
  print (len(cat_fileids))
  for fileid in cat_fileids:
    # File ids that start with 'training' go to the training set; every
    # other file id goes to the test set.
    # NOTE(review): a file listed under more than one of the selected
    # categories is appended once per category, each time with a different label.
    if fileid.startswith('training'):
      docs, labels = documents_train, labels_train
    else:
      docs, labels = documents_test, labels_test
    docs.append(reuters.raw(fileid))
    labels.append(cat)

In [None]:
# Quick sanity check: size of the training set, then its first raw document.
n_train_documents = len(documents_train)
print(n_train_documents)
first_train_document = documents_train[0]
print(first_train_document)

## Text preprocessing

In [None]:
def preprocess_document(text):
  """Lowercase `text` and keep only ASCII letters, digits and spaces.

  Args:
    text: raw document string.

  Returns:
    The cleaned string: lowercased, whitespace runs collapsed to a single
    space, and every other non-alphanumeric character removed.
  """
  text = text.lower()
  # Collapse whitespace (newlines, tabs, ...) to a space *before* stripping
  # other characters; deleting a line break outright would glue the words on
  # either side of it into a single token.
  text = re.sub(r'\s+', ' ', text)
  return re.sub(r'[^A-Za-z0-9 ]+', '', text)

# The same cleaning is applied to both splits.
documents_train_preprocessed = [preprocess_document(d) for d in documents_train]
documents_test_preprocessed = [preprocess_document(d) for d in documents_test]
  

## BoW

In [None]:
# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training documents only and then reused for the test set.
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(documents_train_preprocessed)
X_train_bow = vectorizer.transform(documents_train_preprocessed)
X_test_bow = vectorizer.transform(documents_test_preprocessed)

## Classifier with BoW features

In [None]:
# Linear SVM trained on the bag-of-words counts.
# random_state pins the solver's pseudo-random number generation so the
# accuracy reported below is repeatable across runs.
clf = LinearSVC(random_state=0)
clf.fit(X_train_bow, labels_train)
Y_pred_bow = clf.predict(X_test_bow)

## Evaluation

In [None]:
# Accuracy of the BoW classifier's predictions against the test labels.
accuracy_bow = accuracy_score(labels_test, Y_pred_bow)
print(accuracy_bow)

## Word2vec

Download the word2vec model pre-trained on the Google News corpus.

In [None]:
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
!gzip -d GoogleNews-vectors-negative300.bin.gz
!ls

Load the model.

In [None]:
# Load the vectors from the binary word2vec file in the working directory.
word2vec_path = 'GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
def preprocess_to_avgw2v(documents):
  """Represent each document as the mean of its word2vec word vectors.

  Each document is tokenized with `nltk.word_tokenize`, English stopwords are
  dropped, and the vectors of the remaining tokens that are present in the
  global `model` are averaged.

  Args:
    documents: iterable of document strings.

  Returns:
    A list with one 300-dimensional numpy vector per document, in input
    order. A document with no in-vocabulary tokens gets the all-zero vector.
  """
  num_features = 300
  # Build the stopword set once instead of once per document.
  stops = set(stopwords.words("english"))

  documents_avgw2v = []
  for d in documents:
    words = [w for w in nltk.word_tokenize(d) if w not in stops]
    temp_rep = numpy.zeros(num_features)
    num_found = 0

    for w in words:
      if w in model:
        temp_rep += model[w]
        num_found += 1

    # Divide by the number of vectors actually summed: dividing by len(words)
    # would shrink the mean by the share of out-of-vocabulary tokens, and
    # would produce NaNs (0/0) for a document with no tokens at all.
    if num_found:
      temp_rep /= num_found
    documents_avgw2v.append(temp_rep)

  return documents_avgw2v
    

In [None]:
# Averaged word2vec features for both splits, built from the cleaned text.
X_train_w2v = preprocess_to_avgw2v(documents=documents_train_preprocessed)
X_test_w2v = preprocess_to_avgw2v(documents=documents_test_preprocessed)

## Classifier with Word2vec features

In [None]:
# Linear SVM trained on the averaged word2vec features.
# random_state pins the solver's pseudo-random number generation so the
# accuracy reported below is repeatable across runs.
# Note: this rebinds `clf`, replacing the classifier fitted on BoW features.
clf = LinearSVC(random_state=0)
clf.fit(X_train_w2v, labels_train)
Y_pred_w2v = clf.predict(X_test_w2v)

## Evaluation

In [None]:
# Accuracy of the word2vec classifier's predictions against the test labels.
accuracy_w2v = accuracy_score(labels_test, Y_pred_w2v)
print(accuracy_w2v)