In [5]:
import gensim
from gensim import utils
import numpy as np
import sys
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords
import os
import sys
import numpy as np
import string
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import pickle
import time

In [6]:
# Fetch the 20 Newsgroups corpus; subset='all' asks for every split instead of
# only the train or only the test portion.
news = fetch_20newsgroups(subset='all')

In [10]:
# Inspect the container type that holds the labels.
type(news.target)

numpy.ndarray

In [10]:
# Module-level English Snowball stemmer; lemmatize_stemming() reads this name.
stemmer = SnowballStemmer("english")

In [11]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the resulting lemma.

    A fresh ``WordNetLemmatizer`` performs the lemma step (``pos='v'``) and the
    module-level ``stemmer`` performs the stem step. The value returned is
    whatever ``stemmer.stem`` produces for the lemma.
    """
    lemmatizer = WordNetLemmatizer()
    verb_lemma = lemmatizer.lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

In [12]:
def preprocess(text):
    """Tokenize ``text`` and return its lemmatized, stemmed content tokens.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only when
    it is longer than three characters and is not in gensim's ``STOPWORDS``;
    each kept token is passed through ``lemmatize_stemming``. Order is preserved.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text)
    return [
        lemmatize_stemming(tok)
        for tok in tokens
        if len(tok) > 3 and tok not in stop_words
    ]

In [13]:
def load_glove_model(gloveFile):
    """Load GloVe-style text embeddings from ``gloveFile`` into a dict.

    Every non-blank line is split on whitespace: the first field is the token,
    the remaining fields are parsed as the float components of its vector.

    Parameters
    ----------
    gloveFile : str or path-like
        Path to a UTF-8 encoded embeddings text file.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D numpy float array. If a token appears
        more than once, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the handle even if parsing raises part-way.
    with open(gloveFile, 'r', encoding="utf8") as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a stray newline at end of file): no token to read.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

In [14]:
def sent_embedding(model,sentence):
    """Average the embedding vectors of the in-vocabulary tokens of ``sentence``.

    Parameters
    ----------
    model : dict
        Token -> 1-D numpy vector lookup, such as the dict returned by
        ``load_glove_model``.
    sentence : iterable of str
        Tokens of a single document.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model``
        (out-of-vocabulary tokens are ignored). If no token is found, a zero
        vector with the model's dimensionality is returned so that every
        document yields a vector of the same shape.
    """
    # Read the tokens from the `sentence` argument, not from a module-level
    # variable, so the result does not depend on how the caller names its loop
    # variable.
    known_vectors = [model[token] for token in sentence if token in model]
    if not known_vectors:
        # np.mean([]) would give a NaN scalar; fall back to a correctly shaped zero vector.
        reference = next(iter(model.values()), None)
        dim = 0 if reference is None else len(reference)
        return np.zeros(dim)
    return np.mean(known_vectors, axis=0)

In [15]:
def train(classifier, X, y, test_size=0.2, random_state=11):
    """Fit ``classifier`` on a train split of ``(X, y)`` and report held-out accuracy.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``; it is fitted in place.
    X, y :
        Features and labels, passed straight to ``train_test_split``.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.2, the value that was
        previously hard-coded.
    random_state : int, optional
        Forwarded to ``train_test_split``. Defaults to 11, the value that was
        previously hard-coded.

    Returns
    -------
    The same ``classifier`` object, after fitting.

    Notes
    -----
    The printed duration covers both the split and the fit, not the scoring.
    """
    start = time.time()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    classifier.fit(X_train, y_train)
    end = time.time()

    accuracy = classifier.score(X_test, y_test)
    print("Accuracy: " + str(accuracy) + ", Time duration: " + str(end - start))
    return classifier

In [16]:
# Fit a TF-IDF vectorizer on the raw posts. The stop-word list is NLTK's English
# stop words plus each individual punctuation character.
vectorizer = TfidfVectorizer( stop_words=stopwords.words('english') + list(string.punctuation))
# NOTE: `fit` holds the matrix returned by fit_transform, not the fitted vectorizer.
fit = vectorizer.fit_transform(news.data)
print(type(news.data))
print(fit.shape)

<class 'list'>
(18846, 173618)


In [17]:
# Linear SVM on the TF-IDF features.
classifier = LinearSVC(C=40,random_state=11)
targets = news.target
# NOTE(review): this fits on every document. The train() call in the next cell
# fits this same object again on a train split, so this fit presumably gets
# replaced — confirm whether it is still needed.
classifier.fit(fit, targets)



LinearSVC(C=40, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=11, tol=0.0001,
     verbose=0)

In [18]:
# Split the TF-IDF matrix, fit on the train part, and print held-out accuracy and elapsed time.
train(classifier,fit,targets)

Accuracy: 0.9336870026525199, Time duration: 64.79490399360657


LinearSVC(C=40, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=11, tol=0.0001,
     verbose=0)

In [19]:
# Run preprocess() over every post to get token lists for the GloVe experiment.
processed_docs = []

# The loop variable `doc` stays bound at module level after the loop finishes.
for doc in news.data:
    processed_docs.append(preprocess(doc))

In [20]:
# Sanity check: show the first two raw posts, then their preprocessed token lists.
print(news.data[:2])
print(processed_docs[:2])

["From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\nSubject: Pens fans reactions\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\nLines: 12\nNNTP-Posting-Host: po4.andrew.cmu.edu\n\n\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n", 'From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson)\nSubject: Which hi

In [21]:
# Load the GloVe 6B 300-dimensional vectors.
# The file location can be overridden with the GLOVE_PATH environment variable so
# the notebook is not tied to one machine; the default is the original local path.
GLOVE_PATH = os.environ.get("GLOVE_PATH", "E:/Genesis/glove.6B/glove.6B.300d.txt")
model=load_glove_model(GLOVE_PATH)

Loading Glove Model
Done. 400000  words loaded!


In [23]:
# Build one sentence embedding per preprocessed document via sent_embedding().
embeddings =[]
# The loop variable `doc` stays bound at module level while and after this loop runs.
for doc in processed_docs: 
    embeddings.append(sent_embedding(model, doc))

In [24]:
#how does embedding look for one doc
embeddings[0]

array([-8.21768983e-02,  1.91594415e-01,  4.73643678e-02, -1.19968824e-01,
        3.80117119e-02,  6.53641288e-02, -1.10010983e-01, -1.18436051e-01,
        1.43357983e-02, -6.46838392e-01,  6.71145102e-02, -7.81769000e-02,
       -3.94446780e-02,  1.70745763e-03,  1.29294576e-01,  1.27127034e-01,
       -1.01492898e-01, -1.45242034e-02,  6.80205085e-03, -2.95049322e-02,
        5.28671593e-02, -6.70104576e-02,  2.88601288e-02, -4.07195492e-02,
       -4.38682542e-02,  5.49512542e-02,  6.45782610e-02, -5.96864105e-02,
        1.35152871e-01, -3.59614407e-03,  2.82939424e-02, -3.27766102e-03,
       -2.27027288e-02, -2.44179661e-02, -9.11055593e-01, -7.60123220e-03,
       -1.97699644e-02,  1.49644068e-01, -3.85641186e-03, -2.80517186e-02,
        2.72594576e-02, -6.03006780e-03, -4.19913220e-02, -6.23525254e-02,
       -8.95754203e-02,  1.44762702e-01,  8.09564551e-02,  7.61392847e-02,
       -7.53171068e-02,  9.25236908e-02, -3.16560458e-02,  1.18373492e-01,
        7.17933088e-02, -

In [25]:
# Separate LinearSVC for the GloVe features, built with the same C and random_state
# as the TF-IDF classifier.
classifier_emb = LinearSVC(C=40, random_state=11)

In [26]:
# Fit and score on the document embeddings; train() returns the fitted classifier.
classifier_emb=train(classifier_emb, embeddings, news.target)



Accuracy: 0.7342175066312997, Time duration: 210.19688820838928


In [28]:
# TF-IDF gives far better accuracy than the averaged GloVe embeddings, so bundle
# the TF-IDF vectorizer and the classifier into a single pipeline.
tfidfclassifier = Pipeline([
    (
        'vectorizer',
        TfidfVectorizer(
            stop_words=stopwords.words('english') + list(string.punctuation)
        ),
    ),
    ('classifier', LinearSVC(C=40, random_state=11)),
])

train(tfidfclassifier, news.data, news.target)



Accuracy: 0.9336870026525199, Time duration: 90.04454851150513


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=11, tol=0.0001,
     verbose=0))])

In [31]:
# Persist the fitted pipeline. The context manager closes the file, so the pickle
# is completely written to disk before anything reads it back.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(tfidfclassifier, model_file)

In [32]:
# Reload the pipeline from disk; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load pickle files you trust.
with open('model.pkl', 'rb') as model_file:
    MODEL = pickle.load(model_file)

In [36]:
def predict_news(model, news_str, newscategory):
    """Return the category that ``model`` predicts for a single text.

    ``news_str`` is wrapped in a one-element list and handed to
    ``model.predict``; the first predicted value is then used as an index into
    ``newscategory`` and the entry found there is returned.
    """
    predicted = model.predict([news_str])
    label_index = predicted[0]
    return newscategory[label_index]

In [37]:
# Smoke test: classify one hand-written sentence with the reloaded pipeline and
# print the predicted newsgroup name.
test_str="India scored 3 goals against germany yesterday!"
result = predict_news(MODEL, test_str, news.target_names)
print(result)

rec.sport.hockey
