In [1]:
import spacy 
import numpy as np
nlp = spacy.load('en_core_web_sm')

In [2]:
L  = list(nlp.vocab.strings)

In [3]:
numwords = len(L)
print(numwords)

84780


In [4]:
W2I = dict(zip(L,np.arange(numwords)))
I2W  = dict(zip(np.arange(numwords),L))

In [6]:
def oneHotVector(word,W2I,numwords):
    v = np.zeros(numwords)
    v[W2I[word]] = 1
    return v

In [7]:
v = oneHotVector('game',W2I,numwords)

In [11]:
v[49865]

1.0

In [10]:
W2I['game']

49865

In [12]:
doc = 'how are you today. I know most of the time how you feel.'
tokens = [token.text for token in nlp(doc)]

In [13]:
tokens

['how',
 'are',
 'you',
 'today',
 '.',
 'I',
 'know',
 'most',
 'of',
 'the',
 'time',
 'how',
 'you',
 'feel',
 '.']

In [15]:
v = np.zeros(numwords)
for token in tokens :
    v += oneHotVector(token, W2I,numwords)

In [16]:
v

array([0., 0., 0., ..., 0., 0., 0.])

In [18]:
v[W2I['.']]

2.0

# TFIDF Implementation from scratch:

In [19]:
from sklearn.datasets import fetch_20newsgroups as getData

In [20]:
corpus  = getData(subset='train',remove=('headers','footers','quotes'))

In [21]:
docs  = corpus.data
docs

['I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 "A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't an

In [22]:
len(docs)

11314

In [24]:
df = np.zeros(numwords)

In [26]:
for term in L:
    dft = 0
    for doc in docs[:100]:
        if term in doc:
            dft+=1
    df[W2I[term]]=dft

In [27]:
idf = np.log10(100/(df+1))

  idf = np.log10(100/df)


In [61]:
%%time
tf = np.zeros((100,numwords))
#for i,term in enumerate(L):
for j,doc in enumerate(docs[:100]):
    for i,term in enumerate(L):
        
        count = doc.split().count(term)
        #print(count,term)
        tf[j,i] = count
    

CPU times: total: 2min 34s
Wall time: 2min 39s


In [60]:
%%time
tf2 = np.zeros((100,numwords))
print(tf2.shape)
print(tf2[1,W2I['I']])
doc = docs[:100]
for (j,sent) in enumerate(doc):
     for token in nlp(sent):
        try:
     #       print(j,token)
            tf2[j,W2I[token.text]] += 1
        except KeyError :
            pass
#%time

(100, 84780)
0.0
CPU times: total: 4.3 s
Wall time: 4.3 s


In [57]:
W2I['enlighten']

KeyError: 'enlighten'

In [39]:
docs[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

## Topic Modelling:

In [71]:
def loadCorpus():
    corpus = getData(subset='train',remove=('headers','footers','quotes'),
                     categories=['alt.atheism','comp.graphics','soc.religion.christian'])
    return corpus

In [72]:
corpus = loadCorpus()
corpus

{'data': ["\n\nI would rather be at a higher risk of being killed than actually killed by\n                              ^^^^                      ^^^^^^^^\nmistake.  Though I do agree with the concept that the type D and E murderers\nare a massive waste of space and resources I don't agree with the concept:\n\n\tkilling is wrong\n\tif you kill we will punish you\n\tour punishment will be to kill you.\n\nSeems to be lacking in consistency.\n",
  'It\'s like refusing \'God\'s kingdom come\'.\n\nIn one of Jesus\' revelation in this century, "...same thing as in\nthe old days.  People refuse to believe my messengers.  Even when\nI was alive here on earth, they refuse Me.  What more when I am just\ntalking through somebody else?" (paraphrased).\n\nWith all the knowledge believers accumulated, He would think that\nwe would be \'enlightened\' enough to detect which ones are \n\'authentic and divine\' as opposed to \'evil or man-made\'.\n\nThese signs, these miracles, are you afraid that they

In [65]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [74]:
def buildTFIDFModel(docs):
    cv = CountVectorizer()
    ct = cv.fit(docs)
    counts = ct.transform(docs)
    tfidf = TfidfTransformer().fit(counts)
    return ct,tfidf

In [76]:
%%time
ct,tfidf = buildTFIDFModel(corpus.data)

CPU times: total: 625 ms
Wall time: 631 ms


In [79]:
def computeTFIDFFeatures(docs,ct,tfidf):
    counts = ct.transform(docs)
    xF = tfidf.transform(counts)
    return xF.toarray()

In [80]:
xF= computeTFIDFFeatures(corpus.data,ct,tfidf)

In [83]:
from sklearn.linear_model import LogisticRegression

In [84]:
model = LogisticRegression().fit(xF,corpus.target)

In [92]:
new_docs = ['god loves everyone ','OpenGL is very good','there is no one','internet is shit','there is no one']
new_xf = computeTFIDFFeatures(new_docs,ct,tfidf)
predicted  = model.predict(new_xf)

In [93]:
for i in range(len(new_docs)):
    print(corpus.target_names[predicted[i]])

soc.religion.christian
comp.graphics
alt.atheism
comp.graphics
alt.atheism
