In [2]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.1/587.7 MB 2.3 MB/s eta 0:04:15
     ---------------------------------------- 0.4/587.7 MB 3.7 MB/s eta 0:02:37
     ---------------------------------------- 0.5/587.7 MB 4.1 MB/s eta 0:02:24
     ---------------------------------------- 1.0/587.7 MB 5.2 MB/s eta 0:01:53
     ---------------------------------------- 1.3/587.7 MB 5.9 MB/s eta 0:01:40
     ---------------------------------------- 1.9/587.7 MB 6.6 MB/s eta 0:01:30
     ---------------------------------------- 2.4/587.7 MB 7.3 MB/s eta 0:01:21
     ---------------------------------------- 3.0/587.7 MB 8.0 MB/s eta 0:01:14
     ---------------------------------------- 3.6/587.7 MB 8.5 MB/s eta 0:01:09
     -------------------------

     ---------------------------------- --- 532.8/587.7 MB 6.7 MB/s eta 0:00:09
     ---------------------------------- --- 532.9/587.7 MB 6.7 MB/s eta 0:00:09
     ---------------------------------- --- 533.1/587.7 MB 6.5 MB/s eta 0:00:09
     ---------------------------------- --- 533.4/587.7 MB 6.5 MB/s eta 0:00:09
     ---------------------------------- --- 533.6/587.7 MB 6.5 MB/s eta 0:00:09
     ---------------------------------- --- 533.8/587.7 MB 6.3 MB/s eta 0:00:09
     ---------------------------------- --- 533.8/587.7 MB 6.2 MB/s eta 0:00:09
     ---------------------------------- --- 534.2/587.7 MB 6.1 MB/s eta 0:00:09
     ---------------------------------- --- 534.4/587.7 MB 6.0 MB/s eta 0:00:09
     ---------------------------------- --- 534.6/587.7 MB 6.0 MB/s eta 0:00:09
     ---------------------------------- --- 534.7/587.7 MB 6.0 MB/s eta 0:00:09
     ---------------------------------- --- 534.8/587.7 MB 5.8 MB/s eta 0:00:10
     ---------------------------------- 

In [3]:
import spacy

# Load the large English pipeline downloaded above; the resulting `nlp`
# object is reused by the later cells.
nlp = spacy.load("en_core_web_lg")

In [8]:
# Three common words plus "kem", which the recorded output reports as having
# no vector and being out-of-vocabulary.
doc = nlp("dog cat banana kem")

for token in doc:
    # OOV = out of vocabulary
    print(token.text, "Vector:", token.has_vector, "OOV:", token.is_oov)

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
kem Vector: False OOV: True


In [10]:
# Dimensionality of the first token's word vector.
doc[0].vector.shape

(300,)

In [11]:
# NOTE(review): despite the name, this holds the result of running the whole
# pipeline on the text "bread" (presumably a one-word Doc), not a single token.
base_token = nlp("bread")
base_token.vector.shape

(300,)

In [12]:
# Compare each word in this list against the "bread" reference built earlier.
doc = nlp("bread sandwich burger car tiger human wheat")

for token in doc:
    # One line per word: "<word> <-> bread <similarity score>"
    print(f"{token.text} <-> {base_token.text}", token.similarity(base_token))

bread <-> bread 0.9999999744752309
sandwich <-> bread 0.6341067010130894
burger <-> bread 0.47520687769584247
car <-> bread 0.06451532596945217
tiger <-> bread 0.04764611272488976
human <-> bread 0.2151154210812192
wheat <-> bread 0.615036141030184


In [13]:
def print_similarity(base_word, word_to_compare):
    """Print how similar each word in ``word_to_compare`` is to ``base_word``.

    Both arguments are run through the module-level ``nlp`` pipeline.
    ``word_to_compare`` may hold several words; one line is printed per
    resulting token, in the form ``<token> <-> <base> <score>``.
    Nothing is returned.
    """
    base_token = nlp(base_word)
    for token in nlp(word_to_compare):
        print(f"{token.text} <-> {base_token.text}", token.similarity(base_token))

In [14]:
# Words that tend to appear in the same contexts end up with more similar vectors.
print_similarity("iphone", "apple samsung iphone dog kitten")

apple <-> iphone 0.4387907748060368
samsung <-> iphone 0.6708590303423401
iphone <-> iphone 0.9999999983096304
dog <-> iphone 0.08211864228011527
kitten <-> iphone 0.10222317834969896


In [15]:
# Look up the four word vectors used in the analogy check in one pass.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# king - man + woman; compared against `queen` in the next cell.
result = king - man + woman

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Each vector is wrapped in a list so both inputs are 2-D; the recorded
# result is a 1x1 matrix holding the similarity of `result` to `queen`.
print(cosine_similarity([result], [queen]))

[[0.6178014]]


## Text Classification Using spaCy Word Vectors

In [17]:
import pandas as pd

In [18]:
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv("Fake_Real_Data.csv")

In [19]:
# Preview the first rows: a free-text column and a Fake/Real label column.
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [20]:
# Full text of the first article (row label 0), fetched with a single .loc lookup.
df.loc[0, "Text"]

' Top Trump Surrogate BRUTALLY Stabs Him In The Back: ‘He’s Pathetic’ (VIDEO) It s looking as though Republican presidential candidate Donald Trump is losing support even from within his own ranks. You know things are getting bad when even your top surrogates start turning against you, which is exactly what just happened on Fox News when Newt Gingrich called Trump  pathetic. Gingrich knows that Trump needs to keep his focus on Hillary Clinton if he even remotely wants to have a chance at defeating her. However, Trump has hurt feelings because many Republicans don t support his sexual assault against women have turned against him, including House Speaker Paul Ryan (R-WI). So, that has made Trump lash out as his own party.Gingrich said on Fox News: Look, first of all, let me just say about Trump, who I admire and I ve tried to help as much as I can. There s a big Trump and a little Trump. The little Trump is frankly pathetic. I mean, he s mad over not getting a phone call? Trump s referr

In [23]:
# (rows, columns) of the loaded frame.
df.shape

(9900, 2)

In [25]:
# Check the class balance: number of rows per label.
df["label"].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [26]:
# Encode the string labels as integers for the classifier.
label_to_id = {"Fake": 0, "Real": 1}
df["label_encoded"] = df["label"].map(label_to_id)

# Series.map yields NaN for any value that is missing from the mapping, which
# would silently turn the column into floats and only break later in the
# split/fit cells. Fail here instead, naming the offending values.
unmapped = df.loc[df["label_encoded"].isna(), "label"].unique()
assert len(unmapped) == 0, f"Unmapped label values: {unmapped!r}"

df.head()

Unnamed: 0,Text,label,label_encoded
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [28]:
# NOTE(review): this repeats the import/load cell near the top of the notebook.
# Reloading the large model is only needed when the kernel is started from
# this section; on a top-to-bottom run `nlp` already exists.
import spacy

nlp = spacy.load("en_core_web_lg")

In [29]:
# Build one document-level vector per article. nlp.pipe streams the texts
# through the pipeline in batches, which avoids the per-call overhead of
# invoking nlp() separately for each of the rows; the vectors come back in
# the same order as the input texts.
df["Vector"] = [doc.vector for doc in nlp.pipe(df["Text"])]

In [30]:
# Confirm the new "Vector" column holds one numeric array per row.
df.head()

Unnamed: 0,Text,label,label_encoded,Vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.6759837, 1.4263071, -2.318466, -0.451093, ..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-1.8355803, 1.3101058, -2.4919677, 1.0268308,..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-1.9851209, 0.14389805, -2.4221718, 0.9133005..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-2.7812982, -0.16120885, -1.609772, 1.3624227..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-2.2010763, 0.9961637, -2.4088492, 1.128273, ..."


In [31]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. random_state fixes the shuffle so
# the split is repeatable; stratify keeps the Fake/Real proportions the same
# in both splits. No test_size is given, so the library default applies
# (the recorded shapes show 7425 train / 2475 test rows).
X_train, X_test, y_train, y_test= train_test_split(
    df["Vector"],
    df["label_encoded"],
    random_state=42,
    stratify = df["label_encoded"]
)

In [35]:
import numpy as np
# Still 1-D: X_train is a Series whose elements are themselves vectors.
X_train.shape

(7425,)

In [36]:
# Stack the per-row vectors into one 2-D array per split (one row per sample),
# the layout the scaler and classifier below are fed.
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [37]:
# Inspect the stacked training matrix; note that it contains negative values.
X_train_2d

array([[-1.3324177 ,  1.0344796 , -1.8901595 , ..., -0.23376821,
        -0.99475974,  0.35123652],
       [-2.2178693 ,  0.67798436, -1.6107789 , ..., -1.5525125 ,
        -2.2300599 ,  1.056079  ],
       [-1.7882888 ,  1.308859  , -2.4999998 , ..., -0.45123178,
        -2.4277353 ,  1.0917853 ],
       ...,
       [-1.1563656 ,  0.5925829 , -2.0635247 , ..., -0.38524142,
        -3.147508  ,  0.48102516],
       [-2.4181235 ,  0.46981514, -1.8537065 , ..., -1.5916448 ,
        -1.7628628 ,  0.7312937 ],
       [-1.4754695 ,  0.6491189 , -1.6219752 , ..., -0.7713741 ,
        -2.5889263 ,  0.66068506]], dtype=float32)

In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# The embeddings contain negative values (see the array displayed above), so
# they are rescaled before being passed to MultinomialNB.
# NOTE(review): MultinomialNB is documented as expecting non-negative
# features - confirm against the scikit-learn docs. It is designed for
# count-like data, so a model suited to dense real-valued features may be a
# more natural fit for embeddings.
scaler = MinMaxScaler()
# Fit the scaler on the training split only, then reuse it for the test split
# so no test-set statistics leak into training.
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf = MultinomialNB()
clf.fit(scaled_train_embed, y_train)

In [45]:
# Predict labels for the held-out split, using the test features scaled with
# the scaler that was fitted on the training data.
y_hat = clf.predict(scaled_test_embed)

In [47]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split. Classes 0 and 1
# correspond to "Fake" and "Real" per the label encoding defined earlier.
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1250
           1       0.94      0.96      0.95      1225

    accuracy                           0.95      2475
   macro avg       0.95      0.95      0.95      2475
weighted avg       0.95      0.95      0.95      2475

