In [1]:
# Word embedding techniques:
# Based on CBOW and skip-gram: Word2vec, fastText
# Based on global word co-occurrence statistics: GloVe
# Based on transformer architecture: BERT, GPT
# Based on LSTM: ELMo

In [11]:
import numpy as np
import pandas as pd
import spacy
import gensim
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# One-time setup: uncomment the next line to download the pipeline package.
#!python3 -m spacy download en_core_web_lg

# Load the spaCy pipeline by package name; later cells use it through `nlp`.
nlp = spacy.load(name="en_core_web_lg")

In [17]:
# Run four whitespace-separated words through the pipeline. The last one,
# "kem", is reported below as out-of-vocabulary with no vector.
doc = nlp("dog cat banana kem")

# One line per token: its text, whether it has a vector, and whether it is OOV.
for token in doc:
    line = f"{token.text}, Vector:{token.has_vector} , OOV:{token.is_oov}"
    print(line)

dog, Vector:True , OOV:False
cat, Vector:True , OOV:False
banana, Vector:True , OOV:False
kem, Vector:False , OOV:True


In [18]:
# Shape of a single token embedding, taken from the first token of `doc`.
first_vector = doc[0].vector
first_vector.shape

(300,)

In [19]:
# Reference word to compare every token of `doc` against. Despite its name,
# `base_token` is whatever `nlp(...)` returns for the one-word text "apple";
# the name is kept because a later cell reads `base_token.vector`.
base_token = nlp("apple")

for token in doc:
    # A token without a vector (here the out-of-vocabulary "kem") has nothing
    # meaningful to compare. The original run printed 0.0 for it and left what
    # looks like the tail of a spaCy empty-vector warning in the output, so
    # report 0.0 directly and skip the similarity call for such tokens.
    score = token.similarity(base_token) if token.has_vector else 0.0
    print(f"{token.text}-{base_token.text}:{score}")

dog-apple:0.22881005140483499
cat-apple:0.20368060357742446
banana-apple:0.6646700599790674
kem-apple:0.0


  print(f"{token.text}-{base_token.text}:{token.similarity(base_token)}")


In [20]:
# Pairwise cosine similarity between all token vectors of `doc`.
# The displayed result has an all-zero row and column for the last token,
# the one reported earlier as having no vector.
token_vectors = [tok.vector for tok in doc]
cosine_similarity(token_vectors)

array([[1.0000001 , 0.8220816 , 0.20909055, 0.        ],
       [0.8220816 , 0.9999999 , 0.22358824, 0.        ],
       [0.20909055, 0.22358824, 0.99999994, 0.        ],
       [0.        , 0.        , 0.        , 0.        ]], dtype=float32)

In [21]:
# Vector of the reference word ("apple") ...
word_vector = base_token.vector

# ... and the vectors of the individual tokens in `doc`, in token order.
doc_vector = [tok.vector for tok in doc]

In [22]:
# Cosine similarity of every token vector to the reference vector, computed in
# a single call. Passing the reference as the second argument yields one
# column with a row per token, instead of building a separate 2x2 matrix for
# each token just to read one off-diagonal entry.
similarity_column = cosine_similarity(doc_vector, [word_vector])[:, 0]

# Keep the result as a plain list, one score per token in `doc` order.
cosine_similarities = list(similarity_column)
cosine_similarities

[0.22881007, 0.20368066, 0.66467005, 0.0]

In [34]:
# Compare the vocabulary vectors of two opposite-meaning words.
# The displayed off-diagonal value (~0.74) shows they still score as similar.
bad_vec, good_vec = (nlp.vocab[word].vector for word in ("bad", "good"))
cosine_similarity([bad_vec, good_vec])

array([[0.9999997, 0.739189 ],
       [0.739189 , 0.9999996]], dtype=float32)

# Text classification using spaCy document vectors

In [3]:
# Load the dataset; the code below reads its "Text" and "label" columns.
df = pd.read_csv("Fake_Real_Data.txt")

# Binary target: 1 where the label is "Real", 0 for any other value.
# A vectorized comparison replaces the per-row Python lambda.
df["label_num"] = (df["label"] == "Real").astype("int64")

# Embed each text as its spaCy document vector. nlp.pipe streams the texts
# through the pipeline in batches rather than invoking nlp(text) once per row;
# the per-document vectors are expected to be the same as before.
text_docs = nlp.pipe(df["Text"])
df["Text_vector"] = pd.Series(
    [text_doc.vector for text_doc in text_docs],
    index=df.index,  # align explicitly in case the frame's index is not 0..n-1
)

In [5]:
# Persist the enriched frame (original columns plus label_num and Text_vector).
# NOTE(review): Text_vector cells hold arrays, which CSV can only store as
# text; confirm whether this file is meant to be read back as numeric vectors.
enriched_csv_path = "Fake_Real_Data_new.txt"
df.to_csv(enriched_csv_path)

In [7]:
# Hold out a test split of the document vectors. Stratifying on the numeric
# label keeps class proportions equal across both splits, and the fixed seed
# makes the split repeatable. No test_size is passed, so the library default
# split proportion applies.
x_train, x_test, y_train, y_test = train_test_split(
    df["Text_vector"],
    df["label_num"],
    random_state=120,
    stratify=df["label_num"],
)

In [8]:
# Each element of x_train / x_test is one vector; stack them into 2-D arrays
# with one row per sample so the estimators can consume them.
x_train_2d = np.stack(x_train)
x_test_2d = np.stack(x_test)

# Fit the min-max scaler on the training rows only, then apply that same
# fitted scaling to both splits so no test-set statistics leak into training.
# NOTE(review): presumably scaling is needed because MultinomialNB (used in a
# later cell) does not accept negative feature values - confirm.
scaler = MinMaxScaler()
scaler.fit(x_train_2d)
x_train_scale = scaler.transform(x_train_2d)
x_test_scale = scaler.transform(x_test_2d)

In [10]:
# Baseline classifier: multinomial naive Bayes on the scaled document vectors.
# fit() returns the estimator itself, so construction and training chain.
model = MultinomialNB().fit(x_train_scale, y_train)

# Predict the held-out split and print per-class precision / recall / F1.
ypred = model.predict(x_test_scale)
nb_report = classification_report(y_test, ypred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1250
           1       0.94      0.95      0.95      1225

    accuracy                           0.95      2475
   macro avg       0.95      0.95      0.95      2475
weighted avg       0.95      0.95      0.95      2475



In [12]:
# Second classifier: k-nearest neighbours (k=10) on the same scaled vectors.
# This rebinds `model` and `ypred`, replacing the previous cell's values.
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X=x_train_scale, y=y_train)

# Predict the held-out split and print per-class precision / recall / F1.
ypred = model.predict(X=x_test_scale)
knn_report = classification_report(y_true=y_test, y_pred=ypred)
print(knn_report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1250
           1       0.99      0.99      0.99      1225

    accuracy                           0.99      2475
   macro avg       0.99      0.99      0.99      2475
weighted avg       0.99      0.99      0.99      2475



Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

