# Text Classification using word vectors

In [4]:
import pandas as pd 
df = pd.read_csv("Fake_Real_Data.csv")
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [5]:
df.shape

(9900, 2)

In [7]:
df.size

19800

In [9]:
df['label'].value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

In [10]:
df["label_num"] = df["label"].map({
    'Fake':0,
    'Real':1
})

In [11]:
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [12]:
import spacy
nlp=spacy.load("en_core_web_lg")

In [13]:
doc=nlp("top companies of tech is in california")
doc.vector.shape

(300,)

##### simillarly we need to generate vector for all rows of text present if dataframe 
##### we will add one new column which will store the vector of that particular row

In [15]:
df['vector']=df['Text'].apply(lambda x: nlp(x).vector)

In [16]:
df.head()

Unnamed: 0,Text,label,label_num,vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.6759837, 1.4263071, -2.318466, -0.451093, ..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-1.8355803, 1.3101058, -2.4919677, 1.0268308,..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-1.9851209, 0.14389805, -2.4221718, 0.9133005..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-2.7812982, -0.16120885, -1.609772, 1.3624227..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-2.2010763, 0.9961637, -2.4088492, 1.128273, ..."


In [49]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(
df.vector.values,
df.label_num, 
test_size=0.2,
# stratify=df.label_num,
random_state=2023)

In [50]:
X_train
#when we are giving it to our model for training it expecting 2D array but it is 
#not in 2d array here array inside array exist 
# we need to convert it to 2d arrray

array([array([-1.5668225 ,  0.7193005 , -1.3984917 ,  0.52727246,  5.070563  ,
               0.3672485 ,  0.15668513,  2.649323  , -0.23730232, -1.808466  ,
               5.616688  ,  1.5052005 , -3.0761356 ,  0.37266147,  0.37555665,
               2.157936  ,  1.5231117 , -0.05031684, -1.1941004 , -1.1107302 ,
               1.4914346 , -1.564488  , -1.2656869 , -0.06108946,  0.47364494,
              -1.2051016 , -1.7416915 , -0.5384246 , -0.30893984,  0.5933381 ,
               0.64572483, -0.24970052, -0.9036851 , -1.942812  , -2.723214  ,
              -1.0034223 , -0.6274632 ,  0.4470563 ,  0.59023315,  0.5031633 ,
               0.72697556,  0.09279276, -0.12896812, -0.01897464, -2.098799  ,
               0.675709  ,  0.6316914 , -1.7015005 , -1.3026391 ,  1.5715567 ,
              -1.508045  ,  1.3969685 ,  0.48237985, -4.663938  , -0.4597945 ,
               0.5374707 , -0.25118768,  0.43868366,  0.80715424, -0.5495164 ,
               0.65371925, -1.3936385 ,  0.09246314,

* converting to 2d numpy array
* function np.stack convert it to 2d array when data is in array is it self an array

In [51]:
import numpy as np
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

#### applying naive_bayes Classifier

In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
# we  have some data in negative scale also so, model will not accept negative scale data
# so , for that reason we are applying minmax scaler 
scaler =MinMaxScaler()

scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.fit_transform(X_test_2d)

clf = MultinomialNB()
clf.fit(scaled_train_embed,y_train)

In [53]:
y_pred=clf.predict(scaled_test_embed)

In [54]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95      1004
           1       0.97      0.93      0.95       976

    accuracy                           0.95      1980
   macro avg       0.95      0.95      0.95      1980
weighted avg       0.95      0.95      0.95      1980



#### applying KNN Classifier

In [57]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()
clf.fit(scaled_train_embed,y_train)
y_pred=clf.predict(scaled_test_embed)
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1004
           1       0.99      0.99      0.99       976

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



**conclusion : with KNN classifier we are getting almost 1 precision and recall that is best performance 
    but earlier when we were trying on tf-idf or BOW or n-gram we were getting horible result 
    because KNN has problem with more dimensions here dimensions reduced and hence we are getting best result**