<a href="https://colab.research.google.com/github/olinyoder2534/NLP_practice/blob/main/WordVectorsSpacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import spacy

In [5]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
nlp = spacy.load("en_core_web_lg")

In [7]:
doc = nlp("Dog cat good great bad terrible walk run fast slow speedy famish")

In [8]:
for token in doc:
  print(token.text, '|', token.has_vector, '|', token.is_oov)

Dog | True | False
cat | True | False
good | True | False
great | True | False
bad | True | False
terrible | True | False
walk | True | False
run | True | False
fast | True | False
slow | True | False
speedy | True | False
famish | False | True


In [9]:
word = 'Crazy'
doc1 = nlp(word)

In [10]:
for token in doc:
  print("{}: {}: {}".format(token.text, doc1.text, token.similarity(doc1)))

Dog: Crazy: 0.4346887903139159
cat: Crazy: 0.09294428820371692
good: Crazy: 0.03709459280984394
great: Crazy: -0.010456866901729823
bad: Crazy: 0.153386282670883
terrible: Crazy: 0.03148786201781615
walk: Crazy: -0.04340720567471733
run: Crazy: -0.04556908471371835
fast: Crazy: 0.05295174389117186
slow: Crazy: 0.02988633969001308
speedy: Crazy: -0.07070519744084651
famish: Crazy: 0.0


  print("{}: {}: {}".format(token.text, doc1.text, token.similarity(doc1)))


In [11]:
def similarity(base_word, words_to_compare):
  base_token = nlp(base_word)
  doc = nlp(words_to_compare)

  for token in doc:
    print("{}: {}: {}".format(token.text, base_token.text, token.similarity(base_token)))

In [12]:
similarity('ball', 'ate played basketball round flat square')

ate: ball: -0.0715692959914552
played: ball: 0.2822695810138041
basketball: ball: 0.38524151911954396
round: ball: 0.41335365463054796
flat: ball: 0.2607245255256584
square: ball: 0.2321839869731484


In [13]:
king = nlp.vocab["king"].vector
man = nlp.vocab["man"].vector
woman = nlp.vocab["woman"].vector
queen = nlp.vocab["queen"].vector

result = king - man + woman

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([result], [queen])

array([[0.61780137]], dtype=float32)

In [23]:
import pandas as pd
import numpy as np

In [16]:
df = pd.read_csv('/content/Fake_Real_Data.csv')

In [17]:
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [18]:
df.shape

(9900, 2)

In [19]:
df['label'].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [20]:
df['label_num'] = df['label'].map({'Fake' : 0, 'Real': 1})
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [21]:
df['vector'] = df['Text'].apply(lambda text: nlp(text).vector)

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.vector.values,
    df.label_num,
    test_size=0.2,
    random_state=42
)

In [25]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [27]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

clf = Pipeline([
     ('scaler', MinMaxScaler()),
     ('Multi NB', MultinomialNB())
])

clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95       973
           1       0.94      0.96      0.95      1007

    accuracy                           0.95      1980
   macro avg       0.95      0.95      0.95      1980
weighted avg       0.95      0.95      0.95      1980



In [28]:
#trying KNN bc of lower dimensionality
from  sklearn.neighbors import KNeighborsClassifier

clf = Pipeline([
     ('KNN',  KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean'))
])

clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       973
           1       0.99      1.00      0.99      1007

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

clf = Pipeline([
     ('scaler', StandardScaler()),
     ('RF', RandomForestClassifier())
])

clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       973
           1       0.99      1.00      0.99      1007

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



In [49]:
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return ' '.join(filtered_tokens)

In [70]:
news = pd.read_json('/content/news_dataset.json')

In [71]:
news = news.head(1000)
news['category'].value_counts()

category
BUSINESS    328
SPORTS      319
CRIME       240
SCIENCE     113
Name: count, dtype: int64

In [78]:
news['preprocessed_txt'] = news['text'].apply(preprocess)

In [79]:
news.head()

Unnamed: 0,text,category,catNum,preprocessed_txt
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3,watch Schrödinger Cat Die University Californi...
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3,watch freaky Vortex open Flooded Lake
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0,entrepreneur today need Big Budget start year ...
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0,road recharge Electric Car drive high tech hig...
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2,civilian Guard Fires Gun protect Recruiting Ce...


In [80]:
news['category'].value_counts()

category
BUSINESS    328
SPORTS      319
CRIME       240
SCIENCE     113
Name: count, dtype: int64

In [81]:
news['catNum'] = news['category'].map({'BUSINESS' : 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3})
news.head()

Unnamed: 0,text,category,catNum,preprocessed_txt
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3,watch Schrödinger Cat Die University Californi...
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3,watch freaky Vortex open Flooded Lake
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0,entrepreneur today need Big Budget start year ...
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0,road recharge Electric Car drive high tech hig...
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2,civilian Guard Fires Gun protect Recruiting Ce...


In [82]:
news['vector'] = news['preprocessed_txt'].apply(lambda preprocessed_txt: nlp(preprocessed_txt).vector)

In [83]:
news.head()

Unnamed: 0,text,category,catNum,preprocessed_txt,vector
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3,watch Schrödinger Cat Die University Californi...,"[-0.85190785, 1.0438694, -0.9148885, -1.395817..."
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3,watch freaky Vortex open Flooded Lake,"[0.60747343, 1.9251899, -0.16949336, -0.573053..."
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0,entrepreneur today need Big Budget start year ...,"[0.088981755, 0.5882564, -1.2281352, -0.320762..."
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0,road recharge Electric Car drive high tech hig...,"[-1.0280653, 4.349204, -1.06896, -1.045683, 1...."
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2,civilian Guard Fires Gun protect Recruiting Ce...,"[-1.4220493, 0.9367255, -1.8070079, 3.1870718,..."


In [84]:
from imblearn.over_sampling import SMOTE

In [85]:
X = news['vector']
y = news['catNum']

In [86]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=news.catNum
)

In [87]:
X[0]

array([-0.85190785,  1.0438694 , -0.9148885 , -1.3958178 ,  1.484203  ,
        0.45393246,  1.5413097 ,  1.712773  , -0.3140326 , -0.36543056,
        3.002751  ,  0.16484107, -1.629855  ,  1.8627633 ,  1.9165852 ,
        0.40945596,  2.3969076 , -0.15216422, -0.6622616 ,  0.6162403 ,
       -0.9076868 ,  0.82945234, -1.3721215 ,  0.0711563 , -0.8391779 ,
        0.16487055, -2.9074738 , -0.03810368,  0.05046581,  1.4723557 ,
       -0.1867616 ,  2.0198634 , -0.12493279, -0.92121947, -1.1122949 ,
       -1.4551967 ,  1.14518   ,  1.1708809 ,  1.1491411 ,  0.649261  ,
       -1.2577931 , -0.7662145 ,  1.042321  ,  1.951568  , -0.92035455,
        0.8514947 ,  1.1972532 , -1.2999147 ,  0.51799184, -0.48446158,
       -0.62011164,  3.379827  , -0.06305154, -1.2626609 , -1.0141442 ,
        1.0211505 , -1.2194699 ,  0.20853116,  1.11354   ,  0.33777162,
        1.3333789 ,  0.04249209, -1.4196147 , -0.8116358 ,  2.205014  ,
        1.5702744 , -3.2042918 , -1.6763735 ,  0.5764552 ,  0.59

In [88]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [89]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_2d)
X_test_scaled = scaler.transform(X_test_2d)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [90]:
from sklearn.svm import SVC

clf = SVC()
clf.fit(X_train_resampled, y_train_resampled)

y_pred = clf.predict(X_test_scaled)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87        65
           1       0.88      0.89      0.88        64
           2       0.85      0.83      0.84        48
           3       0.83      0.65      0.73        23

    accuracy                           0.85       200
   macro avg       0.85      0.82      0.83       200
weighted avg       0.85      0.85      0.85       200



In [91]:
clf = RandomForestClassifier()
clf.fit(X_train_resampled, y_train_resampled)

y_pred = clf.predict(X_test_scaled)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85        65
           1       0.81      0.88      0.84        64
           2       0.78      0.83      0.81        48
           3       0.67      0.43      0.53        23

    accuracy                           0.81       200
   macro avg       0.78      0.75      0.76       200
weighted avg       0.80      0.81      0.80       200

