In [1]:
import spacy
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

In [2]:
# Load the Pre-Trained model
nlp = spacy.load('en_core_web_sm')

# Make a sequential palette that blends from light to color
cm = sns.light_palette('#2ecc71', as_cmap=True)

In [3]:
words = ['cat', 'dog', 'car', 'bird', 'eagle']
# Transform each word to a vector
vectors =  [nlp(word).vector for word in words]
# find similarities across all words
similarities = cosine_similarity(vectors, vectors)
pd.DataFrame(similarities, columns=words, index=words).style.background_gradient(cmap=cm)

Unnamed: 0,cat,dog,car,bird,eagle
cat,1.0,0.654956,0.614196,0.564566,0.678033
dog,0.654956,1.0,0.55902,0.507874,0.585649
car,0.614196,0.55902,1.0,0.706714,0.419438
bird,0.564566,0.507874,0.706714,1.0,0.514435
eagle,0.678033,0.585649,0.419438,0.514435,1.0


In [4]:
# Each word is represented as a vector with a length of 96
vec = nlp('cat').vector
vec.shape

(96,)

### Build a classifier for news groups

In [6]:
import numpy as np
from tqdm.auto import tqdm
from sklearn.datasets import fetch_20newsgroups
# from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [9]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics']

In [10]:
X_train, y_train = fetch_20newsgroups(categories=categories, remove=('header', 'footers', 'quotes'), return_X_y=True)
X_test, y_test = fetch_20newsgroups(categories=categories, remove=('header', 'footers', 'quotes'), return_X_y=True, subset='test')

In [11]:
X_train_v = np.zeros((len(X_train), 96))
X_test_v = np.zeros((len(X_test), 96))

In [12]:
# Represent training data as vectors
for i, sent in tqdm(enumerate(nlp.pipe(X_train)), total=len(X_train)):
  X_train_v[i, :] = sent.vector
# REpresenrs testing data as vectors
for i, sent in tqdm(enumerate(nlp.pipe(X_test)), total=len(X_test)):
  X_test_v[i, :] = sent.vector

  0%|          | 0/1663 [00:00<?, ?it/s]

  0%|          | 0/1106 [00:00<?, ?it/s]

In [13]:
# Build and train our model
clf = LinearSVC()
clf.fit(X_train_v, y_train)
predictions =  clf.predict(X_test_v)
print(classification_report(y_test, predictions, target_names=categories))

                        precision    recall  f1-score   support

           alt.atheism       0.65      0.46      0.54       319
soc.religion.christian       0.75      0.79      0.77       389
         comp.graphics       0.66      0.77      0.71       398

              accuracy                           0.69      1106
             macro avg       0.68      0.67      0.67      1106
          weighted avg       0.69      0.69      0.68      1106



