# NLP Feature Engineering

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = [[29.0, 0, 0, 211.3375, 'female', 1],
       [0.9167, 1, 2, 151.55, 'male', 1],
       [2.0, 1, 2, 151.55, 'female', 0],
       [30.0, 1, 2, 151.55, 'male', 0],
       [25.0, 1, 2, 151.55, 'female', 0],
       [48.0, 0, 0, 26.55, 'male', 1],
       [63.0, 1, 0, 77.9583, 'female', 1],
       [39.0, 0, 0, 0.0, 'male', 0],
       [53.0, 2, 0, 51.4792, 'female', 1],
       [71.0, 0, 0, 49.5042, 'male', 0],
       [47.0, 1, 0, 227.525, 'male', 0],
       [18.0, 1, 0, 227.525, 'female', 1],
       [24.0, 0, 0, 69.3, 'female', 1],
       [26.0, 0, 0, 78.85, 'female', 1],
       [80.0, 0, 0, 30.0, 'male', 1]]

df1 = pd.DataFrame(data, columns=['feature 1', 'feature 2', 'feature 3', 'feature 4', 'feature 5', 'label'])

`pd.get_dummies()` with k-1 encoding (use `drop_first=True`); for a two-category column this produces the same single 0/1 column as `sklearn.preprocessing.LabelEncoder`

In [6]:
pd.get_dummies(df1, columns=['feature 5'], drop_first=True)

Unnamed: 0,feature 1,feature 2,feature 3,feature 4,label,feature 5_male
0,29.0,0,0,211.3375,1,0
1,0.9167,1,2,151.55,1,1
2,2.0,1,2,151.55,0,0
3,30.0,1,2,151.55,0,1
4,25.0,1,2,151.55,0,0
5,48.0,0,0,26.55,1,1
6,63.0,1,0,77.9583,1,0
7,39.0,0,0,0.0,0,1
8,53.0,2,0,51.4792,1,0
9,71.0,0,0,49.5042,0,1


In [3]:
tweet = pd.read_csv('../dataset/vaccination_tweets.csv')

Character count

In [14]:
# 1
tweet['text'].apply(len)

0        97
1       140
2       140
3       140
4       135
       ... 
9441     77
9442    140
9443    140
9444    140
9445    110
Name: text, Length: 9446, dtype: int64

In [15]:
# 2
tweet['text'].str.len()

0        97
1       140
2       140
3       140
4       135
       ... 
9441     77
9442    140
9443    140
9444    140
9445    110
Name: text, Length: 9446, dtype: int64

Word count

In [21]:
#1
tweet['text'].str.split(' ').str.len()

0       12
1       21
2       15
3       20
4       14
        ..
9441    11
9442    16
9443    15
9444    17
9445    11
Name: text, Length: 9446, dtype: int64

In [8]:
#2
def count_words(sent: str) -> int:
    """Return the number of pieces obtained by cutting ``sent`` at each single space.

    Only the literal space character acts as a separator, so consecutive
    spaces produce empty pieces that are counted too, and an empty string
    yields 1.
    """
    return len(sent.split(' '))

tweet['text'].apply(count_words)

0       12
1       21
2       15
3       20
4       14
        ..
9441    11
9442    16
9443    15
9444    17
9445    11
Name: text, Length: 9446, dtype: int64

Hashtag

In [16]:
def _count_prefixed(sent: str, prefix: str) -> int:
    """Count the whitespace-separated tokens of ``sent`` that begin with ``prefix``."""
    # split() without an argument breaks on any run of whitespace, so a tag
    # that follows a newline or tab is still seen as its own token.
    return sum(1 for word in sent.split() if word.startswith(prefix))


def count_hastag(sent: str) -> int:
    """Return the number of tokens in ``sent`` that start with ``#``."""
    return _count_prefixed(sent, '#')


def count_tag(sent: str) -> int:
    """Return the number of tokens in ``sent`` that start with ``@``."""
    return _count_prefixed(sent, '@')

tweet['text'].apply(count_hastag)[:5]

0    1
1    0
2    6
3    0
4    2
Name: text, dtype: int64

In [18]:
tweet['text'].apply(count_tag)[:5]

0    0
1    0
2    0
3    0
4    2
Name: text, dtype: int64

In [2]:
import textstat

In [3]:
text = 'In multilabel learning, the joint set of binary classification tasks is expressed with a label binary indicator array: each sample is one row of a 2d array of shape (n_samples, n_classes) with binary values where the one, i.e. the non zero elements, corresponds to the subset of labels for that sample. An array such as np.array([[1, 0, 0], [0, 1, 1], [0, 0, 0]]) represents label 0 in the first sample, labels 1 and 2 in the second sample, and no labels in the third sample.'

In [7]:
flesh_score = textstat.flesch_reading_ease(text)
gunn_score = textstat.gunning_fog(text)

In [9]:
flesh_score

36.29

In [8]:
gunn_score

20.46

## Tokenization & Lemmatization

In [50]:
import spacy
# Read the whole article into one string. The encoding is pinned so the result
# does not depend on the platform's default locale.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open('../dataset/nlp_ner_article.txt', 'r', encoding='utf-8') as f:
    sentences = f.read()

In [51]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(sentences)

In [14]:
for token in doc[:10]:
    print(token.text)

The
taxi
-
hailing
company
Uber
brings
into
very
sharp


In [15]:
for token in doc[:10]:
    print(token.lemma_)

the
taxi
-
hail
company
Uber
bring
into
very
sharp


In [16]:
en_stop = spacy.lang.en.stop_words.STOP_WORDS

In [40]:
# Lemma of every token in the parsed document.
lemmas = [token.lemma_ for token in doc]
# Keep a lemma only if it is alphabetic AND not a stop word. With `or`, every
# alphabetic stop word and every non-stop-word punctuation mark passed the filter.
lemmas_a = [lemma for lemma in lemmas if lemma.isalpha() and lemma not in en_stop]

In [41]:
# Same filter in a single pass: a lemma is kept only when it is alphabetic AND
# not a stop word (`or` would let stop words and punctuation through).
lemmas_a_1 = [token.lemma_ for token in doc if token.lemma_.isalpha() and token.lemma_ not in en_stop]
# Position-preserving variant: rejected tokens are replaced by None instead of dropped.
lemmas_a_2 = [token.lemma_ if (token.lemma_.isalpha() and token.lemma_ not in en_stop) else None for token in doc]

## Part Of Speech (POS) tagging

In [54]:
pos = [(token.text, token.pos_) for token in doc]

In [55]:
print(pos[:10])

[('The', 'DET'), ('taxi', 'NOUN'), ('-', 'PUNCT'), ('hailing', 'VERB'), ('company', 'NOUN'), ('Uber', 'PROPN'), ('brings', 'VERB'), ('into', 'ADP'), ('very', 'ADV'), ('sharp', 'ADJ')]


In [46]:
def nouns(text: str, model=nlp) -> int:
    """Return how many tokens of ``text`` are tagged ``NOUN``.

    ``text`` is passed through ``model`` (by default the ``nlp`` object bound
    when this function is defined) and each resulting token's ``pos_``
    attribute is compared with the string "NOUN"; other tags are not counted.
    """
    # Single pass over the parsed tokens, tallying exact "NOUN" tags.
    return sum(1 for token in model(text) if token.pos_ == "NOUN")
print(nouns(sentences))

93


## Named Entity Recognition (NER)

In [52]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
Uber PERSON
Travis Kalanick of Uber FAC
Tim Cook PERSON
Apple ORG
Millions CARDINAL
Uber PERSON
Silicon Valley LOC
Yahoo ORG
Marissa Mayer PERSON
186 MONEY


In [57]:
persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
persons

['Uber', 'Tim Cook', 'Uber', 'Marissa Mayer']

## N-Gram , BoW model

In [117]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


# Bag-of-words vectorizer: case is preserved (lowercase=False), the built-in
# English stop-word list is applied, and both unigrams and bigrams are emitted.
# NOTE(review): with lowercase=False, capitalised stop words (e.g. "The") presumably
# do not match the lowercase stop list and stay in the vocabulary — confirm this is intended.
cv = CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,2))
# Multinomial Naive Bayes classifier, created with default settings.
clf = MultinomialNB()

# Load the anime table; the commented-out usecols would restrict it to name and synopsis.
synop = pd.read_csv('../dataset/Anime_Top10000.csv') #, usecols=['Anime_Name', 'Synopsis'])

In [161]:

synop.head()

Unnamed: 0,Anime_Name,Anime_Episodes,Anime_Air_Years,Anime_Rating,Synopsis,label
0,Fullmetal Alchemist: Brotherhood,TV (64 eps),Apr 2009 - Jul 2010,9.18,"""In order for something to be obtained, someth...",True
1,Shingeki no Kyojin Season 3 Part 2,TV (10 eps),Apr 2019 - Jul 2019,9.11,Seeking to restore humanity's diminishing hope...,True
2,Steins;Gate,TV (24 eps),Apr 2011 - Sep 2011,9.11,The self-proclaimed mad scientist Rintarou Oka...,True
3,Gintama°,TV (51 eps),Apr 2015 - Mar 2016,9.09,"Gintoki, Shinpachi, and Kagura return as the f...",True
4,Hunter x Hunter (2011),TV (148 eps),Oct 2011 - Sep 2014,9.08,Hunter x Hunter is set in a world where Hunter...,True


In [118]:
# Binary target: True where the rating is above the mean rating of the whole table.
synop['label'] = synop['Anime_Rating'] > synop['Anime_Rating'].mean()

In [119]:
bow = cv.fit_transform(synop['Synopsis'])

In [86]:
# Keep the bag-of-words matrix sparse: with unigrams + bigrams the vocabulary is
# large, and bow.toarray() would allocate a dense documents x vocabulary array.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement
# and fall back only on older releases.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
bow_df = pd.DataFrame.sparse.from_spmatrix(bow, columns=feature_names)

In [120]:
# Stratified split on the label. The shuffle seed is fixed so that the split,
# and therefore the train/test scores, are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    bow, synop['label'], stratify=synop['label'], random_state=42
)

In [121]:
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.9701333333333333

In [122]:
clf.score(X_test, y_test)

0.708

## TF-IDF Model

In [159]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances, linear_kernel

In [134]:
np.array([1,3]).dot(np.array([-2,2]))

4

In [135]:
np.dot(np.array([1,3]), np.array([-2,2]))

4

In [148]:
np.array([1,3]).reshape(1,-1)@np.array([[-2],[2]])

array([[4]])

In [209]:
tfidf_vec = TfidfVectorizer(stop_words='english', max_df=0.5)

In [210]:
tfidf = tfidf_vec.fit_transform(synop['Synopsis'])

In [211]:
cosin_sim = cosine_similarity(tfidf, tfidf)

In [212]:
cosin_dist = cosine_distances(tfidf, tfidf)

In [213]:
# Cosine dist = 1 - cosine sim
cosin_sim + cosin_dist

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

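Pairwise linear kernel: gives the same result as cosine similarity here (TF-IDF rows are L2-normalised by default, so the dot product already equals the cosine) but is `Faster`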

In [214]:
cosin_sim = linear_kernel(tfidf, tfidf)

In [215]:
# Title -> row index lookup. Duplicated titles are removed on the *index*:
# Series.drop_duplicates() only compares the values (the row numbers, which are
# already unique), so it never removed a repeated title. Keeping the first
# occurrence guarantees indices[title] is a single integer.
indices = pd.Series(data=synop.index, index=synop['Anime_Name'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendation(title: str, cosine_sim: np.ndarray, indices: pd.Series,
                       titles: pd.Series = None, top_n: int = 10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    cosine_sim : np.ndarray
        Square pairwise-similarity matrix; row ``i`` holds the scores of
        item ``i`` against every item.
    indices : pd.Series
        Maps a title to its integer row position in ``cosine_sim``.
    titles : pd.Series, optional
        Titles in the same row order as ``cosine_sim``. When omitted, the
        notebook-level ``synop['Anime_Name']`` is used.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pd.Series
        Up to ``top_n`` titles, most similar first, never including the
        queried row itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if titles is None:
        titles = synop['Anime_Name']

    idx = indices[title]
    # A repeated title makes the lookup return a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: ties keep their original row order.
    order = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly instead of assuming it sorts first;
    # another row with an identical score could otherwise push it into the results.
    order = order[order != idx][:top_n]
    return titles.iloc[order]

In [216]:
get_recommendation('Hunter x Hunter (2011)', cosin_sim, indices)

149                               Hunter x Hunter
8266                              Grick no Bouken
296                 Hunter x Hunter: Greed Island
5290                                Dagashi Kashi
2601       Hunter x Hunter Movie 1: Phantom Rouge
197     Hunter x Hunter: Original Video Animation
9048           Chouon Senshi Borgman: Lovers Rain
9051    Digimon Adventure 3D: Digimon Grand Prix!
2379                                  Angel Heart
2602    Hunter x Hunter Movie 2: The Last Mission
Name: Anime_Name, dtype: object

In [217]:
# TF-IDF weights of the first document, labelled by vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# when it exists and fall back only on older releases.
vocab = (tfidf_vec.get_feature_names_out()
         if hasattr(tfidf_vec, 'get_feature_names_out')
         else tfidf_vec.get_feature_names())
movies0_tfidf_label = pd.Series(data=tfidf[0].toarray().ravel(), index=vocab)

In [218]:
movies0_tfidf_label.sort_values(ascending=False)

edward         0.406860
alphonse       0.352760
alchemist      0.248772
brothers       0.188060
philosopher    0.182620
                 ...   
gloomy         0.000000
gloria         0.000000
glories        0.000000
glorious       0.000000
黄色いしあわせ        0.000000
Length: 37827, dtype: float64

# Word embeddings (Word2Vec)

Word embeddings encode meaning into vector distance.  
So for sentences of the same length such as 'I am happy' and 'I am sad', TF-IDF shows close similarity, whereas embeddings will show lower / inverse similarity.  

!python -m spacy download en_core_web_md

In [3]:
import spacy

nlp = spacy.load('en_core_web_md')

In [None]:
# NOTE(review): this cell has no execution count, and the output saved under the
# next cell lists the tokens I / like / apples / and / oranges, so that output was
# produced from a different sentence. Re-run both cells to refresh it.
sent = 'I like and hate durian at the sametime'

In [2]:
# Parse the sentence and print the similarity of every ordered pair of tokens,
# including each token paired with itself.
# Note: this rebinds `doc`, replacing the article parsed earlier in the notebook.
doc = nlp(sent)
for token1 in doc:
    for token2 in doc:
        print(token1.text, token2.text, token1.similarity(token2))

I I 1.0
I like 0.5554912
I apples 0.20442726
I and 0.31607857
I oranges 0.18824081
like I 0.5554912
like like 1.0
like apples 0.32987142
like and 0.5267484
like oranges 0.2771747
apples I 0.20442726
apples like 0.32987142
apples apples 1.0
apples and 0.24097733
apples oranges 0.77809423
and I 0.31607857
and like 0.5267484
and apples 0.24097733
and and 1.0
and oranges 0.19245948
oranges I 0.18824081
oranges like 0.2771747
oranges apples 0.77809423
oranges and 0.19245948
oranges oranges 1.0
