In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [13]:
doc = nlp("Apple, This is first sentence. Google and second one. here 3rd one.")

In [7]:
for sent in doc.sents:
    print(sent)

This is first sentence.
and second one.
here 3rd one.


In [8]:
from spacy.lang.en.stop_words import STOP_WORDS

In [11]:
stop_words =list(STOP_WORDS)

print(stop_words)

['eleven', 'while', 'became', 'say', 'from', 'around', 'another', 'latterly', 'between', 'ca', 'does', '‘s', 'should', 'nobody', 'noone', 'ours', 'the', 'herself', 'there', 'part', 'she', 'yourselves', 'my', 'serious', 'fifty', 'would', 'been', 'against', 'call', 'everything', 're', 'always', 'herein', 'which', 'most', 'amount', 'whence', 'every', 'too', 'thence', 'forty', 'to', 'top', 'seems', 'no', "'m", 'again', 'until', 'yourself', 'anyhow', 'side', 'amongst', 'moreover', 'same', 'themselves', 'five', 'whom', 'us', 'thus', '‘m', 'its', 'under', 'beforehand', 'anywhere', 'a', 'all', 'fifteen', 'once', 'and', 'had', 'anyway', 'about', 'back', 'than', 'six', 'almost', 'am', 'seeming', 'their', 'see', 'will', 'n’t', '’ve', 'besides', 'was', 'nevertheless', 'who', 'i', 'how', 'least', 'by', 'three', 'via', 'several', 'used', 'thru', 'into', 'with', 'four', 'what', 'throughout', 'well', 'becoming', 'keep', 'hers', 'might', 'first', 'not', 'put', 'up', '‘ll', 'afterwards', 'myself', '’s',

In [12]:
len(stop_words)

326

In [14]:
for token in doc:
    if token.is_stop == False:
        print(token)

Apple
,
sentence
.
Google
second
.
3rd
.


### Lemmatization

In [19]:
doc =nlp("run runs runner running")

In [20]:
for lemma in doc:
    print(lemma.text, lemma.lemma_) 

run run
runs run
runner runner
running run


### POS

In [24]:
doc =nlp("the truth is I am Ironman")

In [25]:
for token in doc:
    print(token.text, token.pos_) 

the DET
truth NOUN
is AUX
I PRON
am AUX
Ironman PROPN


In [30]:
displacy.render(doc, style="dep", options={"distance":125})

### Entity Detection

In [59]:
doc =nlp("A recent announcement by the University of Oxford that researchers Alex there had started testing a vaccine against the novel coronavirus disease (Covid-19) has raised hopes. Over the last few weeks, there have also been somewhat conflicting reports about the performance of a drug candidate, remdesivir, while Israel has announced a breakthrough in another possible line of treatment, that with antibodies.")

In [60]:
doc

A recent announcement by the University of Oxford that researchers Alex there had started testing a vaccine against the novel coronavirus disease (Covid-19) has raised hopes. Over the last few weeks, there have also been somewhat conflicting reports about the performance of a drug candidate, remdesivir, while Israel has announced a breakthrough in another possible line of treatment, that with antibodies.

In [61]:
displacy.render(doc, style="ent", options={"distance":125})

### Text classification

In [62]:
import pandas as pd

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [73]:
columns_name =["Review","Sentiment"]

In [80]:
df_imdb = pd.read_csv('imdb_labelled.txt', sep="\t", header= None,  names=columns_name)
df_amz = pd.read_csv('amazon_cells_labelled.txt' , sep="\t", header= None, names=columns_name)
df_yelp = pd.read_csv('yelp_labelled.txt' , sep="\t", header= None, names=columns_name)

In [81]:
df_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [83]:
df_yelp.shape, df_amz.shape , df_imdb.shape

((1000, 2), (1000, 2), (748, 2))

In [84]:
data = df_yelp.append([df_amz, df_imdb], ignore_index=True)
data.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [85]:
data.shape

(2748, 2)

In [86]:
data["Sentiment"].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [87]:
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

### Tokenization

In [88]:
import string

In [89]:
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [98]:
def text_data_cleaning(sentence):
    doc =nlp(sentence)
    
    tokens=[]
    for token in doc:
        if token.lemma_ !="-PRON-":
            temp =token.lemma_.lower().strip()
        else:
            temp =token.lower_
        tokens.append(temp)
        
        
    cleaned_tokens =[]
    for token in tokens:
        if token not in STOP_WORDS and token not in punct:
            cleaned_tokens.append(token)
        return cleaned_tokens

In [101]:
text_data_cleaning("hello how are you like this video")

['hello']

### Vectorization feature Engineering(TF-IDF)

In [102]:
from sklearn.svm import LinearSVC

In [104]:
tfidf =TfidfVectorizer(tokenizer= text_data_cleaning)
classifier = LinearSVC()

In [108]:
X =data["Review"]
y =data["Sentiment"]

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [110]:
X_train.shape,y_train.shape

((2198,), (2198,))

In [111]:
X_test.shape,y_test.shape

((550,), (550,))

In [112]:
clf = Pipeline([("tfidf",tfidf),("clf",classifier)])

In [113]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function text_data_cleaning at 0x0000026201BF9598>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_inter

In [114]:
y_pred =clf.predict(X_test)

In [116]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.55      0.98      0.70       285
           1       0.86      0.14      0.23       265

    accuracy                           0.57       550
   macro avg       0.70      0.56      0.47       550
weighted avg       0.70      0.57      0.48       550



In [117]:
confusion_matrix(y_test,y_pred)

array([[279,   6],
       [229,  36]], dtype=int64)

In [118]:
clf.predict(["good product"])

array([1], dtype=int64)

In [119]:
clf.predict(["bad product"])

array([0], dtype=int64)

In [122]:
clf.predict(["i don't like product"])

array([0], dtype=int64)

In [126]:
clf.predict(["best product"])

array([1], dtype=int64)