### Text Classification With Machine Learning and SpaCy
+ Text categorization / text classification is the task of assigning predefined categories to documents.
+ Sentiment Analysis
+ Multilabel classification
+ + DataSet source http://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

##### Aim is to classify reviews into positive or negative review


In [1]:
# Load EDA packages
import pandas as pd

In [2]:
# Load our dataset
df_yelp = pd.read_table('yelp_labelled.txt')
df_imdb = pd.read_table('imdb_labelled.txt')
df_amz = pd.read_table('amazon_cells_labelled.txt')


In [3]:
# Concatenate our Datasets
frames = [df_yelp,df_imdb,df_amz]

In [4]:
# Renaming Column Headers
for colname in frames:
    colname.columns = ["Message","Target"]

In [5]:
# Column names
for colname in frames:
    print(colname.columns)

Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')


In [6]:
# Assign a Key to Make it Easier
keys = ['Yelp','IMDB','Amazon']

In [7]:
# Merge or Concat our Datasets
df = pd.concat(frames,keys=keys)

In [8]:
# Length and Shape 
df.shape

(2745, 2)

In [9]:
df.head()

Unnamed: 0,Unnamed: 1,Message,Target
Yelp,0,Crust is not good.,0
Yelp,1,Not tasty and the texture was just nasty.,0
Yelp,2,Stopped by during the late May bank holiday of...,1
Yelp,3,The selection on the menu was great and so wer...,1
Yelp,4,Now I am getting angry and I want my damn pho.,0


In [10]:
df.to_csv("sentimentdataset.csv")

In [11]:
# Data Cleaning
df.columns

Index(['Message', 'Target'], dtype='object')

In [12]:
# Checking for Missing Values
df.isnull().sum()

Message    0
Target     0
dtype: int64

###  Working with SpaCy
+ Removing Stopwords
+ Lemmatizing

In [13]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')

OSError: [E941] Can't find model 'en'. It looks like you're trying to load a model from a shortcut, which is deprecated as of spaCy v3.0. To load the model, use its full name instead:

nlp = spacy.load("en_core_web_sm")

For more details on the available models, see the models directory: https://spacy.io/models. If you want to create a blank model, use spacy.blank: nlp = spacy.blank("en")

In [None]:
# Build a list of stopwords to use to filter
stopwords = list(STOP_WORDS)

In [None]:
stopwords

##### Getting Lemma and Stop words

In [None]:
docx = nlp("This is how John Walker was walking. He was also running beside the lawn.")

In [None]:
# Lemmatizing of tokens
for word in docx:
    print(word.text,"Lemma =>",word.lemma_)
    

In [None]:
# Lemma that are not pronouns
for word in docx:
    if word.lemma_ != "-PRON-":
        print(word.lemma_.lower().strip())

In [None]:
# List Comprehensions of our Lemma
[word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in docx]

In [None]:
# Filtering out Stopwords and Punctuations
for word in docx:
    if word.is_stop == False and not word.is_punct:
#     if word.is_stop != True and not word.is_punct:
        print(word)

In [None]:
# Stop words and Punctuation In List Comprehension
[ word for word in docx if word.is_stop == False and not word.is_punct ]

In [None]:
# Use the punctuations of string module
import string
punctuations = string.punctuation

In [None]:
# Creating a Spacy Parser
from spacy.lang.en import English
parser = English()

In [None]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

#### Machine Learning With SKlearn

In [None]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [None]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
classifier = LinearSVC()

In [None]:
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [None]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

In [None]:
# Features and Labels
X = df['Message']
ylabels = df['Target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [None]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [None]:
# Fit our data
pipe.fit(X_train,y_train)

In [None]:
# Predicting with a test dataset
sample_prediction = pipe.predict(X_test)

In [None]:
# Prediction Results
# 1 = Positive review
# 0 = Negative review
for (sample,pred) in zip(X_test,sample_prediction):
    print(sample,"Prediction=>",pred)

In [None]:
# Accuracy
print("Accuracy: ",pipe.score(X_test,y_test))
print("Accuracy: ",pipe.score(X_test,sample_prediction))

In [None]:
# Accuracy
print("Accuracy: ",pipe.score(X_train,y_train))


In [None]:
# Another random review
pipe.predict(["This was a great movie"])

In [None]:
example = ["I do enjoy my job",
 "What a poor product!,I will have to get a new one",
 "I feel amazing!"]
       

In [None]:
pipe.predict(example)

In [None]:
#### Using Tfid

In [None]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe_tfid = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier)])

In [None]:
pipe_tfid.fit(X_train,y_train)

In [None]:
sample_prediction1 = pipe_tfid.predict(X_test)

In [None]:
for (sample,pred) in zip(X_test,sample_prediction1):
    print(sample,"Prediction=>", pred)

In [None]:
print("Accuracy: ",pipe_tfid.score(X_test,y_test))
print("Accuracy: ",pipe_tfid.score(X_test,sample_prediction1))

In [None]:
### Jesse JCharis
### J-Secur1ty
### Jesus Saves @ JCharisTech