### Text Classification with Machine Learning,SpaCy and Scikit(Sentiment Analysis)

* Text categorization/text classification is the task of assigning predefined categories to document
* Sentiment Analysis
* Multilabel classification

In [1]:
#Load EDA packages
import pandas as pd


In [5]:
#Load our dataset
df_yelp=pd.read_table('/home/pandit/DataScience/NLP/NLP_Projects/sentiment labelled sentences/yelp_labelled.txt')
df_imdb=pd.read_table('/home/pandit/DataScience/NLP/NLP_Projects/sentiment labelled sentences/imdb_labelled.txt')
df_amz=pd.read_table('/home/pandit/DataScience/NLP/NLP_Projects/sentiment labelled sentences/amazon_cells_labelled.txt')

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [6]:
#Concatenate our Datasets
frames=[df_yelp,df_imdb,df_amz]

In [7]:
df_yelp.columns

Index(['Wow... Loved this place.', '1'], dtype='object')

In [8]:
#Renaming Column Headers
for colname in frames:
    print(colname.columns)

Index(['Wow... Loved this place.', '1'], dtype='object')
Index(['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ', '0'], dtype='object')
Index(['So there is no way for me to plug it in here in the US unless I go by a converter.', '0'], dtype='object')


In [9]:
#Assign a Key to Make it Easier
keys=['Yelp','IMDB','Amazon']

In [10]:
#Merge or Concat our Datasets
df=pd.concat(frames,keys=keys)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [11]:
df.shape

(2745, 5)

In [12]:
df.head()

Unnamed: 0,Unnamed: 1,0,1,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",So there is no way for me to plug it in here in the US unless I go by a converter.,Wow... Loved this place.
Yelp,0,,0.0,,,Crust is not good.
Yelp,1,,0.0,,,Not tasty and the texture was just nasty.
Yelp,2,,1.0,,,Stopped by during the late May bank holiday of...
Yelp,3,,1.0,,,The selection on the menu was great and so wer...
Yelp,4,,0.0,,,Now I am getting angry and I want my damn pho.


In [13]:
df.to_csv("sentimentdataset1.csv")

In [14]:
#Data Cleaning
df.columns

Index(['0', '1',
       'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
       'So there is no way for me to plug it in here in the US unless I go by a converter.',
       'Wow... Loved this place.'],
      dtype='object')

In [15]:
# Checking for Missing Values
df.isnull().sum()

0                                                                                           999
1                                                                                          1746
A very, very, very slow-moving, aimless movie about a distressed, drifting young man.      1998
So there is no way for me to plug it in here in the US unless I go by a converter.         1746
Wow... Loved this place.                                                                   1746
dtype: int64

#### Working with Spacy
* Removing Stopwords

* Lemmmatizing

In [16]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp=spacy.load('en')

In [17]:
# Build a list of stopwords to use to filter
stopwords=list(STOP_WORDS)

In [18]:
stopwords

['either',
 'below',
 'top',
 'besides',
 'say',
 'than',
 'both',
 'least',
 'anyway',
 'yet',
 'everything',
 'next',
 'first',
 'onto',
 'every',
 'into',
 'nothing',
 'they',
 'those',
 'themselves',
 'thereby',
 'back',
 'per',
 'since',
 'really',
 'seem',
 'sixty',
 'thence',
 'any',
 'also',
 'the',
 'each',
 'just',
 'whereafter',
 'eight',
 'among',
 'them',
 'might',
 'but',
 'down',
 'whenever',
 'anywhere',
 'give',
 'elsewhere',
 'hereupon',
 'made',
 'for',
 'quite',
 'whereupon',
 'some',
 'ourselves',
 'from',
 '’s',
 '’m',
 'put',
 'about',
 'else',
 'beside',
 '’ll',
 "'s",
 'without',
 'our',
 'whither',
 'same',
 'hers',
 'becoming',
 'herself',
 'everywhere',
 'perhaps',
 'whence',
 'at',
 'where',
 'being',
 'thus',
 'another',
 'to',
 'across',
 'whereby',
 "n't",
 'anyone',
 'few',
 'forty',
 'serious',
 'mostly',
 'or',
 'became',
 'most',
 'as',
 'indeed',
 'behind',
 'take',
 'almost',
 'been',
 'two',
 'above',
 'by',
 'nor',
 'we',
 'hereby',
 'well',
 'si

#### Getting Lemma and Stop words

In [19]:
docx=nlp("This is how John walker was walking. He was also running beside lawns")

In [20]:
#Lemmatizing of tokens
for word in docx:
    print(word.text,"Lemma=",word.lemma_)

This Lemma= this
is Lemma= be
how Lemma= how
John Lemma= John
walker Lemma= walker
was Lemma= be
walking Lemma= walk
. Lemma= .
He Lemma= -PRON-
was Lemma= be
also Lemma= also
running Lemma= run
beside Lemma= beside
lawns Lemma= lawn


In [21]:
#Lemma that are not pronouns
for word in docx:
    if word.lemma_ !="-PRON-":
        print(word.lemma_.lower().strip())

this
be
how
john
walker
be
walk
.
be
also
run
beside
lawn


In [22]:
#Filterning stopwords
for word in docx:
    if word.is_stop == False and not word.is_punct:
        print(word)

John
walker
walking
running
lawns


In [23]:
#Creating a Spacy Parser
import string
punctuations=string.punctuation

In [24]:
from spacy.lang.en import English
parser=English()

In [25]:
def spacy_tokenizer(sentence):
    mytokens=parser(sentence)
    mytokens=[word.lemma_.lower().strip() if word.lemma_ !="-PRON-" else word.lower_ for word in mytokens]
    mytokens=[word for word in mytokens if word not in stopwords and word not in punctuations]
    return mytokens

#### Machine Learning With Sklearn

In [26]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [27]:
#Custom transformer using SpaCy
class predictors(TransformerMixin):
    def transform(self,X,**transform_params):
        return [clean_text(text) for text in X]
    def fit(self,X,y=None,**fit_params):
        return self
    def get_params(self,deep=True):
        return {}
    
#Basic function to clean the text
def clean_text(text):
    return text.strip().lower()

In [28]:
#Vectorization
vectorizer=CountVectorizer(tokenizer=spacy_tokenizer,ngram_range=(1,1))
classifier=LinearSVC()

In [29]:
#Using Tfidf
tfvectorizer=TfidfVectorizer(tokenizer=spacy_tokenizer)

In [30]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

In [33]:
# Features and Labels
X=df['Message']
ylabels=df['Target']

KeyError: 'Message'

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,ylabels,test_size=0.2,random_state=30)

In [None]:
#Create the pipeline to clean,tokenize, vectorize
pipe=Pipeline([("Cleaner",predictors()),
              ("vectorizer",vectorizer),
              ("classifier",classifier)])

In [None]:
#Fit our data
pipe.fit(X_train,y_train)

In [None]:
#predicating with a test dataset
sample_predication=pipe.predict(X_test)

In [None]:
for(sample,pred) in zip(X_test,sample_prediction):
    print(sample,"Prediction",pred)

In [None]:
#Accuracy
print("Accuracy: ",pipe.score(X_test,y_test))
print("Accuracy: ",pipe.score(X_test,sample_predication))

In [None]:
#Accuracy
print("Accuracy: ",pipe.score(X_train,y_train))


In [None]:
pipe.predict(example)