In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [2]:
np.random.seed(500)

In [3]:
#Corpus = pd.read_csv("corpus.csv",encoding='latin-1')
Corpus_full = pd.read_csv("corpus.csv",encoding='latin-1')
Corpus=Corpus_full.iloc[:1000,]

##### STEP -4: Data pre-processing
This is an important step in any data mining process. 
This basically involves transforming raw data into an understandable format for NLP models. Real-world data is often incomplete, inconsistent, and/or lacking in certain behaviors or trends, and is likely to contain many errors. Data pre-processing is a proven method of resolving such issues.This will help in getting better results through the classification algorithms.

Below, I have explained the two techniques that are also performed besides other easy to understand steps in data pre-processing:

######  Tokenization: 


This is a process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens. The list of tokens becomes input for further processing. NLTK Library has word_tokenize and sent_tokenize to easily break a stream of text into a list of words or sentences, respectively.


###### Word Stemming/Lemmatization: 


The aim of both processes is the same, reducing the inflectional forms of each word into a common base or root. Lemmatization is closely related to stemming. The difference is that a stemmer operates on a single word without knowledge of the context, and therefore cannot discriminate between words which have different meanings depending on part of speech. However, stemmers are typically easier to implement and run faster, and the reduced accuracy may not matter for some applications.


Here’s the complete script which performs the aforementioned data pre-processing steps, you can always add or remove steps which best suits the data set you are dealing with:

* Remove Blank rows in Data, if any
* Change all the text to lower case
* Word Tokenization
* Remove Stop words
* Remove Non-alpha text
* Word Lemmatization


In [4]:
# Step - a : Remove blank rows if any.
print(Corpus.isnull().any()) # gives False. 

# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['text_lower']=[doc.lower() for doc in Corpus.text]

# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['text_final']=[word_tokenize(doc) for doc in Corpus.text_lower]

# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
lemmatizer = WordNetLemmatizer()
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wn.ADJ
    elif nltk_tag.startswith('V'):
        return wn.VERB
    elif nltk_tag.startswith('N'):
        return wn.NOUN
    elif nltk_tag.startswith('R'):
        return wn.ADV
    else:          
        return wn.NOUN
for i in range(len(Corpus.text_final)):
    Corpus.text_final[i]=[t for t in Corpus.text_final[i] if t.isalpha() and t not in stopwords.words('english')]
    Corpus.text_final[i]=[lemmatizer.lemmatize(word, nltk_tag_to_wordnet_tag(tag)) for word,tag in pos_tag(Corpus.text_final[i])]
    Corpus.text_final[i]=str(Corpus.text_final[i])

text     False
label    False
dtype: bool


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


##### Prepare Test/ Train/ Datasets

In [5]:
X=Corpus['text_final']
y=Corpus['label']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.3, stratify=y)

##### STEP -6: Encoding
Label encode the target variable — This is done to transform Categorical data of string type in the data set into numerical values which the model can understand.

In [6]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.transform(y_test)

In [7]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(X)
Train_X_Tfidf = Tfidf_vect.transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

In [115]:
print(Tfidf_vect.vocabulary_)



In [103]:
print(Train_X_Tfidf)

  (0, 87)	0.1808238451603405
  (0, 89)	0.11560881699841377
  (0, 424)	0.1350002271046054
  (0, 558)	0.1747736855852887
  (0, 698)	0.2540326571758145
  (0, 1086)	0.15642090782184911
  (0, 1087)	0.15753865238934991
  (0, 1356)	0.36619197624266014
  (0, 1779)	0.22962971983732317
  (0, 1936)	0.13429011344434735
  (0, 2133)	0.22962971983732317
  (0, 2268)	0.13932812127066152
  (0, 2281)	0.08186304199692036
  (0, 2524)	0.14684721033698064
  (0, 2641)	0.1553375609089473
  (0, 2906)	0.2540326571758145
  (0, 3057)	0.1593640195421048
  (0, 3435)	0.15037074824679736
  (0, 4473)	0.4795157078506979
  (0, 4475)	0.2540326571758145
  (0, 4838)	0.19095197924836627
  (0, 4931)	0.08637557743123903
  (0, 4943)	0.10496671762641496
  (1, 182)	0.19650973579843553
  (1, 208)	0.19650973579843553
  :	:
  (698, 4249)	0.1389649164939423
  (698, 4492)	0.27101763602641227
  (698, 4797)	0.28056824138362735
  (698, 4931)	0.10194370668750809
  (699, 234)	0.17620148998541865
  (699, 424)	0.1375240578496235
  (699, 495)

##### STEP -7: Use the ML Algorithms

In [122]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,y_train)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

SVM Accuracy Score ->  80.0


***STEP -8: Cross-validation***

In [124]:
from sklearn.model_selection import cross_val_score
cv_scores =cross_val_score(SVM, Train_X_Tfidf, y_train, cv=5)
cv_scores

array([0.72142857, 0.82142857, 0.71428571, 0.74285714, 0.77142857])

## Trying out other classification models

    - LinearSVC (faster SVM than SVC, but no kernel support)
    - Trying different kernels using SVC
    - SGDClassifier (Can handle large amounts of data, for 'online' model, out-of-core support.)
    - Logistic Regression
    - Desion Tree
    - Random Forest
    - Naive Bayes
    - KNN

### LinearSVC

In [32]:
from sklearn.svm import LinearSVC

linear_svc = LinearSVC(C=1, loss="hinge") #use dual=False when n_samples>n_features
linear_svc.fit(Train_X_Tfidf,y_train)

predictions_lin = linear_svc.predict(Test_X_Tfidf)
print(" Accuracy Score -> ",accuracy_score(predictions_lin, y_test)*100)

cv_scores =cross_val_score(linear_svc, Train_X_Tfidf, y_train, cv=5)
cv_scores

 Accuracy Score ->  80.0


array([0.72142857, 0.82142857, 0.71428571, 0.74285714, 0.77142857])

### switching out kernels

In [36]:
poly = svm.SVC(kernel='poly')
poly.fit(Train_X_Tfidf,y_train)

predictions_poly = poly.predict(Test_X_Tfidf)
print(" Accuracy Score -> ",accuracy_score(predictions_poly, y_test)*100)

cv_scores =cross_val_score(poly, Train_X_Tfidf, y_train, cv=5)
cv_scores

 Accuracy Score ->  56.99999999999999


array([0.56428571, 0.56428571, 0.54285714, 0.53571429, 0.55      ])

In [37]:
rbf = svm.SVC() #default is 'rbf'
rbf.fit(Train_X_Tfidf,y_train)

predictions_rbf = rbf.predict(Test_X_Tfidf)
print(" Accuracy Score -> ",accuracy_score(predictions_rbf, y_test)*100)

cv_scores =cross_val_score(rbf, Train_X_Tfidf, y_train, cv=5)
cv_scores

 Accuracy Score ->  78.66666666666666


array([0.77142857, 0.76428571, 0.72857143, 0.78571429, 0.77142857])

In [38]:
sig = svm.SVC(kernel='sigmoid')
sig.fit(Train_X_Tfidf,y_train)

predictions_sig = sig.predict(Test_X_Tfidf)
print(" Accuracy Score -> ",accuracy_score(predictions_sig, y_test)*100)

cv_scores =cross_val_score(sig, Train_X_Tfidf, y_train, cv=5)
cv_scores

 Accuracy Score ->  81.0


array([0.73571429, 0.8       , 0.70714286, 0.76428571, 0.78571429])

### SGDClassifier : Binary Classifier

In [10]:
"""
SGDClassifier(loss="hinge",alpha=1/(m*C)). 
This applies regular Stochastic Gradient Descent to train a linear SVM classifier. It does not converge as fast as the LinearSVC class, but it
can be useful to handle huge datasets that do not fit in memory (out-of-core training), or to handle online classification tasks.

"""
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd_clf.fit(Train_X_Tfidf,y_train)

y_pred_sgd=sgd_clf.predict(Test_X_Tfidf)
print("SGD Accuracy Score -> ",accuracy_score(y_pred_sgd, y_test)*100)

from sklearn.model_selection import cross_val_score
cv_scores =cross_val_score(sgd_clf, Train_X_Tfidf, y_train, cv=5)
cv_scores

SGD Accuracy Score ->  77.33333333333333


### Logistic Regression

In [148]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(Train_X_Tfidf,y_train)

y_pred_lr=log_reg.predict(Test_X_Tfidf)
print(" Accuracy Score -> ",accuracy_score(y_pred_lr, y_test)*100)

cv_scores =cross_val_score(log_reg, Train_X_Tfidf, y_train, cv=5)
cv_scores

 Accuracy Score ->  79.0


array([0.77142857, 0.77857143, 0.75      , 0.78571429, 0.77142857])

In [149]:
print('model = {} @ features + {}'.format(log_reg.coef_,log_reg.intercept_))
print("\nFirst Instance: ",X_test.iloc[3])
print("Actual label: ", y_test[3])
print('\nPrediction for first instance: ', log_reg.predict(Test_X_Tfidf[3]))
print('Prediction porbability for first instance: ', log_reg.predict_proba(Test_X_Tfidf[3]))

print("\nAnother Instance: ",X_test.iloc[4])
print("Actual label: ", y_test[4])
print('\nPrediction for first instance: ', log_reg.predict(Test_X_Tfidf[4]))
print('Prediction porbability for first instance: ', log_reg.predict_proba(Test_X_Tfidf[4]))

model = [[-0.11284912 -0.0775394   0.         ... -0.03577315  0.
  -0.04145108]] @ features + [-0.08056113]

First Instance:  ['book', 'incredibly', 'bore', 'one', 'boring', 'book', 'recommend', 'read', 'introduction', 'incredibly', 'long', 'incredibly', 'bore', 'also', 'really', 'sort', 'plot', 'would', 'make', 'book', 'ca', 'put', 'personally', 'fell', 'asleep', 'four', 'different', 'time', 'read', 'book', 'much', 'high', 'expectation', 'book', 'look', 'classic', 'read', 'would', 'recommend', 'count', 'monte', 'cristo', 'book']
Actual label:  0

Prediction for first instance:  [0]
Prediction porbability for first instance:  [[0.67916059 0.32083941]]

Another Instance:  ['far', 'good', 'worn', 'day', 'far', 'love', 'still', 'little', 'bit', 'tummy', 'birth', 'son', 'month', 'ago', 'really', 'hold', 'comfortable', 'think', 'show', 'clothes', 'highly', 'recommend']
Actual label:  1

Prediction for first instance:  [1]
Prediction porbability for first instance:  [[0.28769767 0.71230233]

In [150]:
# Sort the coefficients of features in ascending and desceding order. 
# Extract the indexes of these features into inds_ascending and inds_descending
inds_ascending = np.argsort(log_reg.coef_.flatten()) 
inds_descending = inds_ascending[::-1]

In [151]:
# find the words corresponding to lowest coefficients
for i in range(10):
    for key, value in Tfidf_vect.vocabulary_.items():
        if value==inds_ascending[i]:
            print(key)

bad
waste
bore
write
poor
thing
hard
higgins
plot
money


In [152]:
# find the words corresponding to highest coefficients
for i in range(10):
    for key, value in Tfidf_vect.vocabulary_.items():
        if value==inds_descending[i]:
            print(key)

love
great
best
excellent
wonderful
easy
good
henry
game
recommend


### Logistic regression with L1 penalty

In [145]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver='liblinear',penalty='l1')
log_reg.fit(Train_X_Tfidf,y_train)

y_pred_lr=log_reg.predict(Test_X_Tfidf)
print(" Accuracy Score -> ",accuracy_score(y_pred_lr, y_test)*100)
cv_scores =cross_val_score(log_reg, Train_X_Tfidf, y_train, cv=5)
cv_scores

 Accuracy Score ->  75.0


array([0.73571429, 0.71428571, 0.71428571, 0.71428571, 0.70714286])

In [146]:
# Sort the coefficients of features in ascending and desceding order. 
# Extract the indexes of these features into inds_ascending and inds_descending
inds_ascending = np.argsort(log_reg.coef_.flatten()) 
inds_descending = inds_ascending[::-1]

In [147]:
# find the words corresponding to lowest coefficients
for i in range(10):
    for key, value in Tfidf_vect.vocabulary_.items():
        if value==inds_ascending[i]:
            print(key)

bad
waste
bore
plot
poor
write
hard
would
say
book


In [153]:
# find the words corresponding to highest coefficients
for i in range(10):
    for key, value in Tfidf_vect.vocabulary_.items():
        if value==inds_descending[i]:
            print(key)

love
great
best
excellent
wonderful
easy
good
henry
game
recommend


### Decision tree, Random Forest

In [187]:
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier(max_features='auto')
tree_clf.fit(Train_X_Tfidf,y_train)

y_pred=tree_clf.predict(Test_X_Tfidf)
print("Accuracy Score -> ",accuracy_score(y_pred, y_test)*100)

cv_scores =cross_val_score(tree_clf, Train_X_Tfidf, y_train, cv=5)
cv_scores, cv_scores.mean()

Accuracy Score ->  63.66666666666667


(array([0.67142857, 0.67142857, 0.6       , 0.51428571, 0.75      ]),
 0.6414285714285713)

In [182]:
tree_clf.get_depth(),tree_clf.get_n_leaves(), tree_clf.n_features_

(87, 274, 5000)

In [183]:
inds_ascending=np.argsort(tree_clf.feature_importances_.flatten())
inds_descending = inds_ascending[::-1]

In [184]:
# find the words corresponding to lowest coefficients
for i in range(10):
    for key, value in Tfidf_vect.vocabulary_.items():
        if value==inds_ascending[i]:
            print(key)

aa
rising
rise
ripple
rip
ringolsby
ringer
ring
rigid
right


In [185]:
# find the words corresponding to highest coefficients
for i in range(20):
    for key, value in Tfidf_vect.vocabulary_.items():
        if value==inds_descending[i]:
            print(key)

love
say
come
like
would
book
make
good
disappoint
money
read
know
daughter
guess
must
never
world
write
something
hazlitt


In [156]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(Train_X_Tfidf,y_train)
y_pred=rf.predict(Test_X_Tfidf)
print("Accuracy Score -> ",accuracy_score(y_pred, y_test)*100)
cv_scores =cross_val_score(rf, Train_X_Tfidf, y_train, cv=5)
cv_scores

Accuracy Score ->  76.33333333333333


array([0.75      , 0.78571429, 0.72142857, 0.77857143, 0.77857143])

### Naive Bays

In [157]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(Train_X_Tfidf.toarray(),y_train)

y_pred=gnb.predict(Test_X_Tfidf.toarray())
print("Accuracy Score -> ",accuracy_score(y_pred, y_test)*100)

cv_scores =cross_val_score(gnb, Train_X_Tfidf.toarray(), y_train, cv=5)
cv_scores

Accuracy Score ->  60.66666666666667


array([0.64285714, 0.64285714, 0.63571429, 0.67142857, 0.67857143])

### KNN

### String kernels? 

### Parameter tuning for SVC

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = dict(kernel = [‘linear’, ‘rbf’, ‘poly’], gamma = [0.1, 1, 10, 100], c = [0.1, 1, 10, 100, 1000], 
                  degree = [0, 1, 2, 3, 4, 5, 6])


non.fit(Train_X_Tfidf,y_train)
predictions_non = non.predict(Test_X_Tfidf)
print(" Accuracy Score -> ",accuracy_score(predictions_non, y_test)*100)
cv_scores =cross_val_score(non, Train_X_Tfidf, y_train, cv=5)
cv_scores

svm=  svm.SVC()
svm_cv = GridSearchCV(svm, param_grid, cv=5)
svm_cv.fit(X, y)
svm_cv.best_params_
svm_cv.best_score

_