# Natural Language Processing

## Importing the libraries

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# dataset stored under /content/drive/MyDrive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [3]:
# Load the tab-separated reviews file. quoting=3 (csv.QUOTE_NONE) turns off
# quote handling, so quotation marks inside a review stay as literal text.
dataset = pd.read_csv(
    '/content/drive/MyDrive/Dataset/nlp/Restaurant_Reviews.tsv',
    delimiter='\t',
    quoting=3,
)

# Display the loaded frame as the cell output.
dataset


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


## Cleaning the texts

In [4]:
import re 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stemming reduces each word to its root form (e.g. 'loved' -> 'love') so that different forms of a word count as the same feature.
from nltk.stem.porter import PorterStemmer

# Build `corpus`: one cleaned, stemmed, space-joined string per review.

# Loop-invariant helpers are created once instead of on every iteration.
ps = PorterStemmer()

# The NLTK English stopword list contains 'not'. Removing that word from a
# review would erase its negation ("not good" -> "good"), so 'not' is taken
# out of the stopword list and therefore kept in the cleaned text.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
stopword_set = set(all_stopwords)  # built once for fast membership tests

corpus = []

# Iterate over every review actually present rather than a hard-coded 1000,
# so `corpus` always has exactly one entry per dataset row and stays aligned
# with any per-row labels taken from `dataset`.
for raw_review in dataset['Review']:

  # Replace every non-letter character (punctuation, digits, ...) with a space.
  review = re.sub('[^a-zA-Z]', ' ', raw_review)

  # Lower-case the text and split it into a list of words.
  words = review.lower().split()

  # Drop stopwords, stem the remaining words, and re-join them into one string.
  review = ' '.join(ps.stem(word) for word in words if word not in stopword_set)

  corpus.append(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Preview only the first few cleaned reviews; displaying the whole list
# floods the cell output without adding information.
corpus[:10]

## Creating the Bag of Words model

In [6]:
# Bag of Words: turn each cleaned review into a vector of word counts
# using scikit-learn's CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Limit the vocabulary to at most 1500 feature columns.
cv = CountVectorizer(max_features=1500)
word_counts = cv.fit_transform(corpus)  # sparse document-term matrix
X = word_counts.toarray()               # dense array of per-review counts

# Target vector: the last column of the dataset ('Liked').
y = dataset.iloc[:, -1].values



In [7]:
# Number of feature columns per review. The CountVectorizer above is built
# with max_features=1500, so this shows 1500 -- not 1566, which was
# presumably the vocabulary size before the cap was applied.
# X.shape[1] is used instead of len(X[0]) so the check also works when X has no rows.
X.shape[1]

1500

## Splitting the dataset into the Training set and Test set

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews as the test set; the fixed random_state makes
# the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Decision Tree model on the Training set

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that chooses splits by entropy; random_state is fixed so the
# fitted tree is the same on every run.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)

# Fit on the training split; the fitted estimator is shown as the cell output.
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

## Predicting the Test set results

In [None]:
# Predict labels for the held-out reviews, then print them side by side with
# the true labels (left column: predicted, right column: actual).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

## Making the Confusion Matrix

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test reviews classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[78 19]
 [31 72]]


0.75