# Naturel Language Processing (NLP)

In [58]:
import pandas as pd
import numpy as np
import scipy as sp
import nltk 
from nltk.tokenize import sent_tokenize #paragraftan cümleleri ayırıp liste haline getirir.
import warnings
warnings.filterwarnings("ignore")


In [59]:
!pip install nltk


[0m

In [60]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mehmetburakerkan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [61]:
text = "Welcome readers. I hope you find interesting. Please do reply."
print(sent_tokenize(text))

['Welcome readers.', 'I hope you find interesting.', 'Please do reply.']


In [62]:
# word tokenize 
text2=nltk.word_tokenize("PierreVinken,59 years old,will join as a nonexecutive director on Nov. 29 .")
print(text2)

['PierreVinken,59', 'years', 'old', ',', 'will', 'join', 'as', 'a', 'nonexecutive', 'director', 'on', 'Nov.', '29', '.']


In [63]:
# türkçe
WPT = nltk.WordPunctTokenizer()
nltk.download("stopwords")
stop_word_list = nltk.corpus.stopwords.words("turkish")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mehmetburakerkan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Stemmer and Lemmatizer

# Porter Stemmer

In [64]:
from nltk.stem import PorterStemmer
stemmerporter = PorterStemmer()
stemmerporter.stem("happiness")

'happi'

# Lancaster Stemmer

In [65]:
from nltk.stem import LancasterStemmer
stemmerlan = LancasterStemmer()
stemmerlan.stem("happiness")

'happy'

# Regexp Stemmer

In [66]:
from nltk.stem import RegexpStemmer
stemmerregexp = RegexpStemmer("ing")
print(stemmerregexp.stem("working"))
print(stemmerregexp.stem("happiness"))
print(stemmerregexp.stem("pairing"))

work
happiness
pair


# Snowball Stemmer

In [67]:
from nltk.stem import SnowballStemmer
print(SnowballStemmer.languages)
spanishstemmer = SnowballStemmer("spanish")
print(spanishstemmer.stem("comiende"))

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')
comiend


In [68]:
frenchstemmer = SnowballStemmer("french")
print(frenchstemmer.stem("manager"))

manag


In [69]:
#!pip install TurkishStemmer
from TurkishStemmer import TurkishStemmer
turkishstemmer = TurkishStemmer()
turkishstemmer.stem("geleceğim")

'gelecek'

# Lemmatization

In [70]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mehmetburakerkan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# POS-Part of Speech- isim,fiil,subject,proposition...

In [71]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer_output = WordNetLemmatizer()
print(lemmatizer_output.lemmatize("working"))

# kelimenin fiil olduğunu soyluyoruz
print(lemmatizer_output.lemmatize("working",pos="v"))
print(lemmatizer_output.lemmatize("works"))


working
work
work


# Difference Between Stemming and Lemmatization

In [72]:
from nltk.stem import PorterStemmer
stemmer_output = PorterStemmer()
print(stemmer_output.stem("happiness"))
stemmer_output.stem("Mens")

happi


'men'

In [73]:
from nltk.stem import WordNetLemmatizer
lemmatizer_output = WordNetLemmatizer()
lemmatizer_output.lemmatize("Mens")

'Mens'

# POS Tagging and POS Tagger

In [74]:
from nltk.tokenize import word_tokenize
text = word_tokenize("Drive into NLTK: Part-of-speech tagging and POS Tagger")
text

['Drive',
 'into',
 'NLTK',
 ':',
 'Part-of-speech',
 'tagging',
 'and',
 'POS',
 'Tagger']

In [75]:
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mehmetburakerkan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [76]:
nltk.pos_tag(text)

[('Drive', 'NNP'),
 ('into', 'IN'),
 ('NLTK', 'NNP'),
 (':', ':'),
 ('Part-of-speech', 'JJ'),
 ('tagging', 'NN'),
 ('and', 'CC'),
 ('POS', 'NNP'),
 ('Tagger', 'NNP')]

In [77]:
import nltk
from nltk.tag import DefaultTagger
tag = DefaultTagger("He is the man")
tag.tag(["Beautiful","morning"])

[('Beautiful', 'He is the man'), ('morning', 'He is the man')]

In [78]:
#!pip install autocorrect

In [79]:
from autocorrect import spell
spell("Tghe")

autocorrect.spell is deprecated,             use autocorrect.Speller instead


'The'

In [80]:
!pip install textblob

[0m

# TextBlob

In [81]:
from textblob import TextBlob
b = TextBlob("I havv good speling!")
#print(b.detect_language())
print(b.correct())

I have good spelling!


In [82]:
from textblob import Word
w = Word("falability")
w.spellcheck()

[('fallibility', 0.3333333333333333),
 ('capability', 0.3333333333333333),
 ('affability', 0.3333333333333333)]

In [83]:
#!pip install langdetect

In [84]:
from langdetect import detect
print(detect("War doesn't show who's right , just who's left"))
print(detect("Ein, zwei, drei, vier"))
print(detect("Eu gosto de mulher"))
print(detect("Nasılsınız?"))

en
de
pt
tr


In [85]:
en_blob = TextBlob(u'I am a free black man loved by Jesus Christ.')
en_blob.translate(to="ru")

AttributeError: 'list' object has no attribute 'strip'

In [86]:
#pip install -U textblob

In [87]:
tr_blob = TextBlob(u'Benim çok işim var.')
tr_blob.tags
#print(tr_blob.translate(to="ru"))

[('Benim', 'NNP'), ('çok', 'NNP'), ('işim', 'NN'), ('var', 'NN')]

# TFIDF

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
simple_train = ["Call you tonight","Call me a cab"," please call me...PLEASE"]

vect = CountVectorizer()

tf = pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())

tf





Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [2]:
vect = CountVectorizer(binary=True)
df = vect.fit_transform(simple_train).toarray().sum(axis=0)
pd.DataFrame(df.reshape(1,6),columns=vect.get_feature_names())



Unnamed: 0,cab,call,me,please,tonight,you
0,1,3,2,1,1,1


In [None]:
TfidfVectorizer()

In [90]:
tf/df

Unnamed: 0,cab,call,me,please,tonight,you
0,0.0,0.333333,0.0,0.0,1.0,1.0
1,1.0,0.333333,0.5,0.0,0.0,0.0
2,0.0,0.333333,0.5,2.0,0.0,0.0


In [91]:
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(simple_train).toarray(),columns=vect.get_feature_names())


Unnamed: 0,cab,call,me,please,tonight,you
0,0.0,0.385372,0.0,0.0,0.652491,0.652491
1,0.720333,0.425441,0.547832,0.0,0.0,0.0
2,0.0,0.266075,0.34262,0.901008,0.0,0.0


# CountVectorizer - Fit Transform with NLP

In [92]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)
print(vectorizer)

corpus = ["This is the first document","This is the second document","And the third one","Is this the first document?"]

X = vectorizer.fit_transform(corpus)

tf = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())

tf

CountVectorizer()


Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0,1,1,1,0,0,1,0,1
1,0,1,0,1,0,1,1,0,1
2,1,0,0,0,1,0,1,1,0
3,0,1,1,1,0,0,1,0,1


In [93]:
print(X)

  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	1
  (1, 5)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1


In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [2]:
yelp = pd.read_csv("yelp.csv")
yelp.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [96]:
yelp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   business_id  10000 non-null  object
 1   date         10000 non-null  object
 2   review_id    10000 non-null  object
 3   stars        10000 non-null  int64 
 4   text         10000 non-null  object
 5   type         10000 non-null  object
 6   user_id      10000 non-null  object
 7   cool         10000 non-null  int64 
 8   useful       10000 non-null  int64 
 9   funny        10000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


- Bütün cümleler küçük harfe çevrilecek
- Noktalama işaretlerini kaldır
- Rakamları kaldır
- Satır sonu , \n \r
- Stop words leri kaldır(gereksiz kelimeler)
- Tokenize et
- Lemma Stemma (ekleri kaldır, kökleri bul)
- Vetorizer ile yazıları rakama atıyoruz

In [3]:
yelp["text"]=yelp["text"].str.lower()

In [4]:
yelp["text"]=yelp["text"].str.replace("[^\w\s]","")

  yelp["text"]=yelp["text"].str.replace("[^\w\s]","")


In [5]:
yelp["text"]=yelp["text"].str.replace("\d+","")

  yelp["text"]=yelp["text"].str.replace("\d+","")


In [6]:
yelp["text"]=yelp["text"].str.replace("\n"," ").replace("\r","")

In [7]:
yelp_best_worst = yelp[(yelp.stars==5)|(yelp.stars==1)]

yelp_best_worst.reset_index(drop=True,inplace=True)

x = yelp_best_worst.text
y = yelp_best_worst.stars

print(x.shape)

x_train,  x_test, y_train, y_test = train_test_split(x,y, random_state=1)



(4086,)


In [8]:
print(x)

0       my wife took me here on my birthday for breakf...
1       i have no idea why some people give bad review...
2       rosie dakota and i love chaparral dog park its...
3       general manager scott petello is a good egg no...
4       drop what youre doing and drive here after i a...
                              ...                        
4081    yes i do rock the hipster joints  i dig this p...
4082    only  stars   a few notes the folks that rated...
4083    im not normally one to jump at reviewing a cha...
4084    lets seewhat is there not to like about surpri...
4085     locations all  star average i think arizona r...
Name: text, Length: 4086, dtype: object


# Tokenization

In [9]:
vect = CountVectorizer(stop_words="english")

x_train_dtm = vect.fit_transform(x_train)
print(x_train_dtm)

x_test_dtm = vect.transform(x_test)

  (0, 5877)	1
  (0, 13267)	1
  (0, 17839)	2
  (0, 10102)	1
  (0, 11893)	1
  (0, 7711)	1
  (0, 4864)	1
  (0, 17363)	1
  (0, 13323)	2
  (0, 2631)	1
  (0, 15871)	1
  (0, 3608)	1
  (0, 727)	1
  (0, 2089)	2
  (0, 6748)	4
  (0, 17026)	1
  (0, 9348)	1
  (0, 3141)	1
  (0, 1315)	1
  (0, 2087)	1
  (0, 15732)	1
  (0, 13295)	1
  (0, 1281)	1
  (0, 3119)	1
  (0, 14368)	1
  :	:
  (3062, 6800)	1
  (3062, 7011)	1
  (3062, 9129)	1
  (3063, 15732)	1
  (3063, 12811)	1
  (3063, 32)	1
  (3063, 9942)	1
  (3063, 6129)	2
  (3063, 9804)	1
  (3063, 8368)	1
  (3063, 8588)	2
  (3063, 7441)	1
  (3063, 2661)	1
  (3063, 2571)	1
  (3063, 15869)	1
  (3063, 6928)	1
  (3063, 17828)	1
  (3063, 7211)	1
  (3063, 15084)	1
  (3063, 5206)	1
  (3063, 4557)	1
  (3063, 11483)	1
  (3063, 16046)	1
  (3063, 17322)	1
  (3063, 6806)	1


In [10]:
type(x_train_dtm)

scipy.sparse.csr.csr_matrix

In [11]:
f = x_train_dtm.toarray()

In [13]:
type(f)

numpy.ndarray

In [104]:
print(x_test)

1607    looking a cutting edge wanting the best for ev...
3409    greatness in the form of food just like the ot...
1751    the flower studio far exceeded my expectations...
2275         so yummy strange combination but great place
230     ive been hearing about these cheesecakes from ...
                              ...                        
2793    honey jalapeño chicken lollipops and sweet pot...
671                      probably my favorite restaurant 
3441    a philosophical elder of my profession commonl...
3224    first im sorry this review is lengthy but i re...
3362    you speak italian to me and provide mouth wate...
Name: text, Length: 1022, dtype: object


In [105]:
tf = pd.DataFrame(x_train_dtm.toarray(), columns=vect.get_feature_names())
tf.head()

Unnamed: 0,______,_______________,_c,aa,aaa,aaaamazing,aaammmazzing,aaron,ab,abandoned,...,zucca,zucchini,zuchinni,zumba,zupa,zuzu,zwiebelkräuter,éclairs,école,ém
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
x_train.head()

2790    fillybs  only  reviews nine now  wow do i miss...
725     my husband and i absolutely love this restaura...
1578    we went today after lunch i got my usual of li...
282     totally dissapointed  i had purchased a coupon...
2024    costco travel  my husband and i recently retur...
Name: text, dtype: object

In [14]:
vect = CountVectorizer(ngram_range=(1,2))
x_train_dtm = vect.fit_transform(x_train)
x_train_dtm.shape

(3064, 171153)

In [15]:
x_train_dtm

<3064x171153 sparse matrix of type '<class 'numpy.int64'>'
	with 568619 stored elements in Compressed Sparse Row format>

In [108]:
print(vect.get_feature_names()[-50:])

['zone', 'zone of', 'zone out', 'zone when', 'zones', 'zones dolls', 'zoning', 'zoning issues', 'zoo', 'zoo and', 'zoo is', 'zoo ive', 'zoo not', 'zoo the', 'zoyo', 'zoyo for', 'zucca', 'zucca appetizer', 'zucchini', 'zucchini and', 'zucchini bread', 'zucchini broccoli', 'zucchini carrots', 'zucchini fries', 'zucchini pieces', 'zucchini strips', 'zucchini veal', 'zucchini very', 'zucchini with', 'zuchinni', 'zuchinni again', 'zuchinni the', 'zumba', 'zumba class', 'zumba or', 'zumba yogalates', 'zupa', 'zupa flavors', 'zuzu', 'zuzu in', 'zuzu is', 'zuzu the', 'zwiebelkräuter', 'zwiebelkräuter salat', 'éclairs', 'éclairs napoleons', 'école', 'école lenôtre', 'ém', 'ém all']


# Predict the Star rating

In [109]:
vect = CountVectorizer()

x_train_dtm = vect.fit_transform(x_train)
x_test_dtm = vect.transform(x_test)
nb = MultinomialNB()
nb.fit(x_train_dtm, y_train)
y_pred_class = nb.predict(x_test_dtm)

print(metrics.accuracy_score(y_test, y_pred_class))

0.9168297455968689


# Calculate Null Accuracy

In [110]:
def tokenize_test(vect):
    x_train_dtm = vect.fit_transform(x_train)
    print("Features: ",x_train_dtm.shape[1])
    x_test_dtm = vect.transform(x_test)
    nb = MultinomialNB()
    nb.fit(x_train_dtm, y_train)
    y_pred_class = nb.predict(x_test_dtm)
    print("Accuracy: ",metrics.accuracy_score(y_test, y_pred_class))

In [111]:
vect = CountVectorizer()
tokenize_test(vect)

Features:  18379
Accuracy:  0.9168297455968689


In [112]:
vect = CountVectorizer(ngram_range=(1,2))
tokenize_test(vect)

Features:  171153
Accuracy:  0.8522504892367906


# Stopword Removal

In [113]:
my_test_corpus = "I am a Sidy. A free black man. Believe it or not; it is true"

In [114]:
vect = CountVectorizer(stop_words="english")
tokenize_test(vect)

Features:  18081
Accuracy:  0.9168297455968689


In [115]:
print(vect.get_stop_words())

frozenset({'beforehand', 'further', 'move', 'amoungst', 'cant', 'any', 'one', 'over', 'up', 'anywhere', 'alone', 'cannot', 'con', 'put', 'made', 'together', 'no', 'fire', 'its', 'thereupon', 'ourselves', 'find', 'once', 'during', 'the', 'might', 'whence', 'name', 'cry', 'hereupon', 'and', 'can', 'hereafter', 'beyond', 'most', 'nothing', 'eleven', 'hence', 'show', 'serious', 'call', 'before', 'was', 'nine', 'on', 'would', 'being', 'both', 'meanwhile', 'should', 'those', 'down', 'except', 'de', 'a', 'whither', 'indeed', 'moreover', 'there', 'will', 'her', 'behind', 'everything', 'whereby', 'thru', 'himself', 'becomes', 'detail', 'among', 'go', 'beside', 'another', 'mine', 'front', 'not', 'toward', 'thick', 'they', 'full', 'couldnt', 'herself', 'you', 'two', 'with', 'get', 'though', 'sometimes', 'of', 'too', 'to', 'again', 'due', 'from', 'ltd', 'last', 'least', 'thereby', 'each', 're', 'thus', 'by', 'through', 'inc', 'done', 'be', 'their', 'under', 'same', 'at', 'few', 'some', 'noone', 'w

In [116]:
vect = CountVectorizer(ngram_range=(1,2),max_features=100000)
tokenize_test(vect)

Features:  100000
Accuracy:  0.8816046966731899


In [117]:
vect = CountVectorizer(ngram_range=(1,2),min_df=2)
tokenize_test(vect)

Features:  43256
Accuracy:  0.9305283757338552


# TextBlob

In [118]:
print(yelp_best_worst.text[0])

my wife took me here on my birthday for breakfast and it was excellent  the weather was perfect which made sitting outside overlooking their grounds an absolute pleasure  our waitress was excellent and our food arrived quickly on the semibusy saturday morning  it looked like the place fills up pretty quickly so the earlier you get here the better  do yourself a favor and get their bloody mary  it was phenomenal and simply the best ive ever had  im pretty sure they only use ingredients from their garden and blend them fresh when you order it  it was amazing  while everything on the menu looks excellent i had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious  it came with  pieces of their griddled bread with was amazing and it absolutely made the meal complete  it was the best toast ive ever had  anyway i cant wait to go back


In [119]:
review = TextBlob(yelp_best_worst.text[0])

In [120]:
review.words

WordList(['my', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'was', 'excellent', 'the', 'weather', 'was', 'perfect', 'which', 'made', 'sitting', 'outside', 'overlooking', 'their', 'grounds', 'an', 'absolute', 'pleasure', 'our', 'waitress', 'was', 'excellent', 'and', 'our', 'food', 'arrived', 'quickly', 'on', 'the', 'semibusy', 'saturday', 'morning', 'it', 'looked', 'like', 'the', 'place', 'fills', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'bloody', 'mary', 'it', 'was', 'phenomenal', 'and', 'simply', 'the', 'best', 'ive', 'ever', 'had', 'im', 'pretty', 'sure', 'they', 'only', 'use', 'ingredients', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'it', 'was', 'amazing', 'while', 'everything', 'on', 'the', 'menu', 'looks', 'excellent', 'i', 'had', 'the', 'white', 'truffle', 'scrambled', 'eggs', 'vegetable'

In [121]:
review.sentences

[Sentence("my wife took me here on my birthday for breakfast and it was excellent  the weather was perfect which made sitting outside overlooking their grounds an absolute pleasure  our waitress was excellent and our food arrived quickly on the semibusy saturday morning  it looked like the place fills up pretty quickly so the earlier you get here the better  do yourself a favor and get their bloody mary  it was phenomenal and simply the best ive ever had  im pretty sure they only use ingredients from their garden and blend them fresh when you order it  it was amazing  while everything on the menu looks excellent i had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious  it came with  pieces of their griddled bread with was amazing and it absolutely made the meal complete  it was the best toast ive ever had  anyway i cant wait to go back")]

In [122]:
review.lower()

TextBlob("my wife took me here on my birthday for breakfast and it was excellent  the weather was perfect which made sitting outside overlooking their grounds an absolute pleasure  our waitress was excellent and our food arrived quickly on the semibusy saturday morning  it looked like the place fills up pretty quickly so the earlier you get here the better  do yourself a favor and get their bloody mary  it was phenomenal and simply the best ive ever had  im pretty sure they only use ingredients from their garden and blend them fresh when you order it  it was amazing  while everything on the menu looks excellent i had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious  it came with  pieces of their griddled bread with was amazing and it absolutely made the meal complete  it was the best toast ive ever had  anyway i cant wait to go back")

# Steming and Lemmatization

In [123]:
stemmer = SnowballStemmer("english")
print([stemmer.stem(word) for word in review.words])

['my', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'was', 'excel', 'the', 'weather', 'was', 'perfect', 'which', 'made', 'sit', 'outsid', 'overlook', 'their', 'ground', 'an', 'absolut', 'pleasur', 'our', 'waitress', 'was', 'excel', 'and', 'our', 'food', 'arriv', 'quick', 'on', 'the', 'semibusi', 'saturday', 'morn', 'it', 'look', 'like', 'the', 'place', 'fill', 'up', 'pretti', 'quick', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'bloodi', 'mari', 'it', 'was', 'phenomen', 'and', 'simpli', 'the', 'best', 'ive', 'ever', 'had', 'im', 'pretti', 'sure', 'they', 'onli', 'use', 'ingredi', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'it', 'was', 'amaz', 'while', 'everyth', 'on', 'the', 'menu', 'look', 'excel', 'i', 'had', 'the', 'white', 'truffl', 'scrambl', 'egg', 'veget', 'skillet', 'and', 'it', 'was', 'tasti', 'and', 'delici', 'it', 

In [124]:
print([word.lemmatize() for word in review.words])

['my', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'wa', 'excellent', 'the', 'weather', 'wa', 'perfect', 'which', 'made', 'sitting', 'outside', 'overlooking', 'their', 'ground', 'an', 'absolute', 'pleasure', 'our', 'waitress', 'wa', 'excellent', 'and', 'our', 'food', 'arrived', 'quickly', 'on', 'the', 'semibusy', 'saturday', 'morning', 'it', 'looked', 'like', 'the', 'place', 'fill', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'bloody', 'mary', 'it', 'wa', 'phenomenal', 'and', 'simply', 'the', 'best', 'ive', 'ever', 'had', 'im', 'pretty', 'sure', 'they', 'only', 'use', 'ingredient', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'it', 'wa', 'amazing', 'while', 'everything', 'on', 'the', 'menu', 'look', 'excellent', 'i', 'had', 'the', 'white', 'truffle', 'scrambled', 'egg', 'vegetable', 'skillet', 'and',

In [125]:
print([word.lemmatize(pos="v") for word in review.words])

['my', 'wife', 'take', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'be', 'excellent', 'the', 'weather', 'be', 'perfect', 'which', 'make', 'sit', 'outside', 'overlook', 'their', 'ground', 'an', 'absolute', 'pleasure', 'our', 'waitress', 'be', 'excellent', 'and', 'our', 'food', 'arrive', 'quickly', 'on', 'the', 'semibusy', 'saturday', 'morning', 'it', 'look', 'like', 'the', 'place', 'fill', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'bloody', 'mary', 'it', 'be', 'phenomenal', 'and', 'simply', 'the', 'best', 'ive', 'ever', 'have', 'im', 'pretty', 'sure', 'they', 'only', 'use', 'ingredients', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'it', 'be', 'amaze', 'while', 'everything', 'on', 'the', 'menu', 'look', 'excellent', 'i', 'have', 'the', 'white', 'truffle', 'scramble', 'egg', 'vegetable', 'skillet', 'and', 'it', 'be

In [126]:
def split_into_lemmas(text):
    text = str(text).lower()
    words = TextBlob(text).words
    return [stemmer.stem(word) for word in words]

In [127]:
vect = CountVectorizer(analyzer=split_into_lemmas)
tokenize_test(vect)

Features:  13313
Accuracy:  0.9275929549902152


In [128]:
print(vect.get_feature_names()[-50:])

['yuuuummmma', 'yuuuuummmmmyyi', 'yuuuuuuum', 'yuyuyummi', 'yuzu', 'z', 'zach', 'zam', 'zanella', 'zankou', 'zappo', 'zatsiki', 'zen', 'zenlik', 'zenroad', 'zero', 'zerostar', 'zest', 'zexperi', 'zgrill', 'zha', 'zhou', 'zia', 'zilch', 'zin', 'zinburg', 'zinburgergeist', 'zinc', 'zinfandel', 'zing', 'zip', 'zipcar', 'zipp', 'zipper', 'ziti', 'zoe', 'zombi', 'zone', 'zoo', 'zoyo', 'zucca', 'zucchini', 'zuchinni', 'zumba', 'zupa', 'zuzu', 'zwiebelkräut', 'éclair', 'école', 'ém']


# Using TF-IDF to Summarize a Yelp Review

In [129]:
vect = TfidfVectorizer(stop_words="english")
dtm = vect.fit_transform(yelp.text)
features = vect.get_feature_names()
dtm.shape

(10000, 34834)

In [134]:
import numpy as np
def summarize():
    review_length=0
    while review_length < 300:
        review_id = np.random.randint(0, len(yelp))
        review_text = str(yelp.text[review_id])
        review_length = len(review_text)
        
    word_scores = {}
    for word in TextBlob(review_text).words:
        word = word.lower()
        if word in features:
            word_scores[word] = dtm[review_id, features.index(word)]
            
            
    print("Top Scoring Words: ")
    top_scores = sorted(word_scores.items(),key=lambda x: x[1], reverse=True)[:5]
    for word, score in top_scores:
        print(word)
        
    print("\n"+ review_text)

In [135]:
summarize()

Top Scoring Words: 
olivos
los
mexican
traditional
north

the best example north scottsdale has of traditional phoenix sonoran mexican food  los olivos restaurants have been around forever  at least since the s  the food is great if not fancy and the place is pretty free of gimmicks  you can get your fix of cheese crisps aka quesadillas for you ca transplants deep fried tacos chimichangas combination plates with so much sauce and cheese you cant tell whats underneath but it all works together to taste so good  los olivos still plops down the free chips and salsa right when you sit down so youre good an full when that combo plate comes and glad for it  the atmosphere is more like a nice mexican diner than mexican cantina and service is solid they often have mariachis playing on weekend nights  margaritas arent great but the beer is cold  dont bother with dessert  cheap eats for all the food especially since you wont be hungry for another  hours  if youre in north scottsdale and in the m