<a href="https://colab.research.google.com/github/maryam-sharafi/word-embedding-/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
messages = pd.read_csv('imdb_labelled.txt', sep='\t',
                       names=["messages","label"])

In [None]:
messages

Unnamed: 0,messages,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0


In [None]:
#data cleaning and preprocessing
import nltk
import re
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
ps.stem('hiking')

'hike'

In [None]:
#apply stopwords and stemming
corpus = []
for i in range(0, len(messages)):
  review = re.sub('[^a-zA-Z]',' ',messages['messages'][i])
  review = review.lower().split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [None]:
corpus

['slow move aimless movi distress drift young man',
 'sure lost flat charact audienc nearli half walk',
 'attempt arti black white clever camera angl movi disappoint becam even ridicul act poor plot line almost non exist',
 'littl music anyth speak',
 'best scene movi gerardo tri find song keep run head',
 'rest movi lack art charm mean empti work guess empti',
 'wast two hour',
 'saw movi today thought good effort good messag kid',
 'bit predict',
 'love cast jimmi buffet scienc teacher',
 'babi owl ador',
 'movi show lot florida best made look appeal',
 'song best muppet hilari',
 'cool',
 'right case movi deliv everyth almost right face',
 'averag act main person low budget clearli see',
 'review long overdu sinc consid tale two sister singl greatest film ever made',
 'put gem movi term screenplay cinematographi act post product edit direct aspect film make',
 'practic perfect true masterpiec sea faux masterpiec',
 'structur film easili tightli construct histori cinema think film so

In [None]:
#creating the bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000, binary=True)
x = cv.fit_transform(corpus).toarray()

In [None]:
x.shape

(748, 2000)

In [None]:
y = pd.get_dummies(messages['label'])
y = y.iloc[:,1].values

In [None]:
y

array([False, False, False, False,  True, False, False,  True, False,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True,  True, False, False, False, False, False,
        True, False, False,  True,  True, False, False, False, False,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [None]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0)

In [None]:
X_train,y_train

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 array([False,  True,  True,  True,  True,  True, False,  True,  True,
         True, False, False,  True, False,  True, False,  True, False,
         True, False,  True, False,  True, False,  True,  True,  True,
        False, False, False, False,  True, False,  True,  True, False,
         True, False, False, False, False, False, False, False, False,
         True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True, False,  True,  True,  True,  True,  True,
         True, False, False,  True,  True,  True,  True, False, False,
        False, False,  True,  True,  True, False,  True,  True, False,
        False,  True, False,  True,  True, False,  True, False, False,
        False, False, False,  True,  True,  True,  True,  True,  True,
      

In [None]:
from sklearn.naive_bayes import MultinomialNB
sentiment_detect_model = MultinomialNB().fit(X_train, y_train)

In [None]:
#prediction
y_pred = sentiment_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report , accuracy_score
score = accuracy_score(y_test,y_pred)
print(classification_report(y_pred,y_test))
print(score)

              precision    recall  f1-score   support

       False       0.77      0.83      0.80        78
        True       0.80      0.74      0.77        72

    accuracy                           0.79       150
   macro avg       0.79      0.78      0.79       150
weighted avg       0.79      0.79      0.79       150

0.7866666666666666


In [None]:
#creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2000)
X = tv.fit_transform(corpus).toarray()

In [None]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
from sklearn.naive_bayes import MultinomialNB
sentiment_detect_model = MultinomialNB().fit(X_train, y_train)

In [None]:
#prediction
y_pred = sentiment_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report , accuracy_score
score = accuracy_score(y_test,y_pred)
print(classification_report(y_pred,y_test))
print(score)

              precision    recall  f1-score   support

       False       0.71      0.88      0.79        68
        True       0.88      0.71      0.78        82

    accuracy                           0.79       150
   macro avg       0.80      0.79      0.79       150
weighted avg       0.80      0.79      0.79       150

0.7866666666666666


In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
print(accuracy_score(y_test,y_pred))
print(classification_report(y_pred,y_test))

0.66
              precision    recall  f1-score   support

       False       0.45      0.88      0.60        43
        True       0.92      0.57      0.71       107

    accuracy                           0.66       150
   macro avg       0.69      0.73      0.65       150
weighted avg       0.79      0.66      0.67       150



word2vec implementation

In [None]:
!pip install gensim



In [None]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [None]:
vec_king = wv['king']
vec_king

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
corpus=[]
for i in range(0,len(messages)):
  review = re.sub('[^a-zA-Z]',' ',messages['messages'][i])
  review = review.lower().split()

  review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [None]:
##simple_preprocess:convert document into list lowercase
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
words = []
for sent in corpus:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))

In [None]:
words

[['slow',
  'moving',
  'aimless',
  'movie',
  'distressed',
  'drifting',
  'young',
  'man'],
 ['sure', 'lost', 'flat', 'character', 'audience', 'nearly', 'half', 'walked'],
 ['attempting',
  'artiness',
  'black',
  'white',
  'clever',
  'camera',
  'angle',
  'movie',
  'disappointed',
  'became',
  'even',
  'ridiculous',
  'acting',
  'poor',
  'plot',
  'line',
  'almost',
  'non',
  'existent'],
 ['little', 'music', 'anything', 'speak'],
 ['best',
  'scene',
  'movie',
  'gerardo',
  'trying',
  'find',
  'song',
  'keep',
  'running',
  'head'],
 ['rest',
  'movie',
  'lack',
  'art',
  'charm',
  'meaning',
  'emptiness',
  'work',
  'guess',
  'empty'],
 ['wasted', 'two', 'hour'],
 ['saw',
  'movie',
  'today',
  'thought',
  'good',
  'effort',
  'good',
  'message',
  'kid'],
 ['bit', 'predictable'],
 ['loved', 'casting', 'jimmy', 'buffet', 'science', 'teacher'],
 ['baby', 'owl', 'adorable'],
 ['movie', 'showed', 'lot', 'florida', 'best', 'made', 'look', 'appealing'],
 [

In [None]:
import gensim

In [None]:
## train word2vwc from scratch
model= gensim.models.Word2Vec(words,window=5, min_count=2)

In [None]:
model.wv.index_to_key

['movie',
 'film',
 'one',
 'bad',
 'character',
 'good',
 'like',
 'time',
 'acting',
 'really',
 'great',
 'even',
 'see',
 'well',
 'scene',
 'story',
 'make',
 'ever',
 'actor',
 'plot',
 'made',
 'script',
 'also',
 'work',
 'way',
 'best',
 'thing',
 'would',
 'seen',
 'love',
 'watching',
 'look',
 'think',
 'could',
 'real',
 'much',
 'get',
 'show',
 'every',
 'line',
 'year',
 'funny',
 'wonderful',
 'better',
 'cast',
 'never',
 'performance',
 'give',
 'watch',
 'little',
 'go',
 'everything',
 'excellent',
 'part',
 'know',
 'anyone',
 'totally',
 'thought',
 'nothing',
 'say',
 'awful',
 'people',
 'still',
 'art',
 'director',
 'many',
 'stupid',
 'waste',
 'music',
 'life',
 'screen',
 'writing',
 'pretty',
 'worth',
 'dialogue',
 'right',
 'man',
 'recommend',
 'two',
 'lot',
 'terrible',
 'minute',
 'play',
 'interesting',
 'saw',
 'piece',
 'end',
 'game',
 'first',
 'worse',
 'though',
 'enough',
 'short',
 'find',
 'worst',
 'job',
 'feeling',
 'beautiful',
 'endin

In [None]:
model.corpus_count

746

In [None]:
model.epochs

5

In [None]:
model.wv.similar_by_word('movie')

[('film', 0.72734534740448),
 ('really', 0.6785823702812195),
 ('make', 0.6718945503234863),
 ('character', 0.670469343662262),
 ('one', 0.6651437878608704),
 ('watching', 0.6599121689796448),
 ('even', 0.650736927986145),
 ('like', 0.6369838118553162),
 ('good', 0.6299586892127991),
 ('real', 0.6283659934997559)]

In [None]:
model.wv['movie'].shape

(100,)

In [None]:
def avg_word2vec(doc):
  # remove out-of-vocabulary words
  #sent = [word for word in doc if word in model.wv.index_to_key]
  #print(sent)
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
  #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [None]:
!pip install tqdm



In [None]:
from tqdm import tqdm

In [None]:
words[1]

['sure', 'lost', 'flat', 'character', 'audience', 'nearly', 'half', 'walked']

In [None]:
type(model.wv.index_to_key)

list

In [None]:
import numpy as np

In [None]:
#apply for entire sentences
x=[]
for i in tqdm(range(len(words))):
  print("Hello",i)
  x.append(avg_word2vec(words[i]))

 14%|█▍        | 106/746 [00:00<00:00, 1023.31it/s]

Hello 0
Hello 1
Hello 2
Hello 3
Hello 4
Hello 5
Hello 6
Hello 7
Hello 8
Hello 9
Hello 10
Hello 11
Hello 12
Hello 13
Hello 14
Hello 15
Hello 16
Hello 17
Hello 18
Hello 19
Hello 20
Hello 21
Hello 22
Hello 23
Hello 24
Hello 25
Hello 26
Hello 27
Hello 28
Hello 29
Hello 30
Hello 31
Hello 32
Hello 33
Hello 34
Hello 35
Hello 36
Hello 37
Hello 38
Hello 39
Hello 40
Hello 41
Hello 42
Hello 43
Hello 44
Hello 45
Hello 46
Hello 47
Hello 48
Hello 49
Hello 50
Hello 51
Hello 52
Hello 53
Hello 54
Hello 55
Hello 56
Hello 57
Hello 58
Hello 59
Hello 60
Hello 61
Hello 62
Hello 63
Hello 64
Hello 65
Hello 66
Hello 67
Hello 68
Hello 69
Hello 70
Hello 71
Hello 72
Hello 73
Hello 74
Hello 75
Hello 76
Hello 77
Hello 78
Hello 79
Hello 80
Hello 81
Hello 82
Hello 83
Hello 84
Hello 85
Hello 86
Hello 87
Hello 88
Hello 89
Hello 90
Hello 91
Hello 92
Hello 93
Hello 94
Hello 95
Hello 96
Hello 97
Hello 98
Hello 99
Hello 100
Hello 101
Hello 102
Hello 103
Hello 104
Hello 105
Hello 106
Hello 107
Hello 108
Hello 109
Hello 110


 42%|████▏     | 317/746 [00:00<00:00, 1032.89it/s]

 200
Hello 201
Hello 202
Hello 203
Hello 204
Hello 205
Hello 206
Hello 207
Hello 208
Hello 209
Hello 210
Hello 211
Hello 212
Hello 213
Hello 214
Hello 215
Hello 216
Hello 217
Hello 218
Hello 219
Hello 220
Hello 221
Hello 222
Hello 223
Hello 224
Hello 225
Hello 226
Hello 227
Hello 228
Hello 229
Hello 230
Hello 231
Hello 232
Hello 233
Hello 234
Hello 235
Hello 236
Hello 237
Hello 238
Hello 239
Hello 240
Hello 241
Hello 242
Hello 243
Hello 244
Hello 245
Hello 246
Hello 247
Hello 248
Hello 249
Hello 250
Hello 251
Hello 252
Hello 253
Hello 254
Hello 255
Hello 256
Hello 257
Hello 258
Hello 259
Hello 260
Hello 261
Hello 262
Hello 263
Hello 264
Hello 265
Hello 266
Hello 267
Hello 268
Hello 269
Hello 270
Hello 271
Hello 272
Hello 273
Hello 274
Hello 275
Hello 276
Hello 277
Hello 278
Hello 279
Hello 280
Hello 281
Hello 282
Hello 283
Hello 284
Hello 285
Hello 286
Hello 287
Hello 288
Hello 289
Hello 290
Hello 291
Hello 292
Hello 293
Hello 294
Hello 295
Hello 296
Hello 297
Hello 298
Hello 299
Hello

 72%|███████▏  | 534/746 [00:00<00:00, 1043.56it/s]

Hello 422
Hello 423
Hello 424
Hello 425
Hello 426
Hello 427
Hello 428
Hello 429
Hello 430
Hello 431
Hello 432
Hello 433
Hello 434
Hello 435
Hello 436
Hello 437
Hello 438
Hello 439
Hello 440
Hello 441
Hello 442
Hello 443
Hello 444
Hello 445
Hello 446
Hello 447
Hello 448
Hello 449
Hello 450
Hello 451
Hello 452
Hello 453
Hello 454
Hello 455
Hello 456
Hello 457
Hello 458
Hello 459
Hello 460
Hello 461
Hello 462
Hello 463
Hello 464
Hello 465
Hello 466
Hello 467
Hello 468
Hello 469
Hello 470
Hello 471
Hello 472
Hello 473
Hello 474
Hello 475
Hello 476
Hello 477
Hello 478
Hello 479
Hello 480
Hello 481
Hello 482
Hello 483
Hello 484
Hello 485
Hello 486
Hello 487
Hello 488
Hello 489
Hello 490
Hello 491
Hello 492
Hello 493
Hello 494
Hello 495
Hello 496
Hello 497
Hello 498
Hello 499
Hello 500
Hello 501
Hello 502
Hello 503
Hello 504
Hello 505
Hello 506
Hello 507
Hello 508
Hello 509
Hello 510
Hello 511
Hello 512
Hello 513
Hello 514
Hello 515
Hello 516
Hello 517
Hello 518
Hello 519
Hello 520
Hello 521


100%|██████████| 746/746 [00:00<00:00, 1031.56it/s]

Hello 627
Hello 628
Hello 629
Hello 630
Hello 631
Hello 632
Hello 633
Hello 634
Hello 635
Hello 636
Hello 637
Hello 638
Hello 639
Hello 640
Hello 641
Hello 642
Hello 643
Hello 644
Hello 645
Hello 646
Hello 647
Hello 648
Hello 649
Hello 650
Hello 651
Hello 652
Hello 653
Hello 654
Hello 655
Hello 656
Hello 657
Hello 658
Hello 659
Hello 660
Hello 661
Hello 662
Hello 663
Hello 664
Hello 665
Hello 666
Hello 667
Hello 668
Hello 669
Hello 670
Hello 671
Hello 672
Hello 673
Hello 674
Hello 675
Hello 676
Hello 677
Hello 678
Hello 679
Hello 680
Hello 681
Hello 682
Hello 683
Hello 684
Hello 685
Hello 686
Hello 687
Hello 688
Hello 689
Hello 690
Hello 691
Hello 692
Hello 693
Hello 694
Hello 695
Hello 696
Hello 697
Hello 698
Hello 699
Hello 700
Hello 701
Hello 702
Hello 703
Hello 704
Hello 705
Hello 706
Hello 707
Hello 708
Hello 709
Hello 710
Hello 711
Hello 712
Hello 713
Hello 714
Hello 715
Hello 716
Hello 717
Hello 718
Hello 719
Hello 720
Hello 721
Hello 722
Hello 723
Hello 724
Hello 725
Hello 726





In [None]:
type(X)

numpy.ndarray

In [None]:
X_new=np.array(X)

In [None]:
X_new[3]

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
X_new.shape

(748, 2000)