In [13]:
import gensim
import pandas as pd

##LOADING THE DATA

In [28]:
data_path = r'C:\Desktop\Full stack Data Science with AI\NLP\NoteBooks\data\unlabeledTrainData.tsv.zip'

In [29]:
df = pd.read_csv(data_path, header=0, delimiter='\t', quoting = 3)

In [30]:
df.shape

(50000, 2)

In [31]:
df.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [32]:
df.to_csv('movie_review.csv')

In [34]:
df.loc[0, 'review']

'"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film\'s release. Life\'s like that."'

##CLEANING THE DATA

In [35]:
import re, string

In [39]:
def clean_data(str):
    try:
        string = re.sub(r'^https?:\/\/<>.*[\r\n]*','', str, flags = re.MULTILINE)
        string = re.sub(r'[^A-Za-z]', ' ', str)
        words = string.strip().lower().split()
        words = [w for w in words if len(w)>=1]
        return ' '.join(words)
    except:
        return ' '


In [40]:
df['clean_reviews'] = df['review'].apply(clean_data)

In [41]:
df.head(5)

Unnamed: 0,id,review,clean_reviews
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


##CONVERT REVIEW INTO WORDS

In [42]:
documents = []

for doc in df['clean_reviews']:
    documents.append(doc.split(' '))

In [43]:
len(documents)

50000

In [45]:
print(documents[1])

['i', 'saw', 'this', 'film', 'about', 'years', 'ago', 'and', 'remember', 'it', 'as', 'being', 'particularly', 'nasty', 'i', 'believe', 'it', 'is', 'based', 'on', 'a', 'true', 'incident', 'a', 'young', 'man', 'breaks', 'into', 'a', 'nurses', 'home', 'and', 'rapes', 'tortures', 'and', 'kills', 'various', 'women', 'br', 'br', 'it', 'is', 'in', 'black', 'and', 'white', 'but', 'saves', 'the', 'colour', 'for', 'one', 'shocking', 'shot', 'br', 'br', 'at', 'the', 'end', 'the', 'film', 'seems', 'to', 'be', 'trying', 'to', 'make', 'some', 'political', 'statement', 'but', 'it', 'just', 'comes', 'across', 'as', 'confused', 'and', 'obscene', 'br', 'br', 'avoid']


##BUILD THE MODEL

In [None]:
model = gensim.models.Word2Vec(documents, #List of words
                               min_count=10, #ignore all words whose frequency is lower than 10
                               workers=4, #number of CPU cores
                               vector_size=50, #Embedding size
                               window=5, #number of neighbours on the left and right
                               epochs=10 #number of iteration over the text corpus
                               #CORPUS IS A BIG COLLECTION OF TEXT DATA
                               )

In [48]:
#MODEL SIZE

model.wv.vectors.shape   
# wv: word vectors : learns vector representtaion of words model
# vectors: Numpy array containing all the word vectors 

(28322, 50)

In [52]:
vocabulary = model.wv.index_to_key #LIST OF ALL WORDS IN THE Word2Vec MODEL which are ORDERED BY 
                                   #THEIR FREQUENCY 

In [53]:
vocabulary

['the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'br',
 'it',
 'in',
 'i',
 'this',
 'that',
 's',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'you',
 't',
 'on',
 'not',
 'he',
 'are',
 'his',
 'have',
 'be',
 'one',
 'all',
 'they',
 'at',
 'by',
 'who',
 'an',
 'from',
 'so',
 'like',
 'there',
 'or',
 'her',
 'just',
 'about',
 'out',
 'has',
 'if',
 'what',
 'some',
 'good',
 'can',
 'when',
 'more',
 'very',
 'she',
 'up',
 'no',
 'time',
 'even',
 'would',
 'their',
 'my',
 'which',
 'story',
 'only',
 'really',
 'see',
 'had',
 'were',
 'well',
 'me',
 'we',
 'than',
 'much',
 'bad',
 'been',
 'get',
 'people',
 'great',
 'into',
 'also',
 'do',
 'other',
 'first',
 'will',
 'him',
 'because',
 'most',
 'how',
 'don',
 'them',
 'make',
 'its',
 'made',
 'way',
 'could',
 'then',
 'too',
 'movies',
 'after',
 'any',
 'characters',
 'character',
 'films',
 'two',
 'think',
 'watch',
 'being',
 'plot',
 'many',
 'where',
 'never',
 'love',
 'seen',
 'little',
 'life',
 

In [55]:
model.wv['music'] #GIVES ALL THE VECTORS FOR WORD 'music'

array([-4.73219   ,  2.408087  ,  3.863363  , -4.1159096 ,  0.3741695 ,
        1.3023546 , -3.0225446 ,  1.1581209 , -1.3058033 , -2.671414  ,
       -1.0560062 ,  3.530641  ,  0.9945698 ,  3.7788906 ,  0.9222925 ,
       -0.26476088,  0.85745746,  2.0007896 , -2.95026   ,  2.3976102 ,
       -1.8307663 ,  1.3512079 ,  1.2442651 ,  2.2882664 ,  0.14494322,
       -0.4977762 ,  0.0840763 , -5.116792  ,  0.876437  , -0.6280354 ,
        0.38382345, -0.23917614, -2.3825047 , -1.2305008 ,  1.5575497 ,
       -0.37103546, -0.2017486 , -0.32215074, -0.753667  ,  1.7860441 ,
       -1.413256  , -1.3590682 ,  1.8309535 , -0.633103  ,  0.7318853 ,
        1.6824262 ,  0.8569138 ,  1.6107008 ,  3.5993202 ,  1.1400458 ],
      dtype=float32)

### TESTING THE MODEL....FINDING SIMILAR WORDS

In [56]:
model.wv.most_similar('great', topn=15)

[('fantastic', 0.8861823081970215),
 ('terrific', 0.8738081455230713),
 ('wonderful', 0.8710220456123352),
 ('fine', 0.8604158759117126),
 ('good', 0.8274661898612976),
 ('superb', 0.8033298254013062),
 ('brilliant', 0.8022701144218445),
 ('perfect', 0.7630297541618347),
 ('marvelous', 0.7509307265281677),
 ('nice', 0.7482746839523315),
 ('amazing', 0.7373433709144592),
 ('fabulous', 0.73481285572052),
 ('remarkable', 0.7243136167526245),
 ('solid', 0.7085562348365784),
 ('outstanding', 0.7070918679237366)]

### SAVE THE MODEL

In [65]:
from gensim.models import Word2Vec

In [72]:
# Save the model to a file
model.save(r'C:\Desktop\Full stack Data Science with AI\NLP\Practice\Word2Vec_Gensim(movie_data)\word2vec-movie-50.model')


### LOAD THE MODEL TO CHECK FOR NEW TEXT DATA

In [74]:
model1 = gensim.models.Word2Vec.load('word2vec-movie-50.model')

In [75]:
model1.wv.most_similar(positive=['king','man'], negative=['queen'])

[('scientist', 0.5881584882736206),
 ('batman', 0.5715816020965576),
 ('prophecy', 0.5678317546844482),
 ('aladdin', 0.5664018392562866),
 ('nemesis', 0.5570748448371887),
 ('joker', 0.5470787882804871),
 ('buio', 0.5334281921386719),
 ('gunman', 0.524756133556366),
 ('soldier', 0.5150951147079468),
 ('superhero', 0.5109474658966064)]

In [76]:
model.wv.most_similar(positive=['woman', 'prince'], negative=['man'])


[('princess', 0.7759032845497131),
 ('aunt', 0.7047412991523743),
 ('aurora', 0.6991126537322998),
 ('clare', 0.6924163103103638),
 ('belle', 0.6874961853027344),
 ('sara', 0.6823597550392151),
 ('daughter', 0.6788092851638794),
 ('maid', 0.6734063029289246),
 ('widow', 0.6729910969734192),
 ('kaurwaki', 0.6723647117614746)]