In [None]:
import pymongo
import pandas as pd
import re
import numpy as np
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import time

[nltk_data] Downloading package punkt to /home/ds/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load every document of the 'reviews' collection (database 'project') into a
# DataFrame. The client is used as a context manager so the connection is
# closed even when the query or the DataFrame construction raises; the
# original explicit close() was skipped on any error.
with pymongo.MongoClient() as mongo:
    # Select the database
    mongo_db = mongo.project
    # Materialise the cursor while the connection is still open. The
    # projection {'_id': False} keeps Mongo's '_id' field out of the result.
    data = pd.DataFrame(list(mongo_db.reviews.find({}, {'_id': False})))
print(data.shape)

(300294, 11)


In [None]:
# Preview the first five rows of the loaded reviews
data.head(n=5)

Unnamed: 0,movie,rating,genre,review_date,review_detail,review_id,review_summary,reviewer,spoiler_tag,helpful_upvotes,helpful_total
0,Satanic (2016),7,-,2 December 2017,I looked it up on IMDb after watching and was ...,rw3872552,An entertaining indie thriller film,bigblackclocks,0,2,4
1,Satanic (2016),4,-,23 August 2018,This wasn't anything really special. It had a ...,rw4299803,Just Ok,Foutainoflife,0,0,0
2,Satanic (2016),2,-,29 October 2017,I love horror movies & I was drawn to this mov...,rw3844666,Same but different...,samanthawhite,1,0,1
3,Satanic (2016),1,-,22 October 2017,I created an account here just for this movie....,rw3838706,I knew this would happen,edbthree,0,1,4
4,Satanic (2016),6,-,24 September 2016,Seeing the ratings on IMDb I wondered why the ...,rw3550183,Not a bad movie and a bit different,mjsreg,0,26,46


In [None]:
def remove_punctuation_and_numbers(text):
    """Return ``text`` with punctuation, underscores and digits removed.

    Whitespace and letters are kept untouched; removed characters are
    deleted outright (not replaced by a space), so "wasn't" becomes "wasnt".

    Parameters
    ----------
    text : str
        The raw text to clean. Any non-string value (for example ``None`` or
        a float NaN coming from a missing field) is treated as empty text.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Missing values would make re.sub raise TypeError; map them to empty text.
    if not isinstance(text, str):
        return ''
    # `\w` matches '_' and digits, so `[^\w\s]` alone leaves both behind;
    # they are listed explicitly so one pass removes punctuation,
    # underscores and numbers.
    return re.sub(r'[^\w\s]|[\d_]+', '', text)

# Prefix each review body with its movie title, clean the combined text and
# store the result in a new 'review' column.
titled_reviews = data['movie'] + " : " + data['review_detail']
data['review'] = titled_reviews.map(remove_punctuation_and_numbers)

data.head()

Unnamed: 0,movie,rating,genre,review_date,review_detail,review_id,review_summary,reviewer,spoiler_tag,helpful_upvotes,helpful_total,review
0,Satanic (2016),7,-,2 December 2017,I looked it up on IMDb after watching and was ...,rw3872552,An entertaining indie thriller film,bigblackclocks,0,2,4,Satanic I looked it up on IMDb after watchin...
1,Satanic (2016),4,-,23 August 2018,This wasn't anything really special. It had a ...,rw4299803,Just Ok,Foutainoflife,0,0,0,Satanic This wasnt anything really special I...
2,Satanic (2016),2,-,29 October 2017,I love horror movies & I was drawn to this mov...,rw3844666,Same but different...,samanthawhite,1,0,1,Satanic I love horror movies I was drawn to...
3,Satanic (2016),1,-,22 October 2017,I created an account here just for this movie....,rw3838706,I knew this would happen,edbthree,0,1,4,Satanic I created an account here just for t...
4,Satanic (2016),6,-,24 September 2016,Seeing the ratings on IMDb I wondered why the ...,rw3550183,Not a bad movie and a bit different,mjsreg,0,26,46,Satanic Seeing the ratings on IMDb I wondere...


In [None]:
# Preprocess text data: lower-case each cleaned review and tokenize it into
# individual words (one list of tokens per review).
preprocessed_reviews = data['review'].apply(lambda x: word_tokenize(x.lower()))

# Only the training step is timed. perf_counter() is a monotonic clock, so the
# measured interval cannot be distorted by system clock adjustments the way a
# time.time() difference can.
start_time = time.perf_counter()
# Train Word2Vec model on preprocessed reviews: 100-dimensional vectors, a
# context window of 5 tokens, min_count=1 (no token is dropped for being
# rare) and 4 worker threads.
model = Word2Vec(preprocessed_reviews, vector_size=100, window=5, min_count=1, workers=4)

elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 179.16817426681519 seconds


In [None]:

# Get the vocabulary of the Word2Vec model as a set for fast membership tests
vocabulary = set(model.wv.index_to_key)

# Calculate the average vector for each review. Rows are pre-filled with
# zeros, so a review without any in-vocabulary token keeps an all-zero vector.
review_embeddings = np.zeros((len(preprocessed_reviews), model.vector_size))
# perf_counter() is monotonic, so the elapsed time is not affected by system
# clock adjustments (unlike a time.time() difference).
start_time = time.perf_counter()
for i, review in enumerate(preprocessed_reviews):
    # Look up the vector of every token the model knows about
    word_embeddings = [model.wv[word] for word in review if word in vocabulary]
    if word_embeddings:
        # Element-wise mean over the review's word vectors
        review_embeddings[i] = np.mean(word_embeddings, axis=0)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

# Pair each review's rating with its embedding; every 'feature_vector' cell
# holds one plain Python list of floats.
features = pd.DataFrame()
features['rating']=data['rating']
features['feature_vector'] = review_embeddings.tolist()


features.head()

Elapsed time: 106.81399464607239 seconds


Unnamed: 0,rating,feature_vector
0,7,"[1.0313384532928467, -0.5310494899749756, -0.1..."
1,4,"[0.8301123380661011, -0.3302663564682007, -0.3..."
2,2,"[0.5120816230773926, -0.13391651213169098, -0...."
3,1,"[0.5051181316375732, -0.43809765577316284, -0...."
4,6,"[0.8228296637535095, -0.28127551078796387, -0...."


In [None]:
# Write the rating / feature_vector table to disk without the index column
features.to_csv(path_or_buf='features.csv', index=False)