In [1]:
import pandas as pd
import numpy as np
import re
import random
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

!pip install spacytextblob
!pip install spacy
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("spacytextblob")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




<spacytextblob.spacytextblob.SpacyTextBlob at 0x78a14c53a9b0>

In [2]:
# Read data from imdb
df_imdb = pd.read_csv('imdb_review.csv')

df_imdb["rating"] = pd.to_numeric(df_imdb["rating"], errors='coerce')
df_imdb["sentiment"] = np.where(df_imdb["rating"] >= 8, 'positive', 'negative')
df_imdb = df_imdb.sample(frac=1, random_state=1).reset_index(drop=True)
df_imdb.sentiment = [1 if s == 'positive' else 0 for s in df_imdb.sentiment]

# Read data from rottentomatoes
df_rt = pd.read_csv('rottentomatoes_reviews.csv')

df_rt["rating"] = pd.to_numeric(df_rt["rating"], errors='coerce')
df_rt = df_rt[df_rt["rating"] != 3]
df_rt['rating'] = df_rt['rating'].apply(lambda x: x*2)
df_rt["sentiment"] = np.where(df_rt["rating"] >= 8, 'positive', 'negative')
df_rt = df_rt.sample(frac=1, random_state=1).reset_index(drop=True)
df_rt.sentiment = [1 if s == 'positive' else 0 for s in df_rt.sentiment]

# Combine data from imdb and rottentomatoes
df = pd.concat([df_imdb, df_rt], ignore_index=True)
df = df.sample(frac=1, random_state=1).reset_index(drop=True)
df.to_csv('reviews.csv', index=False)
print(df.head())

   rating                                             review  \
0      10  I had wanted to see this film from the moment ...   
1      10  If you call common sense, and a human heart yo...   
2      10  George Miller's masterpiece Mad Max:Fury Road ...   
3      10  One of the early "historical epics" to set off...   
4      10  Music is banned in Miguel's household after hi...   

                    movie_name  sentiment  
0             Khách Sạn Rwanda          1  
1              Oldboy: Báo Thù          1  
2  Max Điên: Con Đường Tử Thần          1  
3            Trái Tim Dũng Cảm          1  
4        Coco: Hội Ngộ Diệu Kỳ          1  


In [3]:
# Read details data from imdb and rottentomatoes
df_details_imdb = pd.read_csv('imdb_details.csv')
df_details_rt = pd.read_csv('rottentomatoes_details.csv')
df_details = pd.concat([df_details_imdb, df_details_rt], ignore_index=True)
df_details.head()

df_details.to_csv('details_data.csv')

In [4]:
# Data preprocessing
#Remove stopwords using nltk
stop_words = set(stopwords.words("english"))
def remove_stopwords(s):
    words = word_tokenize(s)
    lst = [word for word in words if word not in stop_words]
    return ' '.join(lst)

def lemmatization(text):
    token = nlp(text)
    text = [word.lemma_ for word in token]
    return ' '.join(text)

def transform(s):
    s = s.lower().strip()
    s = re.sub('http\S+', ' ', s)
    s = re.sub('[,\.!?:()"]', '', s)
    s = re.sub('<.*?>', ' ', s)
    s = re.sub('[^a-zA-Z0-9]', ' ', s)
    s = re.sub('\s+', ' ', s)
    s = lemmatization(s)
    s = remove_stopwords(s)
    return s

In [5]:
df['review'] = df['review'].apply(transform)

In [6]:
df.head()

Unnamed: 0,rating,review,movie_name,sentiment
0,10,I want see film moment I see snippet bbc ameri...,Khách Sạn Rwanda,1
1,10,call common sense human heart like challenge m...,Oldboy: Báo Thù,1
2,10,george miller masterpiece mad maxfury road def...,Max Điên: Con Đường Tử Thần,1
3,10,one early historical epic set still current wa...,Trái Tim Dũng Cảm,1
4,10,music ban miguel household musical great grand...,Coco: Hội Ngộ Diệu Kỳ,1


In [7]:
df.to_csv('reviews_data.csv')