In [8]:
# import pandas to read data frames
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# for sql queries
from pyspark.sql import SparkSession
from pyspark.sql import Row
import types
from pyspark.sql.types import *
from pyspark import SparkContext 
sc = SparkContext.getOrCreate() 
spark = SparkSession.builder.getOrCreate()

# for NLP
import re #regular expressions
import nltk 
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = stopwords.words('english')
import string

# for train test split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package omw-1.4 to /home/pe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# read data file
movie_df = pd.read_csv('./data/imdb-dataset.csv')
movie_df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [4]:
# remove duplicates
movie_df['dup'] = movie_df.duplicated(subset=None, keep='first')
del movie_df['dup']

In [5]:
# Applying SQL operations to create data frame
classNameContent = StructType([StructField("review", StringType(), True),
                            StructField("sentiment",  StringType(), True)])
FinalDataSet = spark.createDataFrame(movie_df, classNameContent)
FinalDataSet.createTempView("MovieReviews")

In [6]:
# Check for class balanced nature
print("Total number of Reviews: " + str(FinalDataSet.count()) )
spark.sql(
    "select sentiment, count(sentiment) as count " +
    "from MovieReviews " +
    "group by sentiment "
    "order by sentiment limit 20" ).show()

[Stage 0:>                                                          (0 + 1) / 8]

22/12/16 12:43:05 WARN TaskSetManager: Stage 0 contains a task of very large size (8022 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

Total number of Reviews: 50000
22/12/16 12:43:13 WARN TaskSetManager: Stage 3 contains a task of very large size (8022 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+---------+-----+
|sentiment|count|
+---------+-----+
| negative|25000|
| positive|25000|
+---------+-----+



In [9]:
# apply pre processing 
sw = stopwords.words('english') # call stopwords from nltk
lemmatizer = WordNetLemmatizer() # call Lemmatisation from nltk

# get a customised stopwords list
stop_words_file = './data/SmartStoplist.txt' 
stop_words = []
with open(stop_words_file, "r") as f:
    for line in f:
        stop_words.extend(line.split())      
stop_words = stop_words  

# defining the preprocessing function
def preprocess(text):
    
    text = text.lower() #to convert into lowercase
    
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs 
    
    html=re.compile(r'<.*?>') 
    
    text = html.sub(r'',text) #Removing html tags
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^,' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations
        
    text = [word.lower() for word in text.split() if word.lower() not in sw] #removing stopwords
    
    
    text = [lemmatizer.lemmatize(word) for word in text if lemmatizer.lemmatize(word) not in stop_words]
    text = " ".join(text) #Lemmatisation 
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    text = emoji_pattern.sub(r'', text) #Removing emojis
    
    return text

In [11]:
# Apply preprocessing to review column
movie_df['prep'] = movie_df['review'].apply(lambda x: preprocess(x)) 
del movie_df['review'] # remove review column
movie_df.head(5)

Unnamed: 0,sentiment,prep
0,positive,reviewer mentioned watching oz episode hooked ...
1,positive,wonderful production filming technique unassum...
2,positive,thought wonderful spend time hot summer weeken...
3,negative,basically family boy jake zombie closet parent...
4,positive,petter mattei love time money visually stunnin...


In [12]:
# sql query to store the pre preprosessed data 
FinalDataSet=spark.createDataFrame(movie_df) 
FinalDataSet.printSchema()
FinalDataSet.show(2)

root
 |-- sentiment: string (nullable = true)
 |-- prep: string (nullable = true)

22/12/16 12:55:42 WARN TaskSetManager: Stage 6 contains a task of very large size (4066 KiB). The maximum recommended task size is 1000 KiB.
+---------+--------------------+
|sentiment|                prep|
+---------+--------------------+
| positive|reviewer mentione...|
| positive|wonderful product...|
+---------+--------------------+
only showing top 2 rows



In [13]:
# Convert sentiment into binary values
classes=["negative", "positive"]
classIx=[0,1]
classLookupMap=dict(zip(classes,classIx))

In [14]:
# Tokenize the content and convert the sentiment to a number
# Convert content to array of words
AllTokens_df = FinalDataSet.rdd.map(lambda text: Row(sentiment=classLookupMap[text[0]],prep=re.findall(r"[\w']+" ,text[1].lower())) ).toDF()

AllTokens_df.registerTempTable("allTokens")
AllTokens_df.printSchema()
AllTokens_df.show(2)

22/12/16 12:56:06 WARN TaskSetManager: Stage 7 contains a task of very large size (4066 KiB). The maximum recommended task size is 1000 KiB.
root
 |-- sentiment: long (nullable = true)
 |-- prep: array (nullable = true)
 |    |-- element: string (containsNull = true)

22/12/16 12:56:07 WARN TaskSetManager: Stage 8 contains a task of very large size (4066 KiB). The maximum recommended task size is 1000 KiB.
+---------+--------------------+
|sentiment|                prep|
+---------+--------------------+
|        1|[reviewer, mentio...|
|        1|[wonderful, produ...|
+---------+--------------------+
only showing top 2 rows



In [15]:
X_train, X_test , y_train, y_test = train_test_split(movie_df['prep'].values,
                                                     movie_df['sentiment'].values,test_size=0.2,
                                                     random_state=42,stratify=movie_df['sentiment'].values)



In [17]:
print(len(X_train))
print(len(X_test))

40000
10000
