In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os
# Tell findspark/PySpark where the JVM and the unpacked Spark distribution are.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Resolve the Spark directory to an absolute path now, while the working
# directory is still the one the archive was extracted into; a relative
# SPARK_HOME would stop resolving if the working directory changes later.
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.2.0-bin-hadoop3.2")

In [3]:
import findspark

# Make the pyspark package importable. Passing the SPARK_HOME value explicitly
# is what init() would otherwise look up first on its own; when the variable
# is unset this passes None and findspark falls back to its usual search.
findspark.init(spark_home=os.environ.get("SPARK_HOME"))

In [4]:
from pyspark.sql import SparkSession

# Build the session once, with both the master and the application name.
# Calling getOrCreate() on a second builder returns the session that is
# already running, so an appName supplied afterwards never reaches the
# SparkContext.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("LargeDataAnalytics")
    .getOrCreate()
)
sc = spark.sparkContext
# Last expression of the cell: displays the SparkContext summary.
sc

In [5]:
from google.colab import drive
# Mount Google Drive at /content/drive; the cells below read the review
# corpus and the word lists from paths under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pyspark.sql
from pyspark.sql import Row
from pyspark.sql.types import *
import json
from pyspark.sql.functions import udf

In [11]:
from nltk.util import parallelize_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# The stop-word corpus has to be on disk before the word list can be read.
nltk.download('stopwords')
# Rebinds the name `stopwords` from the imported corpus reader to a plain
# set of English stop words, which is what the tokenizing cell tests against.
stopwords = {*stopwords.words('english')}
# Second resource downloaded by this cell: the 'punkt' tokenizer package.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
# wholeTextFiles yields one (file URI, full file content) pair per review file.
data=sc.wholeTextFiles('/content/drive/MyDrive/movie_reviews')
# Preview a single record. collect() would pull every review into the driver
# and print the entire corpus into the notebook output.
data.take(1)

[('file:/content/drive/MyDrive/movie_reviews/cv499_11407.txt',
  'the best thing -- in fact , the only good thing -- i can say about dark city is that it made me want to go see l . a . confidential again . \nor go rent body heat , to see william hurt do some real noir . \nor even -- god help me -- palmetto . \nalex proyas\'s new movie screams atmosphere . \ni screamed , too , " get me out of this theater ! " \nnew line cinema spent millions of dollars creating the expressionistic film noir visuals and paying the actors , and doesn\'t have any way to get it back . \ni paid $4 . 25 for a matinee screening and don\'t have any way to get my money back , either . \nyou\'ll hear a lot from other critics about the look and feel of this movie . \ni will admit that the cinematographers and costume designers and set artists and cgi graphics geeks all worked hard to create a stunning , nightmarish future world where it\'s always a late night in 1948 , complete with rotary phones and automats and 

In [16]:
from string import punctuation

def _to_name_and_tokens(path_and_text):
    """Turn one (file URI, review text) pair into (file name, filtered tokens).

    The file name is the last '/'-separated segment of the URI, so it stays
    correct if the corpus directory is moved or renamed (a fixed character
    offset only matches one exact directory path).

    Tokens come from word_tokenize; a token is dropped when its lower-cased
    form is a stop word or when it occurs in string.punctuation. Note that
    `punctuation` is a str, so that second test is a substring test:
    single punctuation characters are removed while multi-character tokens
    such as '--' are kept.
    """
    path, text = path_and_text
    file_name = path.rsplit('/', 1)[-1]
    tokens = [
        word for word in word_tokenize(text)
        if word.lower() not in stopwords and word not in punctuation
    ]
    return (file_name, tokens)


file_data=data.map(_to_name_and_tokens)
movie_table=spark.createDataFrame(file_data,['File_name','Review'])
movie_table.show()

+---------------+--------------------+
|      File_name|              Review|
+---------------+--------------------+
|cv499_11407.txt|[best, thing, --,...|
|cv500_10251.txt|[losing, job, unc...|
|cv500_10722.txt|[always, careful,...|
|cv501_11657.txt|[much, wanted, li...|
|cv501_12675.txt|[synopsis, easily...|
|cv502_10406.txt|[postman, deliver...|
|cv503_10558.txt|[certain, people,...|
|cv502_10970.txt|[blues, brothers,...|
|cv503_11196.txt|[michael, crichto...|
|cv504_29120.txt|[american, pie, 2...|
|cv504_29243.txt|[guess, 's, credi...|
|cv505_12090.txt|[critics, includi...|
|cv505_12926.txt|[well, going, exp...|
|cv506_15956.txt|[much, ballyhoo, ...|
|cv506_17521.txt|[film, extraordin...|
| cv507_9220.txt|[capsule, side-sp...|
|cv508_16006.txt|[maybe, important...|
| cv507_9509.txt|[midway, ``, anac...|
|cv508_17742.txt|[70, 's, nostalgi...|
|cv509_15888.txt|[write, review, n...|
+---------------+--------------------+
only showing top 20 rows



In [17]:
# Load the positive / negative word lists (one entry per line of each file).
# They are kept as sets because the only use below is a membership test per
# review token; a list would be scanned linearly for every token.
positive_words=set(sc.textFile('/content/drive/MyDrive/pos.txt').collect())
negative_words=set(sc.textFile('/content/drive/MyDrive/neg.txt').collect())

In [23]:
def Sentiment(review, positive=None, negative=None):
  """Label a tokenized review by counting lexicon hits.

  Args:
    review: iterable of token strings; None or an empty sequence is allowed.
    positive: optional collection of positive words. Defaults to the
      notebook-level `positive_words`.
    negative: optional collection of negative words. Defaults to the
      notebook-level `negative_words`.

  Returns:
    'Positive' when more tokens are found in the positive collection than in
    the negative one, 'Negative' for the reverse, and 'Neutral' on a tie
    (which includes a null or empty review).
  """
  # A null or empty review has no hits on either side. Returning early also
  # keeps a null column value from raising when this runs as a Spark UDF.
  if not review:
    return 'Neutral'
  positive = positive_words if positive is None else positive
  negative = negative_words if negative is None else negative
  # Build each lookup set once per review so every token test is a hash
  # lookup instead of a linear scan of a list.
  if not isinstance(positive, (set, frozenset)):
    positive = frozenset(positive)
  if not isinstance(negative, (set, frozenset)):
    negative = frozenset(negative)
  # A token present in both collections counts toward both totals.
  pos = sum(1 for word in review if word in positive)
  neg = sum(1 for word in review if word in negative)
  if pos > neg:
    return 'Positive'
  if neg > pos:
    return 'Negative'
  return 'Neutral'

In [29]:
# Wrap the Python function as a Spark UDF (udf() returns strings by default).
sentiment=udf(Sentiment)
# Cache the labelled table: it is reused by several actions below, and without
# caching each one re-reads the files, re-tokenizes them and re-runs the UDF.
sentiment_table=movie_table.withColumn('Sentiment',sentiment(movie_table['review'])).cache()
sentiment_table.show()

+---------------+--------------------+---------+
|      File_name|              Review|Sentiment|
+---------------+--------------------+---------+
|cv499_11407.txt|[best, thing, --,...| Negative|
|cv500_10251.txt|[losing, job, unc...| Negative|
|cv500_10722.txt|[always, careful,...| Positive|
|cv501_11657.txt|[much, wanted, li...| Negative|
|cv501_12675.txt|[synopsis, easily...| Negative|
|cv502_10406.txt|[postman, deliver...| Negative|
|cv503_10558.txt|[certain, people,...| Positive|
|cv502_10970.txt|[blues, brothers,...| Negative|
|cv503_11196.txt|[michael, crichto...| Positive|
|cv504_29120.txt|[american, pie, 2...| Negative|
|cv504_29243.txt|[guess, 's, credi...| Positive|
|cv505_12090.txt|[critics, includi...| Negative|
|cv505_12926.txt|[well, going, exp...| Negative|
|cv506_15956.txt|[much, ballyhoo, ...| Positive|
|cv506_17521.txt|[film, extraordin...| Negative|
| cv507_9220.txt|[capsule, side-sp...| Negative|
|cv508_16006.txt|[maybe, important...| Positive|
| cv507_9509.txt|[mi

In [31]:
# Reviews whose lexicon count came out in favour of the positive words.
sentiment_table.where(sentiment_table['Sentiment'] == 'Positive').show()

+---------------+--------------------+---------+
|      File_name|              Review|Sentiment|
+---------------+--------------------+---------+
|cv500_10722.txt|[always, careful,...| Positive|
|cv503_10558.txt|[certain, people,...| Positive|
|cv503_11196.txt|[michael, crichto...| Positive|
|cv504_29243.txt|[guess, 's, credi...| Positive|
|cv506_15956.txt|[much, ballyhoo, ...| Positive|
|cv508_16006.txt|[maybe, important...| Positive|
|cv509_15888.txt|[write, review, n...| Positive|
|cv509_17354.txt|[synopsis, nice, ...| Positive|
|cv511_10132.txt|['m, avid, fan, `...| Positive|
|cv511_10360.txt|[14, years, ago, ...| Positive|
| cv513_6923.txt|[first, heard, ro...| Positive|
|cv514_12173.txt|[n't, hate, big, ...| Positive|
|cv515_17069.txt|[since, 1996, 's,...| Positive|
|cv515_18484.txt|[film, features, ...| Positive|
|cv516_11172.txt|[luckily, people,...| Positive|
|cv517_19219.txt|[happy, bastard, ...| Positive|
|cv518_14798.txt|[well, check, sco...| Positive|
|cv518_13331.txt|[fi

In [32]:
# Reviews whose lexicon count came out in favour of the negative words.
sentiment_table.where(sentiment_table['Sentiment'] == 'Negative').show()

+---------------+--------------------+---------+
|      File_name|              Review|Sentiment|
+---------------+--------------------+---------+
|cv499_11407.txt|[best, thing, --,...| Negative|
|cv500_10251.txt|[losing, job, unc...| Negative|
|cv501_11657.txt|[much, wanted, li...| Negative|
|cv501_12675.txt|[synopsis, easily...| Negative|
|cv502_10406.txt|[postman, deliver...| Negative|
|cv502_10970.txt|[blues, brothers,...| Negative|
|cv504_29120.txt|[american, pie, 2...| Negative|
|cv505_12090.txt|[critics, includi...| Negative|
|cv505_12926.txt|[well, going, exp...| Negative|
|cv506_17521.txt|[film, extraordin...| Negative|
| cv507_9220.txt|[capsule, side-sp...| Negative|
| cv507_9509.txt|[midway, ``, anac...| Negative|
|cv508_17742.txt|[70, 's, nostalgi...| Negative|
|cv510_23360.txt|[victims, fate, l...| Negative|
|cv510_24758.txt|[well, lets, see,...| Negative|
|cv512_15965.txt|[may, seem, weird...| Negative|
|cv514_11187.txt|[krippendorf, 's,...| Negative|
| cv513_7236.txt|[ca

In [33]:
# Reviews where positive and negative hits were tied.
sentiment_table.where(sentiment_table['Sentiment'] == 'Neutral').show()

+---------------+--------------------+---------+
|      File_name|              Review|Sentiment|
+---------------+--------------------+---------+
|cv512_17618.txt|[plunkett, maclea...|  Neutral|
|cv519_14661.txt|[many, people, fi...|  Neutral|
|cv524_24885.txt|[following, revie...|  Neutral|
| cv540_3421.txt|[starting, write,...|  Neutral|
|cv558_29376.txt|[synopsis, lifelo...|  Neutral|
| cv626_7907.txt|[``, tina, fetch,...|  Neutral|
|cv650_14340.txt|[james, cmaeron, ...|  Neutral|
|cv654_19345.txt|[think, people, e...|  Neutral|
|cv668_17604.txt|[december, 1996, ...|  Neutral|
|cv673_25874.txt|[one, brother, 's...|  Neutral|
|cv693_19147.txt|[nothing, unabash...|  Neutral|
|cv702_12371.txt|[1990, surprise, ...|  Neutral|
| cv719_5581.txt|[well, guess, 's,...|  Neutral|
|cv750_10606.txt|[disney, 's, ``, ...|  Neutral|
|cv808_13773.txt|[stephen, please,...|  Neutral|
| cv813_6649.txt|[steven, seagal, ...|  Neutral|
| cv817_3675.txt|['re, back, blade...|  Neutral|
|cv834_23192.txt|[``