In [2]:
import numpy as np

In [3]:
# Using the NLTK tokenizer: function that returns the list of bag-of-words (bow) dicts for all movies in a given set (train/dev/test) of the given file root

from collections import defaultdict
from nltk.tokenize import WordPunctTokenizer        # splits all punctuations into separate tokens 
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Shared, module-level NLTK helpers: created once here and reused by the
# functions below instead of being rebuilt on every call.
word_punct_tokenizer = WordPunctTokenizer()
wnl = WordNetLemmatizer()


def bow_movie_nltk(root,start,end):
    """Build one lemmatized bag-of-words per movie for movies ``start`` .. ``end - 1``.

    For every movie element ``root[i]`` the text of its children 1..N is
    concatenated (child 0 is skipped), tokenized with ``word_punct_tokenizer``,
    lower-cased, POS-tagged with ``pos_tag`` and lemmatized with ``wnl``.

    Parameters
    ----------
    root : indexable XML element
        ``root[i][j].text`` must give the text of child ``j`` of movie ``i``.
    start, end : int
        Half-open range of movie indices to process.

    Returns
    -------
    list of defaultdict(float)
        One ``lemma -> count`` mapping per movie, in index order.
    """
    bow_per_movie = [] # list of dict where each element of bow_per_movie is bow for that movie
    for movie_idx in range(start, end):
        movie = root[movie_idx]
        # An element without text has ``.text is None``; treat it as empty
        # instead of failing on ``str + None``. The pieces are joined without
        # a separator, exactly as the original concatenation did.
        text = "".join(movie[k].text or "" for k in range(1, len(movie)))

        # Build a real list (not a lazy ``map`` object) so ``pos_tag`` receives
        # a sequence on Python 3 as well as Python 2.
        lowered = [tok.lower() for tok in word_punct_tokenizer.tokenize(text)]

        bow = defaultdict(float)
        for word, tag in pos_tag(lowered):
            pos = tag[0].lower()
            # NOTE(review): if pos_tag returns Penn Treebank tags, adjectives
            # start with 'J', so the 'a' case may never match — confirm the
            # tagset before changing this mapping (it would alter the vocabulary).
            if pos in ('a', 'n', 'v'):
                lemma = wnl.lemmatize(word, pos)
            else:
                lemma = wnl.lemmatize(word)
            bow[lemma] += 1.0
        bow_per_movie.append(bow)
    return bow_per_movie

In [4]:
# Using the NLTK stop-word list: function that returns the vocabulary built from the bag-of-words of all movies in the training set

from nltk.corpus import stopwords
def vocab_nltk(bow_per_movie):
    """Build the vocabulary (token -> total count) from per-movie bags of words.

    Counts are summed over all movies; then English stop words (NLTK list)
    and tokens whose total count is below 5 are removed.

    Parameters
    ----------
    bow_per_movie : iterable of dict
        One ``token -> count`` mapping per movie.

    Returns
    -------
    defaultdict(float)
        ``token -> total count`` for every retained token.
    """
    stop_words = set(stopwords.words('english'))

    # A: accumulate the complete vocabulary.
    vocab = defaultdict(float)
    for bow in bow_per_movie:
        for token, count in bow.items():
            vocab[token] += count

    # B + C: drop stop words and infrequent tokens. Iterate over a snapshot of
    # the keys: deleting from a dict while iterating over it directly only
    # worked on Python 2 (where ``items()`` returned a copy) and raises
    # RuntimeError on Python 3.
    for token in list(vocab):
        if token in stop_words or vocab[token] < 5:
            del vocab[token]
    return vocab

In [15]:
import xml.etree.ElementTree as ET
# Build the path from its components: the hard-coded backslashes only resolved
# on Windows. On Windows os.path.join yields the same backslash path as before.
import os
dataset_path = os.path.join('dataset', 'movies-data-v1.0', 'movies-data-v1.0',
                            'perscreen-7domains-train-dev.tl.xml')
tree = ET.parse(dataset_path)
# Root element of the train/dev XML; it is indexed as root[i][j] by the
# feature-extraction functions in this notebook.
root_traindev_ps = tree.getroot()

In [18]:
# Movies 0..1146 of the train/dev file are used as the training split ...
bow_per_movie_train_ps = bow_movie_nltk(root_traindev_ps,0,1147)
# ... and movies 1147..1463 of the same file as the test split.
bow_per_movie_test_ps = bow_movie_nltk(root_traindev_ps,1147,1464)

# Generating and storing feature vectors based on vocabulary

In [23]:
# Function that returns feature vector for all movies in the given set(train/dev/test) of given file root
def fvec(bow_per_movie,vocab):
    """Turn per-movie bags of words into count vectors over ``vocab``.

    Parameters
    ----------
    bow_per_movie : iterable of dict
        One ``token -> count`` mapping per movie.
    vocab : dict
        Vocabulary; only its keys are used. Their iteration order defines
        the column order of every returned vector.

    Returns
    -------
    list of list
        One vector per movie; entry ``k`` is the movie's count for the
        ``k``-th vocabulary key, or 0 when the movie does not contain it.
    """
    # Freeze the column order once. Iterating the keys directly also avoids
    # the Python-2-only ``iteritems`` and the unused ``value`` of the original.
    vocab_keys = list(vocab)

    fvec_per_movie = [] # list of lists where each element of fvec_per_movie is a feature vector for that movie
    for bow in bow_per_movie:
        # ``key in bow`` (rather than ``bow[key]``) keeps a defaultdict bow
        # from growing a new entry for every vocabulary word it lacks.
        fvec_per_movie.append([bow[key] if key in bow else 0 for key in vocab_keys])
    return fvec_per_movie

In [24]:
# Function that returns a list of target variables i.e. revenue for all movies in the given set(train/dev/test) of given file root
def true_rev(start,end,root):
    """Return the target values (revenue) of movies ``start`` .. ``end - 1``.

    The value is read from the ``yvalue`` attribute of the first child of
    each movie element, i.e. ``root[i][0].attrib['yvalue']``.

    Parameters
    ----------
    start, end : int
        Half-open range of movie indices.
    root : indexable XML element
        Root whose ``i``-th child is movie ``i``.

    Returns
    -------
    numpy.ndarray of float
        One value per movie, in index order.
    """
    rev = [root[i][0].attrib['yvalue'] for i in range(start, end)]
    # ``np.float`` was just an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin gives the same float64 result.
    return np.array(rev).astype(float)

# Extracting revenue generated by the movies in the training set
# (movie indices 0..1146, the same range used for the training bags of words)
true_rev_train_ps = true_rev(0,1147,root_traindev_ps)

# Extracting revenue generated by the movies in the test set
# (movie indices 1147..1463, the same range used for the test bags of words)
true_rev_test_ps = true_rev(1147,1464,root_traindev_ps)

In [434]:
import pickle
# The vocabulary is built from the training bags of words only ...
vocab_ps = vocab_nltk(bow_per_movie_train_ps)
# ... and both splits are vectorised against that same vocabulary, so their
# feature vectors have the same length and column order.
fvec_train_ps = fvec(bow_per_movie_train_ps,vocab_ps)
fvec_test_ps = fvec(bow_per_movie_test_ps,vocab_ps)

In [None]:
########################################### Training set feature vector #######################################################

In [336]:
# Feature vector for full vocabulary using nltk library
# Pickle data is a byte stream, so the file is opened in binary mode ('wb');
# ``with`` closes it even if dumping fails.
with open('features//fvec_train_ps_nfsi.txt', 'wb') as f:
    pickle.dump(fvec_train_ps, f)

In [None]:
########################################### Testing set feature vector #######################################################

In [43]:
# Feature vector for full vocabulary using nltk library
# Pickle data is a byte stream, so the file is opened in binary mode ('wb');
# ``with`` closes it even if dumping fails.
with open('features//fvec_test_ps_nfsi.txt', 'wb') as f:
    pickle.dump(fvec_test_ps, f)

In [435]:
# Sanity check: train and test vectors must have the same width.
# The test vectors are stored in ``fvec_test_ps``; the name ``fvec_dev_ps``
# used here before is not defined by any cell shown in this notebook.
print(len(fvec_train_ps[0]))
print(len(fvec_test_ps[0]))

24861
24861


In [8]:
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import scipy
import numpy as np

def gsv(train_x,train_y):
    """Grid-search ``max_depth`` of a DecisionTreeRegressor.

    The candidate depths are 5 and 100. Returns the fitted ``GridSearchCV``
    object. To search another estimator (e.g. Ridge or Lasso), swap the
    estimator and give a grid that matches its parameters.
    """
    search = GridSearchCV(DecisionTreeRegressor(), {'max_depth': [5, 100]})
    search.fit(train_x, train_y)
    return search

def train(train_x,train_y):
    """Fit a DecisionTreeRegressor on the given data and return it.

    Parameters
    ----------
    train_x : array-like
        Feature matrix, one row per sample.
    train_y : array-like
        Target value for each row of ``train_x``.

    Returns
    -------
    DecisionTreeRegressor
        The fitted estimator.
    """
    # Fixed: the keyword was misspelled ``min_cweight_fraction_leaf``, which
    # makes the constructor raise TypeError.
    # ``criterion='mse'``, ``min_impurity_split=1e-07`` and ``presort=False``
    # are no longer passed: the estimator repr printed further down shows they
    # are exactly the defaults, and newer scikit-learn releases reject them.
    clf = DecisionTreeRegressor(max_depth=None, max_features=None,
                                max_leaf_nodes=None,
                                min_samples_leaf=1, min_samples_split=2,
                                min_weight_fraction_leaf=0.0, random_state=None,
                                splitter='best')

    # Alternative models that can be swapped in here:
    #clf = Ridge(alpha=0.01)
    #clf=  linear_model.Lasso(alpha=0.01)
    clf.fit(train_x, train_y)
    return clf

def predict(clf, test_x):
    """Return the predictions of the fitted model ``clf`` for ``test_x``.

    Thin wrapper around ``clf.predict`` so every model is evaluated through
    the same call.
    """
    predictions = clf.predict(test_x)
    return predictions

def cal_mae(y_hat,y):
    """Return the mean absolute error between predictions and targets.

    Both inputs are converted to flat float arrays before subtracting.
    Without that, a column of predictions with shape (n, 1) subtracted from
    targets of shape (n,) broadcasts to an (n, n) matrix and gives a wrong
    error. Plain lists are accepted as well.

    Parameters
    ----------
    y_hat : array-like
        Predicted values.
    y : array-like
        True values, with the same number of elements as ``y_hat``.

    Returns
    -------
    numpy.float64
        Mean of ``|y_hat - y|``.
    """
    y_hat = np.asarray(y_hat, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    return np.mean(np.abs(y_hat - y))



# Predicting revenue in test set using Decision Tree Regressor

In [420]:
# Finding the best parameters for a particular Machine Learning model using grid search
r = gsv(fvec_train_ps,true_rev_train_ps)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(r)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [5, 100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)


In [None]:
# Loading feature vector for training set
# Pickle files are byte streams, so they are opened in binary mode ('rb').
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
with open('features//fvec_train_ps_nfsi.txt', 'rb') as f:
    fvec_train_ps_nfsi = pickle.load(f)

# Loading feature vector of testing set
with open('features//fvec_test_ps_nfsi.txt', 'rb') as g:
    fvec_test_ps_nfsi = pickle.load(g)

# This cell used to bind the test features to ``fvec_dev_ps_nfsi`` while other
# cells read ``fvec_test_ps_nfsi``; keep both names pointing at the same data.
fvec_dev_ps_nfsi = fvec_test_ps_nfsi

In [455]:
# Fit the regressor built by train() on the reloaded training features.
trained_classifier_nfsi = train(fvec_train_ps_nfsi,true_rev_train_ps)

In [456]:
# Use the names that are actually defined: the fitted model is
# ``trained_classifier_nfsi`` and the reloaded test features are bound to
# ``fvec_dev_ps_nfsi``. The MAE is computed from the predictions made here
# (``pred_rev_dev_ps`` was never assigned).
pred_rev_test_ps = predict(trained_classifier_nfsi,fvec_dev_ps_nfsi)
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

The mean absolute error for dev set is  7067.13994361


# Predicting revenue in test set using Deep Learning

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# Convert the list-of-lists features to arrays before handing them to Keras.
# The reloaded test features are bound to ``fvec_dev_ps_nfsi``.
x_train_nfsi = np.asarray(fvec_train_ps_nfsi, dtype=np.float32)
x_test_nfsi = np.asarray(fvec_dev_ps_nfsi, dtype=np.float32)

# Five hidden layers of 300 ReLU units and a single linear output (regression).
# The input width is taken from the data instead of being hard-coded.
model = Sequential([
    Dense(300, input_shape=(x_train_nfsi.shape[1],)),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(1),
    Activation('linear'),
])

model.compile(loss='mse', optimizer='adam')
model.fit(x_train_nfsi, true_rev_train_ps, epochs=50)

# The notebook's predict(clf, test_x) helper needs the model as first argument;
# call the model directly. ravel() turns the (n, 1) output into a flat vector
# so it lines up element-wise with the targets.
pred_rev_test_ps = model.predict(x_test_nfsi).ravel()
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

# Generating and storing feature vectors based on positive/negative words

In [None]:
# lemmatizing positive-negative words

def lemmatization_for_list(l):
    """Return the set of lemmas of the words in ``l``.

    Each word is POS-tagged with ``pos_tag``. When the lower-cased first
    letter of its tag is 'a', 'n' or 'v', that letter is passed to the
    lemmatizer as the part of speech; otherwise the lemmatizer's default
    is used.
    """
    def _lemma(word, tag):
        pos = tag[0].lower()
        if pos not in ['a','n','v']:
            return wnl.lemmatize(word)
        return wnl.lemmatize(word, pos)

    return set(_lemma(word, tag) for word, tag in pos_tag(l))
    

In [None]:
#reading the positive/ negative words
# ``with`` closes each file once its lines are read (the handles used to be
# left open). Newlines are removed from every line, as before.
with open('features//positive-words.txt','r') as pos_file:
    pos_list = [line.replace('\n','') for line in pos_file.readlines()]

with open('features//negative-words.txt','r') as neg_file:
    neg_list = [line.replace('\n','') for line in neg_file.readlines()]

## lemmatizing it
# Each result is the set of lemmas returned by lemmatization_for_list.
positive=lemmatization_for_list(pos_list)
negative=lemmatization_for_list(neg_list)

In [None]:
## create the feature vector based on the absence and presence of pos-neg words

# Fix the column order once: ``positive`` / ``negative`` are sets, and sorting
# them gives the same column layout on every run.
positive_words = sorted(positive)
negative_words = sorted(negative)


def _posneg_features(bow):
    """Return the presence vector of one movie.

    +1 for every positive word found in ``bow`` and -1 for every negative
    word found in ``bow``; 0 where the word is absent.
    """
    # ``word in bow`` is a direct dict lookup (no list of keys is built) and
    # does not add entries to a defaultdict.
    return ([1 if word in bow else 0 for word in positive_words] +
            [-1 if word in bow else 0 for word in negative_words])


# For training set
# The bags of words built earlier in this notebook are named
# ``bow_per_movie_train_ps`` / ``bow_per_movie_test_ps``; the names used here
# before (``bow_per_movie_train`` / ``bow_per_movie_dev``) are not defined.
feature_train = [_posneg_features(bow) for bow in bow_per_movie_train_ps]

# For test set
feature_test = [_posneg_features(bow) for bow in bow_per_movie_test_ps]

In [None]:
# Writing training Feature vector based on positive/negative words to file
# Fixed: the list itself is dumped; the original passed the bound method
# ``feature_train.append`` to pickle.dump. Files are opened in binary mode
# and closed by ``with``.
with open('features//train_posneg_feat.txt', 'wb') as f:
    pickle.dump(feature_train, f)

# Writing testing Feature vector based on positive/negative words to file
with open('features//test_posneg_feat.txt', 'wb') as f:
    pickle.dump(feature_test, f)

# Predicting revenue in test set using Ridge Regressor

In [None]:
trained_classifier_nfsi = train(feature_train,true_rev_train_ps)
# Predict with the model fitted on the line above and score those predictions
# (``trained_classifier`` and ``pred_rev_dev_ps`` were never assigned).
pred_rev_test_ps = predict(trained_classifier_nfsi,feature_test)
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

# Predicting revenue in test set using Deep Learning

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# This network is for the positive/negative-word features, so it is fitted and
# evaluated on ``feature_train`` / ``feature_test``. The cell used to fit on
# the vocabulary features, whose width does not match the input layer.
x_train_posneg = np.asarray(feature_train, dtype=np.float32)
x_test_posneg = np.asarray(feature_test, dtype=np.float32)

# Five hidden layers of 300 ReLU units and a single linear output (regression).
# The input width is taken from the data instead of being hard-coded.
model = Sequential([
    Dense(300, input_shape=(x_train_posneg.shape[1],)),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(1),
    Activation('linear'),
])

model.compile(loss='mse', optimizer='adam')
model.fit(x_train_posneg, true_rev_train_ps, epochs=50)

# Call the model directly (the predict() helper needs the model as its first
# argument) and flatten the (n, 1) output to line up with the targets.
pred_rev_test_ps = model.predict(x_test_posneg).ravel()
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

In [None]:
# NOTE(review): ``model`` is the network object created in the previous cell;
# compiling again does not build a new network — confirm that training the
# same network further is intended.
model.compile(loss='mse', optimizer='adam')
model.fit(np.asarray(feature_train, dtype=np.float32), true_rev_train_ps, epochs=50)

# Call the model directly (the predict() helper needs the model as its first
# argument) and flatten the (n, 1) output to line up with the targets.
pred_rev_test_ps = model.predict(np.asarray(feature_test, dtype=np.float32)).ravel()
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

# Generating and storing feature vectors based on polarity of reviews

In [None]:
# Function that returns list of bow for all movies in a given set(train/dev/test) of the given file root

from collections import defaultdict

def bow_movie(root,start,end):
    """Build a whitespace-token bag of words and the raw text for each movie.

    For every movie element ``root[i]`` with ``start <= i < end`` the text of
    its children 1..N is concatenated (child 0 is skipped), split on
    whitespace and lower-cased.

    Parameters
    ----------
    root : indexable XML element
        ``root[i][j].text`` must give the text of child ``j`` of movie ``i``.
    start, end : int
        Half-open range of movie indices to process.

    Returns
    -------
    (list of defaultdict(float), list of str)
        The ``token -> count`` mapping of every movie, and the concatenated
        (not lower-cased) text of every movie, both in index order.
    """
    review_list = []
    bow_per_movie = [] # list of dict where each element of bow_per_movie is bow for that movie
    for movie_idx in range(start, end):
        movie = root[movie_idx]
        # An element without text has ``.text is None``; treat it as empty
        # instead of failing on ``str + None``. Pieces are joined without a
        # separator, exactly as the original concatenation did.
        text = "".join(movie[k].text or "" for k in range(1, len(movie)))

        bow = defaultdict(float)
        for token in text.split():
            bow[token.lower()] += 1.0

        review_list.append(text)
        bow_per_movie.append(bow)

    return bow_per_movie,review_list


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

bow_per_movie_train_to, review_list_to = bow_movie(root_traindev_to,0,1147)
sid = SentimentIntensityAnalyzer()
# One 'compound' polarity score per training review, in review order.
# A comprehension replaces the pre-sized list and hand-maintained index, and
# no longer leaves the loop variables behind in the notebook namespace.
review_polarity = [sid.polarity_scores(review)['compound'] for review in review_list_to]

# Predicting revenue in test set using Ridge Regressor

In [None]:
# The regressor needs a 2-D feature matrix: one row per movie with a single
# column holding the compound polarity score.
polarity_train = np.asarray(review_polarity, dtype=float).reshape(-1, 1)

# No cell shown scores the test reviews, and the cell used to predict on the
# positive/negative-word features instead. Score the test reviews the same
# way as the training reviews.
# NOTE(review): assumes the text-only file uses the same 1147/1464 split as
# the per-screen file — confirm.
_bows_test_to, review_list_test_to = bow_movie(root_traindev_to,1147,1464)
polarity_test = np.asarray(
    [sid.polarity_scores(review)['compound'] for review in review_list_test_to],
    dtype=float).reshape(-1, 1)

trained_classifier_nfsi = train(polarity_train,true_rev_train_ps)
# Predict with the model fitted above and score those predictions
# (``trained_classifier`` and ``pred_rev_dev_ps`` were never assigned).
pred_rev_test_ps = predict(trained_classifier_nfsi,polarity_test)
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

# Predicting revenue in test set using Deep Learning

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# This network is for the review-polarity feature, so it is fitted and
# evaluated on polarity scores. The cell used to fit on the vocabulary
# features, whose width does not match the one-unit input layer.
x_train_polarity = np.asarray(review_polarity, dtype=np.float32).reshape(-1, 1)

# No cell shown scores the test reviews; score them like the training reviews.
# NOTE(review): assumes the text-only file uses the same 1147/1464 split as
# the per-screen file — confirm.
_bows_test_to, review_list_test_to = bow_movie(root_traindev_to,1147,1464)
x_test_polarity = np.asarray(
    [sid.polarity_scores(review)['compound'] for review in review_list_test_to],
    dtype=np.float32).reshape(-1, 1)

# Five hidden layers of 300 ReLU units and a single linear output (regression).
model = Sequential([
    Dense(300, input_shape=(x_train_polarity.shape[1],)),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(1),
    Activation('linear'),
])

model.compile(loss='mse', optimizer='adam')
model.fit(x_train_polarity, true_rev_train_ps, epochs=50)

# Call the model directly (the predict() helper needs the model as its first
# argument) and flatten the (n, 1) output to line up with the targets.
pred_rev_test_ps = model.predict(x_test_polarity).ravel()
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

In [None]:
# NOTE(review): ``model`` is the network object created in the previous cell;
# compiling again does not build a new network — confirm that training the
# same network further is intended.
model.compile(loss='mse', optimizer='adam')
# One row per movie, one column: the compound polarity score.
model.fit(np.asarray(review_polarity, dtype=np.float32).reshape(-1, 1),
          true_rev_train_ps, epochs=50)

# Evaluate on test-set polarity scores. The cell used to predict on the
# positive/negative-word features, which do not match this model's input.
# NOTE(review): assumes the text-only file uses the same 1147/1464 split as
# the per-screen file — confirm.
_bows_test_to, review_list_test_to = bow_movie(root_traindev_to,1147,1464)
x_test_polarity = np.asarray(
    [sid.polarity_scores(review)['compound'] for review in review_list_test_to],
    dtype=np.float32).reshape(-1, 1)

pred_rev_test_ps = model.predict(x_test_polarity).ravel()
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

# Generating and storing feature vectors based on parts of speech tags

In [None]:
from collections import OrderedDict
# POS tag labels that get a counter; every counter starts at zero.
dict_a = dict.fromkeys(
    ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN',
     'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS',
     'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT',
     'WP', 'WP$', 'WRB'],
    0)
# The same counters in alphabetical tag order, so that every feature vector
# built from a copy of this table has the same column layout.
ordered_tag = OrderedDict((tag, dict_a[tag]) for tag in sorted(dict_a))

In [None]:
from __future__ import division
import copy

def fvec_postag(bow_per_movie,ordered_tag):    # bow_per_movie is a list
    """Build one POS-tag feature vector per movie.

    Every distinct token of a movie is tokenized and POS-tagged on its own;
    the tag of its first sub-token receives the token's count. The per-tag
    totals are then divided by the number of distinct tokens of the movie.

    Parameters
    ----------
    bow_per_movie : list of dict
        One ``token -> count`` mapping per movie.
    ordered_tag : OrderedDict
        ``tag -> 0`` template; its key order is the column order. It is
        copied per movie and never modified.

    Returns
    -------
    list of list of float
        One vector per movie, with one entry per key of ``ordered_tag``.
        A movie with an empty bag of words yields an all-zero vector.
    """
    # The cells shown never bind the name ``nltk`` (only ``from nltk ...``
    # imports), so ``nltk.word_tokenize`` raised NameError, and the bare
    # ``except`` hid it, leaving every counter at zero. Import the function.
    from nltk import word_tokenize

    fvec_per_movie = [] # list of lists where each element of fvec_per_movie is a feature vector for that movie

    for bow in bow_per_movie:           # bow is a dict
        tag_dict = copy.copy(ordered_tag)
        for key, value in bow.items():
            tokens = word_tokenize(key)
            if not tokens:
                # Nothing to tag (previously an IndexError swallowed silently).
                continue
            tag = pos_tag(tokens)[0][1]
            # Tags outside the table are ignored on purpose (previously a
            # KeyError swallowed by the bare except); real errors now surface.
            if tag in tag_dict:
                tag_dict[tag] += value

        n_types = float(len(bow))  # float: true division on Python 2 as well
        fvec_per_movie.append([count / n_types if n_types else 0.0
                               for count in tag_dict.values()])
    return fvec_per_movie

In [None]:
fvec_postag_train = fvec_postag(bow_per_movie_train_to,ordered_tag)

# ``bow_per_movie_test_to`` is not built by any cell shown, so build it here
# the same way as the training bags of words.
# NOTE(review): assumes the text-only file uses the same 1147/1464 split as
# the per-screen file — confirm.
bow_per_movie_test_to, review_list_test_to = bow_movie(root_traindev_to,1147,1464)
fvec_postag_test = fvec_postag(bow_per_movie_test_to,ordered_tag)

In [None]:
# Writing feature vector to file
# Pickle data is a byte stream, so the files are opened in binary mode ('wb');
# ``with`` closes them even if dumping fails.
with open('features//fvec_postag_train.txt', 'wb') as f:
    pickle.dump(fvec_postag_train, f)

with open('features//fvec_postag_test.txt', 'wb') as f:
    pickle.dump(fvec_postag_test, f)

# Predicting revenue in test set using Ridge Regressor

In [None]:
trained_classifier_nfsi = train(fvec_postag_train,true_rev_train_ps)
# Predict with the model fitted on the line above and score those predictions
# (``trained_classifier`` and ``pred_rev_dev_ps`` were never assigned).
pred_rev_test_ps = predict(trained_classifier_nfsi,fvec_postag_test)
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

# Predicting revenue in test set using Deep Learning

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# This network is for the POS-tag features, so it is fitted and evaluated on
# ``fvec_postag_train`` / ``fvec_postag_test``. The cell used to fit on the
# vocabulary features, whose width does not match the input layer.
x_train_postag = np.asarray(fvec_postag_train, dtype=np.float32)
x_test_postag = np.asarray(fvec_postag_test, dtype=np.float32)

# Five hidden layers of 300 ReLU units and a single linear output (regression).
# The input width is taken from the data instead of being hard-coded.
model = Sequential([
    Dense(300, input_shape=(x_train_postag.shape[1],)),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(300),
    Activation('relu'),
    Dense(1),
    Activation('linear'),
])

model.compile(loss='mse', optimizer='adam')
model.fit(x_train_postag, true_rev_train_ps, epochs=50)

# Call the model directly (the predict() helper needs the model as its first
# argument) and flatten the (n, 1) output to line up with the targets.
pred_rev_test_ps = model.predict(x_test_postag).ravel()
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

In [None]:
# NOTE(review): ``model`` is the network object created in the previous cell;
# compiling again does not build a new network — confirm that training the
# same network further is intended.
model.compile(loss='mse', optimizer='adam')
model.fit(np.asarray(fvec_postag_train, dtype=np.float32), true_rev_train_ps, epochs=50)

# Call the model directly (the predict() helper needs the model as its first
# argument) and flatten the (n, 1) output to line up with the targets.
pred_rev_test_ps = model.predict(np.asarray(fvec_postag_test, dtype=np.float32)).ravel()
mae_rev_test_ps= cal_mae(pred_rev_test_ps,true_rev_test_ps)

print("The mean absolute error for test set is  %s" % mae_rev_test_ps)

#                                    END