In [None]:
import os
import numpy as np
from data_processing import preprocess_data, get_dictionary, featurize_data
from cross_validation import cross_validation, evaluate_classifier
from sklearn.svm import SVC
from sklearn.model_selection import ParameterGrid
import pickle
import nltk
import pandas as pd
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import os
from nltk.tokenize import word_tokenize
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
import itertools

## Read Data

In [None]:
# Root directory that preprocess_data() walks to find the tagged review files.
data_path = 'datasets/data/'

In [None]:
def preprocess_data(datapath):
    """Load every ``.tag`` file below ``datapath`` as a list of tokens.

    Each non-blank line of a ``.tag`` file must hold exactly two
    tab-separated fields; only the first field (the token) is kept.
    A document is labelled 1 when the directory that contains it is named
    ``POS`` and 0 otherwise.

    Parameters
    ----------
    datapath : str
        Directory that is walked recursively.

    Returns
    -------
    X : numpy.ndarray
        1-D object array with one token list per document. It is built
        element by element so that documents of different lengths do not
        make NumPy raise on a ragged nested sequence, and documents of
        equal length are not silently merged into a 2-D string array.
    y : numpy.ndarray
        0/1 label per document, aligned with ``X``.
    filenames : list of str
        Base name of the file each document was read from, aligned with ``X``.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly one tab character.
    """
    X = []
    y = []
    filenames = []
    for dirpath, _dirnames, files in os.walk(datapath):
        # The label depends only on the directory, so compute it once per directory.
        label = 1 if os.path.basename(os.path.normpath(dirpath)) == 'POS' else 0
        for fname in sorted(files):
            if not fname.endswith('.tag'):
                continue
            tokens = []
            with open(os.path.join(dirpath, fname), 'r') as handle:
                for line in handle:
                    if line == '\n':
                        continue
                    token, _ = line.split('\t')
                    tokens.append(token)
            filenames.append(fname)
            X.append(tokens)
            y.append(label)

    # Fill an object array slot by slot: np.array(X) would fail (or change
    # shape) depending on whether the token lists happen to share a length.
    documents = np.empty(len(X), dtype=object)
    for position, tokens in enumerate(X):
        documents[position] = tokens
    return documents, np.array(y), filenames

In [None]:
# Load the token lists, their 0/1 labels and the source file names.
X, y, filenames = preprocess_data(data_path)

## Model Selection

In [None]:
# Round-robin split: sample i goes to fold i % k. Fold 0 is held out as the
# test set and the remaining folds together form the training set.
k = 10
folds = [list(range(start, len(y), k)) for start in range(k)]
test_idxs = folds[0]
train_idxs = list(set(np.concatenate(folds)) - set(test_idxs))
X_train, y_train = X[train_idxs], y[train_idxs]
X_test, y_test = X[test_idxs], y[test_idxs]

In [None]:
def cross_validation(X, y, model, k=9):
    """Run k-fold cross-validation with round-robin fold assignment.

    Sample ``i`` is placed in fold ``i % k``. Each fold is used once as the
    validation set while ``model`` is fitted on all remaining samples.
    The mean and variance of the per-fold accuracies are printed.

    Parameters
    ----------
    X : indexable by a list of integer positions (e.g. numpy array)
        Feature rows.
    y : numpy.ndarray
        Labels aligned with ``X``.
    model : object with ``fit(X, y)`` and ``predict(X)``
        Re-fitted on every fold.
    k : int, default 9
        Number of folds.

    Returns
    -------
    y_pred : numpy.ndarray
        Validation predictions of all folds, concatenated in fold order.
    y_val_labels : numpy.ndarray
        True labels in the same order as ``y_pred``.
    """
    folds = [[] for _ in range(k)]
    for sample_idx in range(len(y)):
        folds[sample_idx % k].append(sample_idx)

    y_pred = []
    y_val_labels = []
    accuracies = []

    for val_idxs in folds:
        # Every index that is not in the current validation fold.
        train_idxs = list(set(np.concatenate(folds)) - set(val_idxs))
        X_train, y_train = X[train_idxs], y[train_idxs]
        X_val, y_val = X[val_idxs], y[val_idxs]

        model.fit(X_train, y_train)
        pred = model.predict(X_val)

        y_pred = np.concatenate([y_pred, pred])
        y_val_labels = np.concatenate([y_val_labels, y_val])
        accuracies.append((pred == y_val).mean())

    print("Average accuracy is {}(variance {})\n".format(np.mean(accuracies), np.var(accuracies)))

    return y_pred, y_val_labels

In [None]:
# Paths of all saved Doc2Vec models found in the doc2vec/ directory.
doc_embeddings = ["doc2vec/"+f for f in os.listdir("doc2vec") if f.endswith('.model')]

print(doc_embeddings)

In [None]:
# Log-spaced candidate values for C (10 values, 1e-2..1e10) and gamma
# (13 values, 1e-9..1e3).
# NOTE(review): the parameter grids in the visible cells below hard-code their
# own C/gamma lists and never read these two ranges — confirm they are still needed.
C_range = np.logspace(-2, 10, 10)
gamma_range = np.logspace(-9, 3, 13)
print(C_range)

In [None]:
SVM_with_embeddings_param_grid = {
                                    'doc_embeddings': doc_embeddings, 
                                    'svm_kernel' : ['rbf', 'linear', 'poly'],
                                    'C': [0.1, 1, 10],
                                    'gamma': [1e-3, 1e-4]
                                 }

# Results are written to svm/; create it up front so the first dump cannot fail.
os.makedirs("svm", exist_ok=True)

# Inferred training features per embedding file. Each Doc2Vec model is loaded
# and applied to the training documents only once, so every SVM setting for a
# given embedding is compared on exactly the same feature vectors.
embedding_features = {}

grid = ParameterGrid(SVM_with_embeddings_param_grid)
for params in grid:
    print(params)

    embedding_path = params['doc_embeddings']
    if embedding_path not in embedding_features:
        doc2vec = Doc2Vec.load(embedding_path)
        embedding_features[embedding_path] = np.array([doc2vec.infer_vector(x) for x in X_train])
    X_features = embedding_features[embedding_path]

    # gamma is only passed for the non-linear kernels.
    if params['svm_kernel'] != 'linear':
        model = SVC(kernel=params['svm_kernel'], C=params['C'], gamma=params['gamma'])
    else:
        model = SVC(kernel=params['svm_kernel'], C=params['C'])

    total_y_pred, total_y_test= cross_validation(X_features, y_train, model)

    model_results = {}
    model_results['predictions'] = total_y_pred
    model_results['labels'] = total_y_test
    # Keep the setting with the result so a loaded pickle identifies itself.
    model_results['params'] = dict(params)

    # Four placeholders for four values: the embedding name must be part of the
    # file name, otherwise runs with different embeddings overwrite each other.
    result_path = "svm/{}_{}_{}_{}.pkl".format(params['svm_kernel'], params['C'], params['gamma'], Path(embedding_path).stem)
    with open(result_path, 'wb') as result_file:
        pickle.dump(model_results, result_file)
    

In [None]:
SVM_with_embeddings_param_grid = {
                                    'svm_kernel' : ['rbf', 'linear', 'poly'],
                                    'C': [0.1, 1, 10],
                                    'gamma': [1e-3, 1e-4]
                                 }

# The two models and the inferred features do not depend on the SVM setting,
# so load and infer once instead of once per grid point. Every setting is then
# evaluated on the same feature vectors.
doc2vec1 = Doc2Vec.load("doc2vec/dbow_dm_model1.model")
doc2vec2 = Doc2Vec.load("doc2vec/dbow_dm_model2.model")
dbow_dm = ConcatenatedDoc2Vec([doc2vec1, doc2vec2])
X_features = np.array([dbow_dm.infer_vector(x) for x in X_train])

grid = ParameterGrid(SVM_with_embeddings_param_grid)
for params in grid:
    print(params)

    # gamma is only passed for the non-linear kernels.
    if params['svm_kernel'] != 'linear':
        model = SVC(kernel=params['svm_kernel'], C=params['C'], gamma=params['gamma'])
    else:
        model = SVC(kernel=params['svm_kernel'], C=params['C'])

    total_y_pred, total_y_test= cross_validation(X_features, y_train, model)

    model_results = {}
    model_results['predictions'] = total_y_pred
    model_results['labels'] = total_y_test

    # Results of this grid are not persisted. This grid has no 'doc_embeddings'
    # key, so a dump would need a fixed tag for the concatenated model, e.g.:
    #with open("svm/{}_{}_{}_{}.pkl".format(params['svm_kernel'], params['C'], params['gamma'], 'dbow_dm'), 'wb') as result_file:
    #    pickle.dump(model_results, result_file)
    

In [None]:
# Held-out evaluation of three doc2vec-based SVMs. Each section loads the
# embedding model(s), infers vectors for the train and test documents, fits an
# SVC on the training vectors and prints the accuracy on the test fold.

# dm model
doc2vec = Doc2Vec.load('doc2vec/10_100_15_1_2_0_15.model')
X_test_final = np.array([doc2vec.infer_vector(x) for x in X_test])
X_features = np.array([doc2vec.infer_vector(x) for x in X_train])
model = SVC(kernel='rbf', C=10, gamma=0.001)
model.fit(X_features, y_train)
y_pred_dm = np.array(model.predict(X_test_final))
print(((y_pred_dm == np.array(y_test)).mean()))


# dbow model
doc2vec = Doc2Vec.load("doc2vec/10_200_10_0_2_0_5.model")
X_test_final = np.array([doc2vec.infer_vector(x) for x in X_test])
X_features = np.array([doc2vec.infer_vector(x) for x in X_train])
model = SVC(kernel='rbf', C=10, gamma=0.001)
model.fit(X_features, y_train)
y_pred_dbow = np.array(model.predict(X_test_final))
print(((y_pred_dbow == np.array(y_test)).mean()))


# dm + dbow: vectors of the two models concatenated
doc2vec1 = Doc2Vec.load("doc2vec/dbow_dm_model1.model")
doc2vec2 = Doc2Vec.load("doc2vec/dbow_dm_model2.model")
dbow_dm = ConcatenatedDoc2Vec([doc2vec1, doc2vec2])

# NOTE(review): X_features / X_test_final keep these concatenated-model vectors
# after the cell finishes, and later cells in this notebook read them.
X_features = np.array([dbow_dm.infer_vector(x) for x in X_train])
X_test_final = np.array([dbow_dm.infer_vector(x) for x in X_test])
# NOTE(review): gamma is passed with a linear kernel here, whereas the
# grid-search cells omit gamma for 'linear' — presumably it has no effect; confirm.
model = SVC(kernel='linear', C=0.1, gamma=0.001)
model.fit(X_features, y_train)
y_pred_pv_concat = np.array(model.predict(X_test_final))
print(((y_pred_pv_concat == np.array(y_test)).mean()))


In [None]:
# Re-join each token list into one space-separated string per review.
X_train_ = [' '.join(x) for x in X_train]
X_test_ = [' '.join(x) for x in X_test]

### scikit-learn's `CountVectorizer` is a bit more efficient than my own implementation, so it is used here to speed things up

In [None]:
# Unigram count features: the vocabulary is learned on the training reviews
# only and then applied to the test reviews.
vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b\w+\b', min_df=1)
X_train_grams = vectorizer.fit_transform(X_train_)
X_test_grams = vectorizer.transform(X_test_)
# NOTE(review): gamma is passed with a linear kernel; the grid-search cells
# omit it for 'linear' — presumably it has no effect here; confirm.
model = SVC(kernel='linear', C=10, gamma=0.0001)
model.fit(X_train_grams, y_train)
y_pred_uni_freq = np.array(model.predict(X_test_grams))
# Accuracy on the held-out test fold.
print(((y_pred_uni_freq == np.array(y_test)).mean()))

In [None]:
# Bigram count features: the vocabulary is learned on the training reviews
# only and then applied to the test reviews.
vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b', min_df=1)
X_train_grams = vectorizer.fit_transform(X_train_)
X_test_grams = vectorizer.transform(X_test_)
model = SVC(kernel='linear',C=0.1, gamma=0.0001)
# Fit and predict on the bigram matrices built above, not on the doc2vec
# vectors (X_features / X_test_final) left over from an earlier cell.
model.fit(X_train_grams, y_train)
y_pred_bi_freq = np.array(model.predict(X_test_grams))
# Accuracy on the held-out test fold.
print(((y_pred_bi_freq == np.array(y_test)).mean()))


In [None]:
# Unigram + bigram count features: the vocabulary is learned on the training
# reviews only and then applied to the test reviews.
vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
X_train_grams = vectorizer.fit_transform(X_train_)
X_test_grams = vectorizer.transform(X_test_)
model = SVC(kernel='linear', C=0.1, gamma=0.001)
# Fit and predict on the n-gram matrices built above, not on the doc2vec
# vectors (X_features / X_test_final) left over from an earlier cell.
model.fit(X_train_grams, y_train)
y_pred_freq_concat = np.array(model.predict(X_test_grams))
# Accuracy on the held-out test fold.
print(((y_pred_freq_concat == np.array(y_test)).mean()))


In [None]:
# Unigram presence (binary) features: the vocabulary is learned on the
# training reviews only and then applied to the test reviews.
vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b\w+\b', min_df=1, binary = True)
X_train_grams = vectorizer.fit_transform(X_train_)
X_test_grams = vectorizer.transform(X_test_)
model = SVC(kernel='linear', C=10, gamma=0.0001)
# Fit and predict on the unigram matrices built above, not on the doc2vec
# vectors (X_features / X_test_final) left over from an earlier cell.
model.fit(X_train_grams, y_train)
y_pred_uni_bin = np.array(model.predict(X_test_grams))
# Accuracy on the held-out test fold.
print(((y_pred_uni_bin == np.array(y_test)).mean()))


In [None]:
# Bigram presence (binary) features: the vocabulary is learned on the
# training reviews only and then applied to the test reviews.
vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b', min_df=1, binary = True)
X_train_grams = vectorizer.fit_transform(X_train_)
X_test_grams = vectorizer.transform(X_test_)
model = SVC(kernel='linear', C=0.1, gamma=0.0001)
# Fit and predict on the bigram matrices built above, not on the doc2vec
# vectors (X_features / X_test_final) left over from an earlier cell.
model.fit(X_train_grams, y_train)
y_pred_bi_bin = np.array(model.predict(X_test_grams))
# Accuracy on the held-out test fold.
print(((y_pred_bi_bin == np.array(y_test)).mean()))


In [None]:
# Unigram + bigram presence (binary) features: the vocabulary is learned on
# the training reviews only and then applied to the test reviews.
vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1, binary = True)
X_train_grams = vectorizer.fit_transform(X_train_)
X_test_grams = vectorizer.transform(X_test_)
model = SVC(kernel='linear', C=1, gamma=0.001)
# Fit and predict on the n-gram matrices built above, not on the doc2vec
# vectors (X_features / X_test_final) left over from an earlier cell.
model.fit(X_train_grams, y_train)
y_pred_concat_bin = np.array(model.predict(X_test_grams))
# Report the accuracy of this cell's predictions (y_pred_concat_bin), not of
# an unrelated y_pred variable.
print(((y_pred_concat_bin == np.array(y_test)).mean()))


In [None]:
# Test-fold predictions of every model, with matching display names.
models = [y_pred_dm, y_pred_dbow, y_pred_pv_concat,
            y_pred_uni_freq, y_pred_bi_freq, y_pred_freq_concat, 
              y_pred_uni_bin, y_pred_bi_bin, y_pred_concat_bin]
model_names = ['dm', 'dbow', 'pv_concat',
               'uni_freq', 'bi_freq', 'freq_concat',
               'uni_bin', 'bi_bin', 'concat_bin']

# NOTE: permutation_test_ is defined in a later cell of this notebook; that
# cell has to be executed before this one.
# The loop variables are named differently from `models` so the list is not
# replaced by the last pair and the cell can be re-run.
for (first_name, first_pred), (second_name, second_pred) in itertools.combinations(zip(model_names, models), 2):
    print("{} vs {}:".format(first_name, second_name))
    permutation_test_(first_pred, second_pred, y_test)

In [None]:
def permutation_test_(y1, y2, y_true, r=5000):
    """Paired permutation test on the accuracy difference of two classifiers.

    For every sample it is recorded whether each classifier predicted the true
    label. In each of ``r`` random rounds the two outcomes of a sample are
    swapped with probability 1/2, and the absolute difference of the two
    accuracies is recomputed. The p-value is the add-one smoothed share of
    rounds whose difference is at least as large as the observed one:
    ``(count + 1) / (r + 1)``.

    Parameters
    ----------
    y1, y2 : array-like
        Predictions of the two classifiers for the same samples.
    y_true : array-like
        True labels, same length as ``y1`` and ``y2``.
    r : int, default 5000
        Number of random permutation rounds.

    Returns
    -------
    float
        The p-value, which is also printed.

    Raises
    ------
    ValueError
        If the inputs are not non-empty 1-D sequences of equal length, or if
        ``r`` is smaller than 1.

    Notes
    -----
    Swaps are drawn with ``np.random.randint``, so ``np.random.seed`` makes
    the result reproducible. All ``r`` rounds are drawn at once as an
    ``(r, n_samples)`` integer array.
    """
    y_true = np.asarray(y_true)
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    if y_true.ndim != 1 or y1.shape != y_true.shape or y2.shape != y_true.shape:
        raise ValueError("y1, y2 and y_true must be 1-D and of equal length")
    if y_true.size == 0:
        raise ValueError("permutation test needs at least one sample")
    if r < 1:
        raise ValueError("r must be at least 1")

    # Per-sample difference in correctness: +1, 0 or -1. Working with integer
    # sums instead of float means keeps the >= comparison below exact.
    correct_diff = (y1 == y_true).astype(np.int64) - (y2 == y_true).astype(np.int64)
    observed = abs(int(correct_diff.sum()))

    # Swapping the two outcomes of a sample flips the sign of its difference.
    flips = np.random.randint(2, size=(r, correct_diff.size))
    permuted = np.abs((1 - 2 * flips) @ correct_diff)

    samples = int(np.count_nonzero(permuted >= observed))
    p_value = (samples + 1.0) / (r + 1.0)
    print(p_value)
    return p_value

In [None]:
# NOTE(review): total_y_pred_dm and total_y_pred_bigram are not assigned in any
# visible cell of this notebook — this call presumably depends on state from
# cells that no longer exist; confirm or remove.
permutation_test_(total_y_pred_dm, total_y_pred_bigram, total_y_test)

In [None]:
# NOTE(review): exact duplicate of the previous cell; total_y_pred_dm and
# total_y_pred_bigram are not assigned in any visible cell — confirm or remove.
permutation_test_(total_y_pred_dm, total_y_pred_bigram, total_y_test)

In [None]:
SVM_with_bow_param_grid = {
                                    'svm_kernel' : ['rbf', 'linear', 'poly'],
                                    'unigrams': [True, False], 
                                    'bigrams' : [True, False],
                                    'C': [0.1, 1, 10],
                                    'gamma': [1e-3, 1e-4]
                          }

# CountVectorizer expects one string per document, so re-join the token lists.
bow_documents = [' '.join(tokens) for tokens in X_train]
# Feature matrix per n-gram range, built once and shared by all SVM settings.
# NOTE: the vocabulary is learned on the whole training set before the
# cross-validation split, so validation folds contribute to the vocabulary.
bow_features = {}
# One entry per evaluated setting: parameters, predictions and labels.
bow_results = []

grid = ParameterGrid(SVM_with_bow_param_grid)
for params in grid:
    if not (params['unigrams'] or params['bigrams']):
        # Neither unigrams nor bigrams selected: there is nothing to featurize.
        continue
    print(params)

    ngram_range = (1 if params['unigrams'] else 2, 2 if params['bigrams'] else 1)
    if ngram_range not in bow_features:
        bow_vectorizer = CountVectorizer(ngram_range=ngram_range, token_pattern=r'\b\w+\b', min_df=1)
        bow_features[ngram_range] = bow_vectorizer.fit_transform(bow_documents)
    X_bow = bow_features[ngram_range]

    # gamma is only passed for the non-linear kernels.
    if params['svm_kernel'] != 'linear':
        model = SVC(kernel=params['svm_kernel'], C=params['C'], gamma=params['gamma'])
    else:
        model = SVC(kernel=params['svm_kernel'], C=params['C'])

    # Run cross-validation for every kernel, not only for the linear one.
    total_y_pred, total_y_test= cross_validation(X_bow, y_train, model)

    model_results = {}
    model_results['predictions'] = total_y_pred
    model_results['labels'] = total_y_test
    model_results['params'] = dict(params)
    bow_results.append(model_results)
    # Results are not persisted; a dump needs five placeholders for five values:
    #with open("svm/{}_{}_{}_{}_{}.pkl".format(params['svm_kernel'], params['C'], params['gamma'], params['unigrams'], params['bigrams']), 'wb') as result_file:
    #    pickle.dump(model_results, result_file)

In [None]:
# Collect the content of every pickle file stored in svm/.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# that this notebook produced itself.
model_accuracies = []

for f in os.listdir("svm"):
    if not f.endswith('.pkl'):
        continue
    with open("svm/"+f, "rb") as openfile:
        model_accuracies.append(pickle.load(openfile))

In [None]:
# Display everything that was loaded from the svm/ pickle files.
model_accuracies

In [None]:
# Inspect the container type of the held-out documents. X_val only exists as a
# local variable inside cross_validation, so the test split is inspected instead.
type(X_test)

In [None]:
# Refit the RBF SVM on vectors inferred by this Doc2Vec model and score it on
# the held-out test fold. `doc2vec`, `model` and `y_pred` stay in scope and are
# read by later cells of this notebook.
doc2vec = Doc2Vec.load("doc2vec/10_200_10_0_2_0_5.model")
X_features = np.array([doc2vec.infer_vector(x) for x in X_train])
X_test_final = np.array([doc2vec.infer_vector(x) for x in X_test])


model = SVC(kernel='rbf', C=10, gamma=0.001)
model.fit(X_features, y_train)
y_pred = np.array(model.predict(X_test_final))
# Accuracy on the held-out test fold.
print(((y_pred == np.array(y_test)).mean()))

In [None]:
# Positions within the test split where the prediction differs from the label.
# np.nonzero returns a tuple with one index array per dimension, so the number
# of misclassified reviews is the length of its first element, not of the tuple.
test = np.nonzero(np.not_equal(y_pred, y_test))
print(test)
print(len(test[0]))

In [None]:
# Positions within the test split where the prediction differs from the label.
# np.nonzero returns a tuple with one index array per dimension, so the number
# of misclassified reviews is the length of its first element, not of the tuple.
test = np.nonzero(np.not_equal(y_pred, y_test))
print(test)
print(len(test[0]))

In [None]:
# Print the source file name of every review in the test split.
test2 = np.array(test_idxs)
for file_idx in test2:
    print(filenames[file_idx])

In [None]:
# Display the test documents selected by `test` (the index tuple produced by
# np.nonzero in the cells above).
X_test[test]

In [None]:
# Count the misclassified test reviews that contain 'not' or 'but'.
# `test` indexes the test split, so it is applied to X_test; X_val only exists
# as a local variable inside cross_validation.
counter = sum(
    1 for review in X_test[test]
    if any(token == 'not' or token == 'but' for token in review)
)

print(counter)
               

In [None]:
# Read the negation sentences: the first character of a line decides the
# label, and the text from the third character on is tokenised and lower-cased.
negation, labels = [], []
with open("datasets/sentiment_negation.txt", 'r') as demo:
    for l in demo:
        negation.append([tok.lower() for tok in nltk.word_tokenize(l[2:])])
        # A leading '0' or '1' maps to label 0, anything else to label 1.
        labels.append(0 if l[0] in ('0', '1') else 1)
print(negation)

In [None]:
# Score the negation sentences with whatever `doc2vec` and `model` currently
# hold; neither is assigned in this cell. When the notebook is run top to
# bottom these are the Doc2Vec model and RBF SVM from the refit cell above.
X_val_final = np.array([doc2vec.infer_vector(x) for x in negation])
# NOTE: this overwrites the y_pred of the test-fold evaluation.
y_pred = model.predict(X_val_final)
print(y_pred)
print(labels)
# Share of negation sentences whose prediction equals the label.
print(((y_pred == labels).mean()))

## Deployment Test

In [None]:
# Load the deployment data set; the cells below read its first two columns by position.
df = pd.read_csv('datasets/new_movies.csv') 

In [None]:
# First column as a NumPy array. DataFrame.ix was removed in pandas 1.0;
# .iloc is the positional indexer.
df.iloc[:, 0].to_numpy()

In [None]:
# Second column holds the review text, first column the labels.
# DataFrame.ix was removed in pandas 1.0; .iloc is the positional indexer.
X_deployment = df.iloc[:, 1]
y_deployment = df.iloc[:, 0].to_numpy()

In [None]:
# Tokenise and lower-case every deployment review.
new = X_deployment.tolist()
reviews = [[tok.lower() for tok in nltk.word_tokenize(item)] for item in new]
# Both lengths are printed so a mismatch between reviews and labels is visible.
print(len(reviews))
print(len(y_deployment))

In [None]:
# Infer vectors for the deployment reviews and score them.
# NOTE(review): `model` is not fitted in this cell; it is whatever SVC was
# fitted last. When the notebook is run top to bottom that is the RBF SVM
# trained on vectors from this same Doc2Vec file — confirm before relying on it.
doc2vec = Doc2Vec.load('doc2vec/10_200_10_0_2_0_5.model')
X_test_deployment = np.array([doc2vec.infer_vector(x) for x in reviews])
y_pred = model.predict(X_test_deployment)
# Share of deployment reviews whose prediction equals the label.
print(((y_pred == y_deployment).mean()))