In [1]:
import io
import re
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
import scipy
from scipy import sparse
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/manika/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
#read the train dataset
train = pd.read_csv(
    filepath_or_buffer='./train.dat', 
    header=None, 
    sep='\n')

#separate reviews from classes
vals = train.iloc[:,:].values

train_reviews=[]
train_labels=[]

for val in vals:
    train_labels.append(val[0][0:2])
    train_reviews.append(val[0][2:])
    
#read the test dataset
test = pd.read_csv(
    filepath_or_buffer='./test.dat', 
    header=None, 
    sep='\n')

vals = test.iloc[:,:].values

#separate reviews 
test_reviews=[]
for val in vals:
    test_reviews.append(val[0])

In [5]:
# remove html tags and numbers
def clean_review(review):
    #removing html tags
    cleanReview = re.sub(re.compile('<.*?>'), '', review)
    
    #removing digits
    cleanReview = re.sub(re.compile(r'\d.*?\d+'),'', cleanReview)
    #remove urls
    cleanReview = re.sub(r'http\S+', ' ', cleanReview)
    #remove punctuation except apostrophe
    cleanReview = re.sub(ur"[^\w\d'\s]+",' ',cleanReview)
    cleanReview = re.sub(r'(^[ \t]+|[ \t]+(?=:))', '', cleanReview, flags=re.M)
    
    return cleanReview

In [6]:
cleanedReviewsTrain = []
for review in train_reviews:
    cleanedReviewsTrain.append(clean_review(review))

In [7]:
#stemming and Lemmatization of reviews
from nltk.stem import PorterStemmer, WordNetLemmatizer

def stem_review(review):
    port = PorterStemmer()
    return " ".join([port.stem(i) for i in review.split()])

def lemmatized_review(review):
    wnl = WordNetLemmatizer()
    return " ".join([wnl.lemmatize(i, 'v') for i in review.split()])

In [8]:
stemmedReviewsTrain=[]
for review in cleanedReviewsTrain:
    stemmedReviewsTrain.append(stem_review(review))
    
lemmatizedReviewsTrain=[]
for review in stemmedReviewsTrain:
    lemmatizedReviewsTrain.append(lemmatized_review(review))

In [9]:
#lowercase the reviews
def lower_case(reviews):
    return [l.lower() for l in reviews]

#split the reviews into words
def split_reviews(reviews):
    return [l.split() for l in reviews]

In [10]:
# transform train reviews into lists of lowercase words
lowerReviewsTrain = lower_case(lemmatizedReviewsTrain)
splitReviewsTrain = split_reviews(lowerReviewsTrain)

In [11]:
# remove words with length < 4
def filterLen(reviews, minlen):
    r""" filter out terms that are too short. 
    docs is a list of lists, each inner list is a document represented as a list of words
    minlen is the minimum length of the word to keep
    """
    return [t for t in reviews if len(t) >= minlen ]

In [12]:
filteredReviewsTrain=[]
for review in splitReviewsTrain:
    filteredReviewsTrain.append(' '.join(filterLen(review, 4)))

In [16]:
from  sklearn.feature_extraction.text import TfidfVectorizer

#create csr matrix for train set
matrix_train = TfidfVectorizer(norm='l2', ngram_range=(1, 3))
csrTrain = matrix_train.fit_transform(filteredReviewsTrain)

In [17]:
csrTrain.shape

(25000, 4064519)

In [18]:
cleanedReviewsTest = []
for review in test_reviews:
    cleanedReviewsTest.append(clean_review(review))

In [19]:
#stemming and lemmatization of reviews
stemmedReviewsTest=[]
for review in cleanedReviewsTest:
    stemmedReviewsTest.append(stem_review(review))
    
lemmatizedReviewsTest=[]
for review in stemmedReviewsTest:
    lemmatizedReviewsTest.append(lemmatized_review(review))

In [20]:
# transform test reviews into lists of lowercase words
lowerReviewsTest = lower_case(lemmatizedReviewsTest)
splitReviewsTest = split_reviews(lowerReviewsTest)

In [21]:
#filter the small length words
filteredReviewsTest=[]
for review in splitReviewsTest:
    filteredReviewsTest.append(' '.join(filterLen(review, 4)))

In [25]:
#create csr matrix for test set
csrTest = matrix_train.transform(filteredReviewsTest)

In [27]:
csrTest.shape

(25000, 4064519)

In [29]:
#calculate similarity of train and test
similarity_matrix = np.dot(csrTest, csrTrain.T)

In [30]:
#convert csr to numpy array
similarity_array = similarity_matrix.toarray()

In [31]:
similarity_array.shape

(25000, 25000)

In [37]:
#find k- most similar reviews to every review
def kNearestNeighbours(array, k):
    return [np.argpartition(review, -k)[-k:] for review in similarity_array]

indices = kNearestNeighbours(similarity_array, 200)

#predict the output using weighted average
def predicted_values(indices):
    count = 0
    predicted_values = []

    for row in indices:
        values = [int(train_labels[i]) for i in row]
        weights = [similarity_array[count][i] for i in row]
        if np.sum(weights != 0):
            weighted_average = np.ma.average(values, weights=weights)
        if weighted_average > 0:
            predicted_values.append('+1')
        else:
            predicted_values.append('-1')
        count = count + 1

    f = open('prediction.dat','w')
    for prediction in predicted_values:
        print >>f, prediction
    f.close()
    
predicted_values(indices)