In [1]:
from bs4 import BeautifulSoup
import nltk
"""nltk.download('stopwords')
nltk.download('wordnet')"""
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re


In [8]:
def initialize(traindata, testdata):
    """Read the labelled training file and the unlabelled test file.

    Every non-blank training line is split once on its first tab character:
    the part before the tab is the label, the part after it (including the
    trailing newline) is the review text.

    Args:
        traindata: Path of the tab-separated training file.
        testdata: Path of the test file, read line by line.

    Returns:
        A tuple ``(train_reviews, test_lines, train_labels)``. The first and
        last lists are aligned index by index; ``test_lines`` is exactly what
        ``readlines()`` returned for the test file.

    Raises:
        IndexError: If a non-blank training line contains no tab.
    """
    with open(traindata, "r") as train_file:
        train_lines = train_file.readlines()
    with open(testdata, "r") as test_file:
        # Test lines are deliberately not filtered, so anything produced per
        # test line keeps a one-to-one correspondence with the input file.
        test_lines = test_file.readlines()

    train_labels = []
    train_reviews = []
    for line in train_lines:
        # A whitespace-only line (e.g. a stray empty last line) carries no
        # label/review pair; skipping it avoids an IndexError below.
        if not line.strip():
            continue
        parts = line.split("\t", 1)
        train_labels.append(parts[0])
        train_reviews.append(parts[1])
    return train_reviews, test_lines, train_labels

In [9]:
def storeclean(reviews):
    """Run ``clean`` over every review and return the results in input order.

    Args:
        reviews: Iterable of raw review strings.

    Returns:
        A new list holding ``clean(review)`` for each item of ``reviews``.
    """
    return [clean(review) for review in reviews]

In [10]:
# Patterns used by clean(), compiled once instead of on every call.
EMAIL_PATTERN = re.compile(r'([\w\.-]+@[\w\.-]+\.\w+)')

# URL matcher. The pieces are joined by implicit concatenation of adjacent raw
# literals. A backslash followed by a newline inside a raw string is NOT a line
# continuation: it leaves the backslash, the newline and the next line's
# indentation in the pattern, so the alternatives after each break could only
# match text that contained that exact whitespace.
URL_PATTERN = re.compile(
    r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'
    r'[a-z0-9.\-]+[.][a-z]{2,4}/|[a-z0-9.\-]+[.][a-z])(?:[^\s()<>]+|\(([^\s()<>]+|'
    r'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
)

_SMILEYS = """:-) :) :o) :D :-D :( :-( :o(""".split()
# NOTE(review): the escaped smileys are spliced into a character class, so this
# keeps the single characters ':', '-', '(', ')' and '|' next to letters; it
# does not recognise whole smileys. Left unchanged so the extracted features
# stay the same -- confirm whether real smiley tokens were intended.
DROP_CHARS_PATTERN = re.compile("[^a-zA-Z" + "|".join(map(re.escape, _SMILEYS)) + "]")

# Filled on first use so that defining this cell does not touch the NLTK corpora.
_CLEAN_RESOURCES = {}


def _clean_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it once."""
    if not _CLEAN_RESOURCES:
        _CLEAN_RESOURCES["stops"] = set(stopwords.words("english"))
        _CLEAN_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _CLEAN_RESOURCES["stops"], _CLEAN_RESOURCES["lemmatizer"]


def clean(raw):
    """Turn one raw review into a string of lower-cased, lemmatized words.

    Steps, in order: extract the text from the markup, delete e-mail
    addresses, delete URLs, replace every character outside the kept set with
    a space, lower-case and split on whitespace, drop English stop words and
    words of three characters or fewer, then lemmatize what is left.

    Args:
        raw: The raw review text, possibly containing HTML markup.

    Returns:
        The surviving lemmas, each followed by a single space (so a non-empty
        result ends with a space; an empty result is ``''``).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    text = BeautifulSoup(raw, "html.parser").get_text()
    text = EMAIL_PATTERN.sub('', text)
    text = URL_PATTERN.sub('', text)
    text = DROP_CHARS_PATTERN.sub(" ", text)

    stops, lemmatizer = _clean_resources()
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in text.lower().split()
        if word not in stops and len(word) > 3
    ]
    # Keep the historical "word + space" layout, built without quadratic +=.
    return "".join(lemma + " " for lemma in lemmas)


In [11]:
def TFID(train_data, test_data):
    """Build TF-IDF matrices for the training and test documents.

    A single ``TfidfVectorizer`` with L2 row normalisation is fitted on
    ``train_data`` only and then reused to transform ``test_data``.

    Args:
        train_data: Training documents passed to ``fit_transform``.
        test_data: Test documents passed to ``transform``.

    Returns:
        A tuple ``(train_matrix, test_matrix)`` as produced by the vectorizer.
    """
    tfidf = TfidfVectorizer(norm='l2')
    fitted_train = tfidf.fit_transform(train_data)
    return fitted_train, tfidf.transform(test_data)

In [12]:
def similar(trainTFID, testTFID):
    """Return the dense matrix of dot products between test and train rows.

    Entry ``[i, j]`` is the dot product of test row ``i`` with train row ``j``.
    When the rows are L2-normalised this value is their cosine similarity.

    Args:
        trainTFID: 2-D train matrix, SciPy sparse or a NumPy array.
        testTFID: 2-D test matrix with the same number of columns.

    Returns:
        A dense ``numpy.ndarray`` of shape ``(n_test_rows, n_train_rows)``.
    """
    # Multiply through the operands' own .dot()/.T rather than np.dot and
    # np.transpose: NumPy functions do not understand SciPy sparse containers
    # and only appear to work through an object-array fallback.
    product = testTFID.dot(trainTFID.T)
    if hasattr(product, "toarray"):
        # Sparse result: densify, as callers index and sort rows directly.
        return product.toarray()
    return np.asarray(product)

def KNN(similarity_vector, k):
    """Return the indices of the ``k`` largest entries of ``similarity_vector``.

    Args:
        similarity_vector: NumPy array of similarity scores.
        k: Maximum number of indices to return.

    Returns:
        An index array ordered from highest to lowest score; it is shorter
        than ``k`` when the vector has fewer than ``k`` entries.
    """
    # Sorting the negated scores ascending yields a descending ranking.
    descending_order = np.argsort(-similarity_vector)
    return descending_order[:k]
     
def predict(nearestNeighbors, labels):
    """Majority-vote a sentiment from the labels of the nearest neighbours.

    A neighbour votes positive when ``int(labels[index]) == 1``; every other
    label value counts as a negative vote.

    Args:
        nearestNeighbors: Iterable of indices into ``labels``.
        labels: Sequence of labels convertible with ``int``.

    Returns:
        ``1`` when positive votes strictly outnumber negative ones, otherwise
        ``-1`` (ties and an empty neighbour list therefore give ``-1``).
    """
    votes = [int(labels[index]) == 1 for index in nearestNeighbors]
    positives = sum(votes)
    negatives = len(votes) - positives
    return 1 if positives > negatives else -1

In [13]:
# Load the training reviews with their labels and the raw test lines.
raw_train_reviews, raw_test_reviews, train_sentiments = initialize('trainPR2.dat', 'testPR2.dat')

# Both sets go through the same cleaning routine before vectorization.
train_reviews, test_reviews = map(storeclean, (raw_train_reviews, raw_test_reviews))

# TF-IDF matrices for the cleaned training and test reviews.
train_matrix, test_matrix = TFID(train_reviews, test_reviews)

In [14]:
# One row per test review, one column per training review.
similarities = similar(trainTFID=train_matrix, testTFID=test_matrix)

In [15]:
k = 287  # number of nearest training reviews that vote on each test review
result = []
for row in similarities:
    neighbours = KNN(row, k)
    vote = predict(neighbours, train_sentiments)
    # Labels are written with an explicit sign.
    result.append('+1' if vote == 1 else '-1')

In [16]:
# Write one predicted label per line. The context manager closes the file even
# if a write fails part-way through.
with open('pr2_output_v11.dat', 'w') as output:
    output.writelines("%s\n" % i for i in result)