# Homeworks

Base code for basic data we used during the whole week

In [14]:
#Import statements - To keed this in order
from nltk.corpus import opinion_lexicon
import urllib.request, os, gzip
import  json
import random
import numpy # a powerfull module
from nltk.corpus import stopwords # We use it to remove the stopwords of the comments since they dont provide relevant info
from nltk.tokenize import sent_tokenize, word_tokenize # Divide text in sentences and then in words
from sklearn.linear_model import LinearRegression # sklearn is a machine learning toolkit (needs numpy, scipy and matplotlib)

In [2]:
%matplotlib inline
datadir = './data/'

def download_data(dataset_name, datadir):
    filename = 'reviews_%s_5.json' % dataset_name
    filepath = os.path.join(datadir, filename)
    if os.path.exists(filepath):
        print("Dataset %s has already been downloaded to %s" % (dataset_name, datadir))
    else:
        url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/%s.gz' % filename
        urllib.request.urlretrieve(url, filepath + ".gz")
        with gzip.open(filepath + ".gz", 'rb') as fin:
            with open(filepath, 'wb') as fout:
                fout.write(fin.read())
        print("Downloaded dataset %s and saved it to %s" % (dataset_name, datadir))

dataset = "Baby"
download_data(dataset, datadir)

Dataset Baby has already been downloaded to ./data/


In [3]:
def  load_data (dataset_name, datadir):
    filepath = os.path.join(datadir, 'reviews_%s_5.json' % dataset_name)
    if not os.path.exists(filepath):
        download_data(dataset_name, datadir)
    data = []
    with open(filepath, 'r') as f:
        for line in f:                            # read file line by line
            item_hash = hash(line)                # we will use this later for partitioning our data 
            item = json.loads(line)               # convert JSON string to Python dict
            item['hash'] = item_hash              # add hash for identification purposes
            data.append(item)
    print("Loaded %d data for dataset %s" % (len(data), dataset_name))
    return data

# load the data...
baby = load_data(dataset, datadir)

Loaded 160792 data for dataset Baby


In [4]:
def partition_train_validation_test(data):
    # 60% : modulus is 0, 1, 2, 3, 4, or 5
    data_train = [item for item in data if item['hash']%10<=5]  
    # 20% : modulus is 6 or 7
    data_valid = [item for item in data if item['hash']%10 in [6,7]] 
    # 20% : modulus is 8 or 9
    data_test  = [item for item in data if item['hash']%10 in [8,9]] 
    return data_train, data_valid, data_test
    
baby_train, baby_valid, baby_test = partition_train_validation_test(baby)

print("Now we have", len(baby_train), "training examples,", len(baby_valid),
      "validation examples, and", len(baby_test), "test examples")


Now we have 96291 training examples, 32244 validation examples, and 32257 test examples


In [9]:
eng_stopwords = set(stopwords.words('english'))
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

def my_tokenize(text):
    # split text into lower-case tokens, removing all-punctuation tokens and stopwords
    tokens = []
    for sentence in sent_tokenize(text):
        #Adds to the array and array with the words in lowercase, we add them if they are not stopwords and there is at least one letter in it
        tokens.extend(x for x in word_tokenize(sentence.lower()) #continues down...
                      if x not in eng_stopwords and any(i.isalpha() for i in x))# This extends the list by adding elements, it is different from append... see https://stackoverflow.com/questions/252703/difference-between-append-vs-extend-list-methods-in-python 
    return tokens

def pos_neg_fraction(text): # We recieve the raw text
    tokens = my_tokenize(text) # We tokenize the text first
    count_pos, count_neg = 0, 0
    for t in tokens:
        if t in positive_words:
            count_pos += 1
        if t in negative_words:
            count_neg += 1
    count_all = len(tokens) # this is because we need to be sure there is no 0 len sentence
    if count_all != 0:
        return count_pos/count_all, count_neg/count_all
    else:
        return 0., 0.
    
pos_example = 'This is a good, great, fantastic, amazing, wonderful, super product!!!'
neg_example = 'This is a bad, atrocious, terrible, dreadful, awful, abysmal product!!!'
print(pos_neg_fraction(pos_example))
print(pos_neg_fraction(neg_example))

(0.8571428571428571, 0.0)
(0.0, 0.8571428571428571)


In [12]:
def dataset_to_matrix(data):
    # data is a lot of text in {} that has reviwer name..comment...date...etc.. but all identified with text labels
    #in that sence data is an unidiminsional array
    # the item "atribute", we added it to the data to identify the sections of it
    return numpy.array([list(pos_neg_fraction(item['reviewText'])) for item in data])
# X_train with two columns and as many rows as there are examples in the data set. 
#The first column contains the fraction of positive words, 
#while the second column contains the fraction of negative words for each example.
X_train = dataset_to_matrix(baby_train)
most_pos, most_neg = numpy.argmax(X_train, axis=0) # find maximum ROW (axis 0).. through aaaaallll the data (OMG!)
# print the example with the highest fraction of positive words:
print("We found a fraction of %f %% positive words for example %d" % 
      (100.*X_train[most_pos, 0], most_pos))
print(baby_train[most_pos])
print("We found a fraction of %f %% negative words for example %d" %
      (100.*X_train[most_neg, 1], most_neg))
print(baby_train[most_neg])

We found a fraction of 100.000000 % positive words for example 10561
{'reviewerID': 'A2FPJGVT01TVVQ', 'asin': 'B000A88JYQ', 'reviewerName': 'S. Broderick "Sondra"', 'helpful': [0, 0], 'reviewText': 'work perfect', 'overall': 5.0, 'summary': 'Five Stars', 'unixReviewTime': 1404691200, 'reviewTime': '07 7, 2014', 'hash': -8116255050029596758}
We found a fraction of 100.000000 % negative words for example 24850
{'reviewerID': 'A1SLEYD29KEUW1', 'asin': 'B000WUD83O', 'reviewerName': 'ABDULLAH AL-FALAH', 'helpful': [0, 0], 'reviewText': 'too noisy', 'overall': 2.0, 'summary': 'Two Stars', 'unixReviewTime': 1404432000, 'reviewTime': '07 4, 2014', 'hash': -8656440585229339815}


In [13]:
def  dataset_to_targets (data):
    return numpy.array([item['overall'] for item in data])

Y_train = dataset_to_targets(baby_train)
print("Our feature matrix is two-dimensional and has shape", X_train.shape) # contains pos,neg fraction
print("Our target vector is one-dimensional and has shape", Y_train.shape) # containd sscore

Our feature matrix is two-dimensional and has shape (96291, 2)
Our target vector is one-dimensional and has shape (96291,)


## Day 1

In [15]:
lreg = LinearRegression().fit(X_train,Y_train)
print("The coefficient for the fpos variable is", lreg.coef_[0])
print("The coefficient for the fneg variable is", lreg.coef_[1])
print("The intercept is", lreg.intercept_)

The coefficient for the fpos variable is 3.25703026268183
The coefficient for the fneg variable is -5.593432938808693
The intercept is 4.004491648095601


In [17]:
#If the review contains 20% positive words (fpos==0.2) 
#but still no negative words (fneg==0), we would expect the following rating:
features = [[0.2, 0]]
expected_rating_A = lreg.predict(features)[0]
print("The expected rating is %f stars" % expected_rating_A)
# we can also compute this explicitly:
expected_rating_B = lreg.intercept_ + 0.2*lreg.coef_[0] + 0*lreg.coef_[1]
print("This is the same as %f stars" % expected_rating_B)
#However, if the review contains no positive words (fpos==0) but 20% negative words (fneg==0.2),
#we expect the following rating:
features = [[0, 0.2]]
expected_rating_A = lreg.predict(features)[0]
print("The expected rating is %f stars" % expected_rating_A)
# we can also compute this explicitly:
expected_rating_B = lreg.intercept_ + 0 * lreg.coef_[0] + 0.2 * lreg.coef_[1]
print("This is the same as %f stars" % expected_rating_B)

The expected rating is 4.655898 stars
This is the same as 4.655898 stars
The expected rating is 2.885805 stars
This is the same as 2.885805 stars


#### - calculate the prediction for 100% pos, and 100% neg review

In [25]:
features_100pos=[[1,0]]
features_100neg=[[0,1]]
expected_rating_pos = lreg.predict(features_100pos)[0] # I think 0 is for linear as in kernel = {‘linear’, ‘rbf’, ‘poly’, ‘sigmoid’, ‘precomputed’}
expected_rating_neg = lreg.predict(features_100neg)[0]
print("The expected rating for 100 pos review is %f stars" % expected_rating_pos)
print("The expected rating  for 100 neg review is %f stars" % expected_rating_neg)
# This model needs a threshold to limit the starts between 0 and 5

The expected rating for 100 pos review is 7.261522 stars
The expected rating  for 100 neg review is -1.588941 stars


#### - Repeat this same process for "Apps for Android" dataset