In [1]:
%pylab
import pandas as pd
import csv
import nltk
from nltk.stem import WordNetLemmatizer
import sys
numpy.set_printoptions(threshold=sys.maxsize)

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
df1 = pd.read_csv('amazon_cells_labelled.txt', sep='\t', names=['review','score'])
print("Loaded %s reviews from amazon." % df1.shape[0])
df2 = pd.read_csv('yelp_labelled.txt', sep='\t', names=['review','score'])
print("Loaded %s reviews from yelp." % df2.shape[0])
df3 = pd.read_csv('imdb_labelled.txt', sep='\t', names=['review','score'], quoting=csv.QUOTE_NONE)
print("Loaded %s reviews from imdb." % df3.shape[0])
df = pd.concat([df1,df2,df3])
df.tail()

Loaded 1000 reviews from amazon.
Loaded 1000 reviews from yelp.
Loaded 1000 reviews from imdb.


Unnamed: 0,review,score
995,I just got bored watching Jessice Lange take h...,0
996,"Unfortunately, any virtue in this film's produ...",0
997,"In a word, it is embarrassing.",0
998,Exceptionally bad!,0
999,All in all its an insult to one's intelligence...,0


In [3]:
df.review = df.review.str.lower()
df.review = df.review.str.replace("[-.,?!`']+", ' ')
df.review = df.review.str.replace(' the ', ' ')
df.review = df.review.str.replace(' and ', ' ')
df.review = df.review.str.replace(' or ', ' ')
df.review = df.review.str.replace(' a ', ' ')
df.review = df.review.str.replace(' an ', ' ')

  


### Lemmatization with NLTK
https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

In [4]:
def lemmatize(sentence, lemmatizer):
    word_list = nltk.word_tokenize(sentence)
    return [lemmatizer.lemmatize(word) for word in word_list]

lemmatizer = WordNetLemmatizer()
df.review = df.review.apply(lemmatize, args=[lemmatizer])
print(df.review.iloc[0])

['so', 'there', 'is', 'no', 'way', 'for', 'me', 'to', 'plug', 'it', 'in', 'here', 'in', 'u', 'unless', 'i', 'go', 'by', 'converter']


In [5]:
df1 = df.iloc[:1000]
df2 = df.iloc[1000:2000]
df3 = df.iloc[2000:]

df_list = [df1, df2, df3]

def split_train_test(df):
    return (pd.concat([df[df["score"] == 0].iloc[:400],df[df["score"] == 1].iloc[:400]]),
            pd.concat([df[df["score"] == 0].iloc[400:],df[df["score"] == 1].iloc[400:]]))

train_list = []
test_list = []

for dfi in df_list:
    train, test = split_train_test(dfi)
    train_list.append(train)
    test_list.append(test)
    
df_train = pd.concat(train_list).reset_index(drop=True)
df_test = pd.concat(test_list).reset_index(drop=True)
print('We split the data into %s train reviews and %s test reviews.' % (df_train.shape[0], df_test.shape[0]))

We split the data into 2400 train reviews and 600 test reviews.


In [6]:
count = {}

# First pass to put every word of train reviews in dictionary
for row in df_train.review.array:
    for word in row:
        if word not in count:
            count[word] = 0

# Second pass to count words of train and test reviews
feat_matrix = []
for row in df_train.review.array:
    feat_vec = []
    for word in count.keys():
        if word in row:
            feat_vec.append(row.count(word))
        else:
            feat_vec.append(0)
    feat_matrix.append(feat_vec)
    
feat_matrix_train = np.array(feat_matrix)

feat_matrix = []
for row in df_test.review.array:
    feat_vec = []
    for word in count.keys():
        if word in row:
            feat_vec.append(row.count(word))
        else:
            feat_vec.append(0)
    feat_matrix.append(feat_vec)
    
feat_matrix_test = np.array(feat_matrix)

df_train = pd.concat([df_train, pd.DataFrame(feat_matrix_train)], axis=1)
df_test = pd.concat([df_test, pd.DataFrame(feat_matrix_test)], axis=1)
print("Here is a vector:")
print(feat_matrix_train[666])
print("\nand here is another vector:")
print(feat_matrix_train[420])

Here is a vector:
[0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 

### Post-processing
We will plot the different transformations suggested in the guidelines.

In [7]:
x = np.linspace(0,10,100)
y = np.log10(x+1)

plt.plot(x,y)
plt.title('Log-normalization')
plt.show()

### Naive Bayes Classifier
This is a conditional probability classifier: we want to estimate the following

$$\mathbb{P}[Y=1|X=x] = \frac{\mathbb{P}[Y=1]\mathbb{P}[X=x|Y=1]}{\mathbb{P}[X=x]}$$

Under the naive conditional independance assumption, we can focus on estimating the following:

$$\mathbb{P}[Y=1]\prod_{i=1}^p\mathbb{P}[X_i=x_i|Y=1]$$

Let us first compute $\mathbb{P}[Y=1]$:

In [8]:
probY_1 = df_train.score.value_counts(normalize=True).loc[1]
print("The prior probability that Y=1 is "+str(probY_1))

The prior probability that Y=1 is 0.5


In [9]:
probXgivenYmap = []

for col in df_train.columns[2:]:
    probXjgivenY = df_train[col][df_train["score"] == 1].value_counts(normalize=True).to_dict()
    probXgivenYmap.append(probXjgivenY)

In [10]:
for i in range(len(probXgivenYmap)):
    df_test['P'+str(i)] = df_test.iloc[:,i+2].map(probXgivenYmap[i])

df_test.head(3)

Unnamed: 0,review,score,0,1,2,3,4,5,6,7,...,P4237,P4238,P4239,P4240,P4241,P4242,P4243,P4244,P4245,P4246
0,"[doesn, t, work]",0,0,0,0,0,0,0,0,0,...,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167
1,"[unfortunately, it, did, not, work]",0,0,0,0,0,0,0,0,0,...,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167
2,"[all, three, broke, within, two, month, of, use]",0,0,0,0,0,0,0,0,0,...,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167,0.999167


In [11]:
probXgivenY = np.array(df_test[["P"+str(i) for i in range(len(probXgivenYmap))]])
probXgivenY.shape

(600, 4247)

In [12]:
probXgivenY = np.prod(probXgivenY, axis=1)
probXgivenY = probXgivenY * probY_1
probXgivenY.shape

(600,)

In [13]:
probXgivenY >= 0.5

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [14]:
print(probXgivenY)

[1.12107937e-10            nan 1.16173164e-22            nan
            nan 1.88585540e-32            nan            nan
 5.58466528e-17            nan 1.45822900e-25            nan
            nan            nan            nan 1.08940997e-14
 2.41180259e-15            nan            nan 1.35332324e-12
            nan 3.32386626e-26 6.03953571e-12            nan
            nan 6.35936315e-17            nan 7.84070693e-15
 8.54891410e-12 2.68185809e-23            nan            nan
 5.87974047e-15            nan            nan 1.15414551e-25
            nan 1.73831220e-30 1.49847598e-10            nan
            nan 4.10317996e-16            nan 6.73618826e-10
            nan 3.34665949e-24            nan            nan
 1.56626602e-14            nan 1.65641711e-29            nan
            nan 3.92687066e-13            nan 1.32953644e-29
            nan 3.15370330e-10 7.07959486e-27            nan
            nan            nan 4.37915462e-35            nan
            nan         