In [1]:
import pandas as pd

filepath_dict = {'yelp':   'sentiment labelled sentences/yelp_labelled.txt',
                 'amazon': 'sentiment labelled sentences/amazon_cells_labelled.txt',
                 'imdb':   'sentiment labelled sentences/imdb_labelled.txt'}

In [2]:
filepath_dict

{'amazon': 'sentiment labelled sentences/amazon_cells_labelled.txt',
 'imdb': 'sentiment labelled sentences/imdb_labelled.txt',
 'yelp': 'sentiment labelled sentences/yelp_labelled.txt'}

In [3]:
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

In [20]:
import numpy as np
df = pd.concat(df_list)


In [43]:
var = 'source'
col_list = [var]+[cols for cols in df.columns if cols != var]


In [44]:
col_list

['source', 'sentence', 'label']

In [45]:
df = df[col_list]

In [46]:
df.head()

Unnamed: 0,source,sentence,label
0,amazon,So there is no way for me to plug it in here i...,0
1,amazon,"Good case, Excellent value.",1
2,amazon,Great for the jawbone.,1
3,amazon,Tied to charger for conversations lasting more...,0
4,amazon,The mic is great.,1


#### we  can use the CountVectorizer provided by the scikit-learn library to vectorize sentences.

In [130]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
vectorizer = CountVectorizer(min_df=0, lowercase=False)
df_amazon = df[df.source == 'amazon']

sentence_amazon = df_amazon.sentence.values
label_amazon = df_amazon.label.values

sentence_vector = vectorizer.fit_transform(sentence_amazon)
sentence_vector.shape

(1000, 2167)

In [131]:
X_train,X_test,y_train,y_test = train_test_split(sentence_vector,label_amazon,test_size=0.2,random_state=1)

In [132]:
X_train[:5]

<5x2167 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [133]:
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(800, 2167) (200, 2167)
(800,) (200,)


In [134]:
from sklearn.linear_model import LogisticRegression
classifier  = LogisticRegression()
classifier.fit(X_train,y_train)
print('Accuracy using Countvectorizer in scikitlearn  : ',classifier.score(X_test,y_test))

Accuracy using Countvectorizer in scikitlearn  :  0.775


# Using NLTK

In [136]:
import nltk
#nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer

#### CountVectorizer can lowercase letters, disregard punctuation and stopwords, but it can't LEMMATIZE or STEM


#### Initialize a CountVectorizer object: count_vectorizer
count_vec = CountVectorizer(stop_words="english", analyzer='word', 
                            ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)

#### Min_df¶
**Min_df ignores terms that have a document frequency (presence in % of documents) strictly lower than the given threshold. For example, Min_df=0.66 requires that a term appear in 66% of the docuemnts for it to be considered part of the vocabulary.

Sometimes min_df is used to limit the vocabulary size, so it learns only those terms that appear in at least 10%, 20%, etc. of the documents.


#### Max_df
When building the vocabulary, it ignores terms that have a document frequency strictly higher than the given threshold. This could be used to exclude terms that are too frequent and are unlikely to help predict the label. For example, by analyzing reviews on the movie Lion King, the term 'Lion' might appear in 90% of the reviews (documents), in which case, we could consider establishing Max_df=0.89

In [152]:
vectors = CountVectorizer()
wordnet = WordNetLemmatizer()
sentence_filter = [wordnet.lemmatize(word) for word in sentence_amazon]
sentence_vector1 =  vectors.fit_transform(sentence_filter)
sentence_vector1.shape

(1000, 1847)

In [153]:
sentence_filter[:5]

['So there is no way for me to plug it in here in the US unless I go by a converter.',
 'Good case, Excellent value.',
 'Great for the jawbone.',
 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
 'The mic is great.']

In [154]:
X_train,X_test,y_train,y_test = train_test_split(sentence_vector1,label_amazon,test_size=0.2,random_state=1)


In [155]:
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(800, 1847) (200, 1847)
(800,) (200,)


In [156]:
from sklearn.linear_model import LogisticRegression
classifier  = LogisticRegression()
classifier.fit(X_train,y_train)
print('Accuracy using NLTK: ',classifier.score(X_test,y_test))

Accuracy using NLTK:  0.805


# Using KERAS

In [158]:
import keras
from keras import Sequential
from keras.layers import Dense
input_dim = X_train.shape[1]  # Number of features
model = Sequential()
# Adding the input layer and the first hidden layer
model.add(Dense(activation="relu", input_dim=input_dim, units=6, kernel_initializer="uniform"))

# Adding the second hidden layer
model.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))

# Adding the output layer
model.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))


# Compiling the ANN
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])


In [160]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 6)                 11088     
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 11,137
Trainable params: 11,137
Non-trainable params: 0
_________________________________________________________________


In [168]:
#Fitting the ANN to the Training set
model.fit(X_train, y_train, batch_size = 10, nb_epoch = 100)

  


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1b007da0>

In [169]:
predictions = model.predict(X_test)

In [170]:
y_pred = []
for i in predictions:
    if i > 0.5:
        y_pred.append(1)
    else:
        y_pred.append(0)

In [171]:
from sklearn.metrics import accuracy_score
print('Accuracy Score : ',accuracy_score(y_test,y_pred))

Accuracy Score :  0.83


In [172]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

Training Accuracy: 1.0000


In [173]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.8300
