In [1]:
# Natural Language Processing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
df = pd.read_csv('sentiment.tsv', delimiter = '\t', header=None,names = ["sentiment", "text"])
df.head()

Unnamed: 0,sentiment,text
0,neg,"@jamielewislewis i cant believe it, it really ..."
1,pos,having a vodka tonic and looking forward to go...
2,pos,@ddlovatofans1neg1 Could you follow me please....
3,pos,@jordanknight for once.................. PLEAS...
4,neg,Had a dream about a walk in fast food resturau...


In [2]:
# This Python file uses the following encoding: utf-8
import re

# Hashtags
hash_regex = re.compile(r"#(\w+)")
def hash_repl(match):
    return '__HASH_'+match.group(1).upper()    

# Handels
hndl_regex = re.compile(r"@(\w+)")
def hndl_repl(match):
    return '__HNDL'#_'+match.group(1).upper()    

# URLs
url_regex = re.compile(r"(http|https|ftp)://[a-zA-Z0-9\./]+")

# Spliting by word boundaries
word_bound_regex = re.compile(r"\W+")

# Repeating words like hurrrryyyyyy
rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE);
def rpt_repl(match):
    return match.group(1)+match.group(1)

# Emoticons
emoticons = \
    [('__EMOT_SMILEY',	[':-)', ':)', '(:', '(-:', ] )	,\
        ('__EMOT_LAUGH',		[':-D', ':D', 'X-D', 'XD', 'xD', ] )	,\
        ('__EMOT_LOVE',		['<3', ':\*', ] )	,\
        ('__EMOT_WINK',		[';-)', ';)', ';-D', ';D', '(;', '(-;', ] )	,\
        ('__EMOT_FROWN',		[':-(', ':(', '(:', '(-:', ] )	,\
        ('__EMOT_CRY',		[':,(', ':\'(', ':"(', ':(('] )	,\
    ]

#For emoticon regexes
def escape_paren(arr):
    return [text.replace(')', '[)}\]]').replace('(', '[({\[]') for text in arr]
def regex_union(arr):
    return '(' + '|'.join( arr ) + ')'
emoticons_regex = [ (repl, re.compile(regex_union(escape_paren(regx))) ) \
                    for (repl, regx) in emoticons ]

def processAll(text):

    text = re.sub( hash_regex, hash_repl, text )
    text = re.sub( hndl_regex, hndl_repl, text )
    text = re.sub( url_regex, ' __URL ', text )

    for (repl, regx) in emoticons_regex :
        text = re.sub(regx, ' '+repl+' ', text)

    text = text.replace('\'','')
    text = re.sub( word_bound_regex ,' ', text )
    text = re.sub( rpt_regex, rpt_repl, text )
    return text

In [3]:
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer


corpus = []
for i in range(0, len(df)):
#     tokens = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    tokens = df['text'][i]
    tokens = tokens.lower()
    tokens = processAll(tokens)
    
    tokenizer = RegexpTokenizer("[\w']+")
    tokens = tokenizer.tokenize(tokens)
    
    tokens = [word for word in tokens if len(word) >= 3]    
    tokens = [word for word in tokens if not word in set(stopwords.words('english'))]
    
#   stemmer = PorterStemmer()
#   stemmer = LancasterStemmer()
#   tokens = [stemmer.stem(word) for word in tokens]    
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    
#     replacer = SpellingReplacer()
#     tokens = [replacer.replace(word) for word in tokens]
        
    tokens = ' '.join(tokens)
    corpus.append(tokens)
    

[nltk_data] Downloading package stopwords to /opt/pynb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /opt/pynb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 3),token_pattern=r'\b\w+\b', min_df=5,max_features = 1500)
X = vectorizer.fit_transform(corpus).toarray()

In [5]:
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(max_features = 1500)
# X = cv.fit_transform(corpus).toarray()

In [6]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer()
# features = vectorizer.fit_transform(corpus)
# X = features.toarray()

In [7]:
i = 1000
j = 10
vectorizer.get_feature_names( )[i:i+10]

[]

In [8]:
feature_names = vectorizer.get_feature_names( )
print(feature_names[::10])

['1neg', '__hndl aww', '__hndl hey', '__hndl sorry', '__hndl wow', 'age', 'always', 'asleep', 'back work', 'best', 'bored', 'btw', 'cant wait', 'chill', 'coming', 'crazy', 'day today', 'dont', 'early', 'ever', 'fan', 'final', 'follow', 'free', 'get back', 'going bed', 'graduation', 'hanging', 'hear', 'homework', 'husband', 'ive', 'last', 'let', 'local', 'love', 'man', 'minute', 'mother', 'name', 'okay', 'party', 'play', 'project', 'ready', 'room', 'second', 'shopping', 'sister', 'soon', 'starting', 'summer', 'talking', 'theyre', 'time', 'trip', 'ugh', 'wake', 'way', 'win', 'world', 'yesterday']


In [9]:
# prop.table(table(df$sentiment))
yes_no_cols = ["sentiment"]
df[yes_no_cols] = df[yes_no_cols] == 'pos'
y = df.iloc[:, 0].values

In [10]:
from sklearn.metrics import accuracy_score as accuracy
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# from sklearn.naive_bayes import GaussianNB
# from sklearn import linear_model
# from sklearn import tree
# from sklearn import svm
# from sklearn import ensemble
# from sklearn import neighbors

kf = KFold(len(y),n_folds=10,shuffle=True)
kf2 = StratifiedKFold(y,n_folds=10,shuffle=True)


results = cross_val_score(LogisticRegression(), X = X, y = y, scoring = "roc_auc", cv = kf)
print("LogisticRegression Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))



LogisticRegression Accuracy: 0.749 (0.025)


In [11]:
from keras.optimizers import SGD
from keras.optimizers import RMSprop
from keras.layers.core import Activation, Dense, Dropout
from keras.models import Sequential

def DefineModel1():    
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=615))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

Using TensorFlow backend.


In [12]:
from keras.layers import Dense, Embedding, LSTM
top_words = 5000

def DefineModel2():    
    # create the model
    embedding_vecor_length = 32
    model = Sequential()
    model.add(Embedding(top_words, embedding_vecor_length, input_length=615))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])    
    return model  
    

In [17]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [13]:
from keras.layers import Dense, Embedding, LSTM
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

top_words = 5000

def DefineModel3():    
    # create the model
    model = Sequential()
    model.add(Embedding(top_words, 32, input_length=615))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  
    return model  
    

In [14]:
model = DefineModel1()     
model.fit(X, y,epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb403f7f2b0>

In [15]:
model = DefineModel2()     
model.fit(X, y,epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb43b74ddd8>

In [18]:
model = DefineModel3()     
model.fit(X, y,epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb38c16e5c0>

In [19]:
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score as accuracy
from keras.wrappers.scikit_learn import KerasClassifier
import numpy

# create model
model = KerasClassifier(build_fn=DefineModel3, epochs=50, batch_size=32, verbose=0)

# evaluate using 10-fold cross validation
# kf = KFold(len(y),n_folds=10,shuffle=True)
kf2 = StratifiedKFold(y,n_folds=10,shuffle=True, random_state=7)
results = cross_val_score(model, X, y,scoring = "roc_auc", cv=kf2)
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))

Accuracy: 0.715 (0.030)


# Summary of roc_auc:
KerasClassifier: Accuracy: 0.713 (0.038)
LogisticRegression Accuracy: 0.751 (0.017)
SGDClassifier Accuracy: 0.723 (0.018)