In [None]:
# text preprocessing
lem = WordNetLemmatizer()

# Built once: the stopword list used to be re-read on every call and scanned
# as a list for every word. A frozenset gives constant-time membership tests.
_STOPWORDS = frozenset(stopwords.words("english"))
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def _clean(text):
    """Normalise one raw document for vectorisation.

    Steps, in order: lower-case the text, delete every character found in
    ``string.punctuation``, split on whitespace, drop English stopwords,
    lemmatise each remaining word as a verb and then as a noun, and join the
    result with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned, space-joined text (may be empty).
    """
    ## lower case, then remove punctuations
    stripped = text.lower().translate(_PUNCT_TABLE)

    ## remove stopwords
    kept = [word for word in stripped.split() if word not in _STOPWORDS]

    ## normalization - lemmatization (verb pass first, then noun pass)
    lemmas = [lem.lemmatize(lem.lemmatize(word, "v"), "n") for word in kept]

    ## join
    return " ".join(lemmas)

# Clean every document with `_clean`. The original called `clean_text`, a name
# that is never defined in this notebook, so the cell raised NameError.
data["cleaned"] = data["text"].apply(_clean)
data.head()

In [None]:
# Try the cleaner on a single hand-written sentence and display the result.
_clean("I will by playing a game today !! ")

In [None]:
# Store the cleaned version of every document in the "cleaned" column.
data["cleaned"] = data["text"].apply(_clean)
# Show the first rows of the frame.
data.head()

In [None]:
## feature engineering

## meta features: simple per-document counts

# whitespace-separated token counts, before and after cleaning
data["word_count"] = data["text"].apply(lambda doc: len(doc.split()))
data["word_count_cleand"] = data["cleaned"].apply(lambda doc: len(doc.split()))

# character counts, with and without literal space characters
data["char_count"] = data["text"].apply(len)
data["char_count_without_spaces"] = data["text"].apply(lambda doc: len(doc.replace(" ", "")))

# number of whitespace-separated tokens that consist only of digits
data["num_dig"] = data["text"].apply(lambda doc: sum(tok.isdigit() for tok in doc.split()))

In [None]:
# Show the first rows of the frame, including the count columns added above.
data.head()

In [None]:
# Penn Treebank tags grouped by part-of-speech family. "VBP" (non-3rd-person
# singular present, e.g. "are", "play") was missing from the verb family, so
# those verbs were never counted.
pos_dic = {
    "noun": ["NNP", "NN", "NNS", "NNPS"],
    "verb": ["VBZ", "VB", "VBD", "VBG", "VBN", "VBP"],
}
import nltk


def pos_check(txt, family):
    """Count the tokens of `txt` whose POS tag belongs to `family`.

    Parameters
    ----------
    txt : str
        Text to tokenise with ``nltk.word_tokenize`` and tag with
        ``nltk.pos_tag``.
    family : str
        Key of ``pos_dic`` ("noun" or "verb").

    Returns
    -------
    int
        Number of tagged tokens whose tag is listed under ``pos_dic[family]``.

    Raises
    ------
    KeyError
        If `family` is not a key of ``pos_dic`` and `txt` yields at least one
        token.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(txt))
    return sum(1 for _token, tag in tagged if tag in pos_dic[family])

# example call: pos_check("They are playing in the ground", "verb")

# Per-document noun and verb counts taken from the raw text; the family name
# is forwarded to pos_check as its second positional argument.
data["noun_count"] = data["text"].apply(pos_check, args=("noun",))
data["verb_count"] = data["text"].apply(pos_check, args=("verb",))

In [None]:
# Show the first rows of the frame, including the noun/verb count columns.
data.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words term counts over the cleaned documents; fit_transform learns
# the vocabulary and builds the document-term matrix in one call.
cvz = CountVectorizer()
count_vectors = cvz.fit_transform(data["cleaned"].values)

In [None]:
# Display the count matrix produced by the CountVectorizer.
count_vectors

In [None]:
# Word-level TF-IDF, limited to 500 features.
word_tfidf = TfidfVectorizer(max_features=500)
word_vectors_tfidf = word_tfidf.fit_transform(data["cleaned"].values)

In [None]:
# TF-IDF over unigrams and bigrams, limited to 500 features.
ngram_tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
ngram_tfidf_tfidf = ngram_tfidf.fit_transform(data["cleaned"].values)

In [None]:
# Character-level TF-IDF, limited to 500 features.
char_tfidf = TfidfVectorizer(max_features=500, analyzer="char")
char_tfidf_tfidf = char_tfidf.fit_transform(data["cleaned"].values)

In [None]:
# Map every vocabulary term to its learned IDF weight. `vocabulary_`
# (term -> column index) replaces `get_feature_names()`, which was removed in
# scikit-learn 1.2; sorting by column index keeps the same term order.
tfidf = {
    term: word_tfidf.idf_[col]
    for term, col in sorted(word_tfidf.vocabulary_.items(), key=lambda item: item[1])
}
# One row per term, single column holding the IDF value.
tfidf_idf = pd.DataFrame.from_dict(tfidf, orient="index")
tfidf_idf.columns = ["word_tfidf"]
tfidf_idf.head()

In [None]:
from scipy.sparse import hstack, csr_matrix

# Hand-crafted numeric columns that get appended to the TF-IDF matrix.
meta_features = [
    "word_count",
    "word_count_cleand",
    "char_count",
    "char_count_without_spaces",
    "num_dig",
    "noun_count",
    "verb_count",
]
feature_set1 = data[meta_features]

# Column-wise concatenation of both feature groups, returned in CSR format.
train = hstack([word_vectors_tfidf, csr_matrix(feature_set1)], format="csr")
train

In [None]:
from sklearn.preprocessing import LabelEncoder 

# Encode the values of the "label" column as integer class ids.
target = LabelEncoder().fit_transform(data["label"].values)

In [None]:
# Display the encoded label array.
target

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed (arbitrary value) so the split, and every score computed from it,
# is the same on each Restart & Run All.
train_x, val_x, train_y, val_y = train_test_split(train, target, random_state=42)

In [None]:
# Dimensions of the training feature matrix.
train_x.shape

In [None]:
# Dimensions of the validation feature matrix.
val_x.shape

In [None]:
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm 
from sklearn import ensemble
from sklearn.metrics import accuracy_score

In [None]:
# Multinomial Naive Bayes with default settings; fit() returns the estimator.
model = naive_bayes.MultinomialNB().fit(train_x, train_y)
preds = model.predict(val_x)
# Accuracy is symmetric in its two arguments, so the order does not affect the score.
accuracy_score(preds, val_y)

In [None]:
# Logistic regression with default settings, scored on the validation split.
model = LogisticRegression().fit(train_x, train_y)
preds = model.predict(val_x)
accuracy_score(preds, val_y)

In [None]:
# Support vector classifier with default settings, scored on the validation split.
model = svm.SVC().fit(train_x, train_y)
preds = model.predict(val_x)
accuracy_score(preds, val_y)

In [None]:
# Extra-trees ensemble with default settings, scored on the validation split.
model = ensemble.ExtraTreesClassifier().fit(train_x, train_y)
preds = model.predict(val_x)
accuracy_score(preds, val_y)

In [None]:
import xgboost

# Gradient-boosted trees on the same split as the other classical models.
# The original used trainx/trainy/valx/valy, which are not assigned until a
# later cell, so this cell raised NameError on a fresh run.
model = xgboost.XGBClassifier()
model.fit(train_x, train_y)
preds = model.predict(val_x)
accuracy_score(preds, val_y)

In [None]:
import numpy as np

# Load pre-trained word vectors: each data line is a token followed by its
# float components. The file is opened in a `with` block so it is closed
# once parsing finishes.
embeddings_index = {}
with open("pretrained.vec", encoding="utf8") as vec_file:
    for i, line in enumerate(vec_file):
        if i == 0:
            # The first line is skipped (presumably a header row; confirm for this file).
            continue
        values = line.split()
        if not values:
            # Blank line: nothing to index.
            continue
        # The original assigned `value` but read `values`, raising NameError.
        embeddings_index[values[0]] = np.array(values[1:], dtype="float32")

In [None]:
from keras.preprocessing import text, sequence 


# Build the word -> integer index over every raw document.
token = text.Tokenizer()
token.fit_on_texts(data["text"])
word_index = token.word_index

# Fixed seed (arbitrary value) so the split is the same on every run.
trainx, valx, trainy, valy = train_test_split(data["text"], target, random_state=42)

# Convert each document to a sequence of word indices of length 70.
trainx = sequence.pad_sequences(token.texts_to_sequences(trainx), maxlen=70)
valx = sequence.pad_sequences(token.texts_to_sequences(valx), maxlen=70)

# Row i holds the pre-trained vector of the word whose tokenizer index is i;
# words absent from embeddings_index keep an all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_val, valid_y):
    """Fit `classifier`, predict on the validation features and return accuracy.

    The raw output of ``classifier.predict`` is converted to class labels
    before scoring:

    * last axis of size 1 (e.g. a single sigmoid unit): thresholded at 0.5 and
      flattened. The original applied ``argmax`` here, which always returns 0
      for a single column and so predicted class 0 for every sample.
    * last axis larger than 1 on a multi-dimensional output: ``argmax`` over
      the last axis.
    * 1-D output: used unchanged.

    Parameters
    ----------
    classifier : object with ``fit(X, y)`` and ``predict(X)``
    feature_vector_train, label : training features and labels
    feature_vector_val : validation features
    valid_y : validation labels

    Returns
    -------
    The value returned by ``accuracy_score(predictions, valid_y)``.
    """
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_val)
    if predictions.ndim > 1:
        if predictions.shape[-1] == 1:
            predictions = (predictions > 0.5).astype("int32").ravel()
        else:
            predictions = predictions.argmax(axis=-1)
    return accuracy_score(predictions, valid_y)

In [None]:
from keras import layers, models, optimizers 

def create_cnn():
    """Build and compile the 1-D convolutional text classifier.

    Architecture: input of length 70 -> frozen embedding initialised from
    ``embedding_matrix`` (300 dimensions) -> convolution with 100 filters of
    width 3 (ReLU) -> global max pooling -> dense 50 (ReLU) -> dropout 0.25
    -> dense 1 (sigmoid). Compiled with Adam and binary cross-entropy.

    Returns
    -------
    The compiled Keras model.
    """
    vocab_rows = len(word_index) + 1

    tokens_in = layers.Input((70, ))
    embedded = layers.Embedding(vocab_rows, 300, weights=[embedding_matrix], trainable=False)(tokens_in)
    convolved = layers.Convolution1D(100, 3, activation="relu")(embedded)
    pooled = layers.GlobalMaxPool1D()(convolved)

    hidden = layers.Dense(50, activation="relu")(pooled)
    hidden = layers.Dropout(0.25)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    cnn = models.Model(inputs=tokens_in, outputs=probability)
    cnn.compile(optimizer=optimizers.Adam(), loss="binary_crossentropy")
    return cnn

In [None]:
# Build the CNN, then fit it on the padded training sequences and score it
# against the validation sequences.
classifier = create_cnn()
train_model(classifier, trainx, trainy, valx, valy)

In [None]:
## feature engineering

## meta features
# replace missing cleaned text with an empty string before counting
data["cleaned"] = data["cleaned"].fillna("")

# counts taken from the raw text: all-digit tokens and all-uppercase tokens
data["digit_count"] = data["text"].apply(lambda doc: sum(tok.isdigit() for tok in doc.split()))
data["upper_count"] = data["text"].apply(lambda doc: sum(tok.isupper() for tok in doc.split()))

# counts taken from the cleaned text
data["word_count"] = data["cleaned"].apply(lambda doc: len(doc.split()))
data["char_count"] = data["cleaned"].apply(len)
data["char_nospace_count"] = data["cleaned"].apply(lambda doc: len(doc.replace(" ", "")))

data

In [None]:
## nlp based features

# POS tags grouped by part-of-speech family
pos_dic = dict(
    noun=["NN", "NNS", "NNP", "NNPS"],
    pron=["PRP", "PRP$", "WP", "WP$"],
    verb=["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
    adj=["JJ", "JJR", "JJS"],
    adv=["RB", "RBR", "RBS", "WRB"],
)


def pos_check(x, flag):
    """Return how many tokens of `x` carry a POS tag listed in ``pos_dic[flag]``.

    `x` is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; `flag` is one of the keys of ``pos_dic``.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(x))
    return sum(1 for _token, tag in tagged_tokens if tag in pos_dic[flag])

# Per-document noun and verb counts taken from the cleaned text; the family
# name is forwarded to pos_check as its second positional argument.
data["noun_count"] = data["cleaned"].apply(pos_check, args=("noun",))
data["verb_count"] = data["cleaned"].apply(pos_check, args=("verb",))
data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level term counts over the cleaned documents; fit_transform learns the
# vocabulary and builds the document-term matrix in one call.
cvz = CountVectorizer(analyzer="word")
count_vectors = cvz.fit_transform(data["cleaned"].values)

In [None]:
# Display the count matrix produced by the CountVectorizer.
count_vectors

In [None]:
cleaned_docs = data["cleaned"].values

# word-level TF-IDF (single words)
word_tfidf = TfidfVectorizer(analyzer="word")
word_vectors_tfidf = word_tfidf.fit_transform(cleaned_docs)

# word-level TF-IDF over 1- to 3-grams
ngram_tfidf = TfidfVectorizer(analyzer="word", ngram_range=(1, 3))
ngarm_vectors_tfidf = ngram_tfidf.fit_transform(cleaned_docs)

# character-level TF-IDF over 1- to 3-grams
char_tfidf = TfidfVectorizer(analyzer="char", ngram_range=(1, 3))
char_vectors_tfidf = char_tfidf.fit_transform(cleaned_docs)

In [None]:
# Map every vocabulary term to its learned IDF weight. `vocabulary_`
# (term -> column index) replaces `get_feature_names()`, which was removed in
# scikit-learn 1.2; sorting by column index keeps the same term order.
idf_by_term = {
    term: word_tfidf.idf_[col]
    for term, col in sorted(word_tfidf.vocabulary_.items(), key=lambda item: item[1])
}
# from_dict is a classmethod, so it is called on the class rather than on a
# throwaway DataFrame instance as before.
tfidf = pd.DataFrame.from_dict(idf_by_term, orient='index')
tfidf.columns = ['title_word_tfidf']
# Terms with the highest IDF values first.
tfidf.sort_values(by=['title_word_tfidf'], ascending=False).head()

In [None]:
# Same descending sort as above, but showing the last rows (lowest values).
tfidf.sort_values(by=['title_word_tfidf'], ascending=False).tail()

In [None]:
from scipy.sparse import hstack, csr_matrix

# Hand-crafted numeric columns that get appended to the TF-IDF matrix.
meta_features = [
    "digit_count",
    "upper_count",
    "word_count",
    "char_count",
    "char_nospace_count",
    "noun_count",
    "verb_count",
]
feature_set1 = data[meta_features]
# Column-wise concatenation of both feature groups, returned in CSR format.
train = hstack([word_vectors_tfidf, csr_matrix(feature_set1)], format="csr")

In [None]:
# Display the combined sparse feature matrix.
train

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the values of the 'label' column as integer class ids.
target = LabelEncoder().fit_transform(data["label"].values)

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed (arbitrary value) so the split, and every score computed from it,
# is the same on each Restart & Run All.
trainx, valx, trainy, valy = train_test_split(train, target, random_state=42)

In [None]:
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import ensemble
# import xgboost
from sklearn.metrics import accuracy_score

In [None]:
## NaiveBayes
# Default settings; fit() returns the estimator itself.
model = naive_bayes.MultinomialNB().fit(trainx, trainy)
preds = model.predict(valx)
# Accuracy is symmetric in its two arguments, so the order does not affect the score.
accuracy_score(preds, valy)

In [None]:
# Logistic regression with default settings, scored on the validation split.
model = LogisticRegression().fit(trainx, trainy)
preds = model.predict(valx)
accuracy_score(preds, valy)

In [None]:
# Support vector classifier with default settings, scored on the validation split.
model = svm.SVC().fit(trainx, trainy)
preds = model.predict(valx)
accuracy_score(preds, valy)

In [None]:
# Extra-trees ensemble with default settings, scored on the validation split.
model = ensemble.ExtraTreesClassifier().fit(trainx, trainy)
preds = model.predict(valx)
accuracy_score(preds, valy)

In [None]:
# load the pre-trained word-embedding vectors
import numpy as np 
# token -> float32 vector, parsed from whitespace-separated lines
embeddings_index = {}
# `with` closes the file once parsing finishes (the handle was never closed before).
with open('pretrained.vec', encoding="utf8") as vec_file:
    for line_no, line in enumerate(vec_file):
        values = line.split()
        # Skip blank lines, and a two-token first line (a "<count> <dim>"
        # style header), which would otherwise be stored as a bogus
        # one-component vector.
        if not values or (line_no == 0 and len(values) == 2):
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [None]:
# create a tokenizer 
from keras.preprocessing import text, sequence
from sklearn import model_selection 

# build the word -> integer index over every raw document
token = text.Tokenizer()
token.fit_on_texts(data['text'])
word_index = token.word_index

# split the raw text, then convert each document to a sequence of word indices
# of length 70; the seed (arbitrary value) keeps the split identical across runs
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(data['text'], target, random_state=42)
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

# create token-embedding mapping: row i holds the pre-trained vector of the
# word whose tokenizer index is i; unknown words keep an all-zero row
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Dimensions of the training text split.
train_x.shape

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_labels=None):
    """Fit `classifier`, predict on the validation features and return accuracy.

    Parameters
    ----------
    classifier : object with ``fit(X, y)`` and ``predict(X)``
    feature_vector_train, label : training features and labels
    feature_vector_valid : validation features
    is_neural_net : bool, default False
        When True, the raw network output is converted to class labels: an
        output whose last axis has size 1 (single sigmoid unit) is thresholded
        at 0.5 and flattened; any other output is reduced with ``argmax`` over
        the last axis. The original always used ``argmax``, which returns 0
        for a single column and so predicted class 0 for every sample.
    valid_labels : array-like, optional
        Validation labels. When omitted, the notebook-level ``valid_y`` is
        used, as before.

    Returns
    -------
    The value returned by ``metrics.accuracy_score``.
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        if predictions.ndim > 1 and predictions.shape[-1] == 1:
            predictions = (predictions > 0.5).astype("int32").ravel()
        else:
            predictions = predictions.argmax(axis=-1)

    if valid_labels is None:
        # fall back to the global validation labels for existing callers
        valid_labels = valid_y

    return metrics.accuracy_score(predictions, valid_labels)

In [None]:
from keras import layers , models , optimizers
from sklearn import metrics

def create_cnn():
    """Build and compile the 1-D convolutional text classifier.

    Architecture: input of length 70 -> frozen embedding initialised from
    ``embedding_matrix`` (300 dimensions) -> spatial dropout 0.3 ->
    convolution with 100 filters of width 3 (ReLU) -> global max pooling ->
    dense 50 (ReLU) -> dropout 0.25 -> dense 1 (sigmoid). Compiled with Adam
    and binary cross-entropy.

    Returns
    -------
    The compiled Keras model.
    """
    vocab_rows = len(word_index) + 1

    # input and frozen pre-trained embedding
    tokens_in = layers.Input((70, ))
    embedded = layers.Embedding(vocab_rows, 300, weights=[embedding_matrix], trainable=False)(tokens_in)
    embedded = layers.SpatialDropout1D(0.3)(embedded)

    # convolution followed by global max pooling
    convolved = layers.Convolution1D(100, 3, activation="relu")(embedded)
    pooled = layers.GlobalMaxPool1D()(convolved)

    # dense head ending in a single sigmoid unit
    hidden = layers.Dense(50, activation="relu")(pooled)
    hidden = layers.Dropout(0.25)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    cnn = models.Model(inputs=tokens_in, outputs=probability)
    cnn.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return cnn

# Build the CNN, fit it on the padded training sequences, score it on the
# padded validation sequences and print the resulting accuracy.
classifier = create_cnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("CNN, Word Embeddings",  accuracy)