In [6]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import string
import io
# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE: this rebinds `stopwords` from the corpus reader to the plain list of English stop words;
# the helpers below rely on the list.
stopwords = stopwords.words('english')
# English Snowball stemmer shared by the text-cleaning helpers.
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/agustin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Feature engineering

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The VADER lexicon has to be downloaded before the analyzer is instantiated.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()


def return_sia_compound_values(text):
    """Return the 'compound' entry of VADER's polarity scores for ``text``."""
    scores = sia.polarity_scores(text)
    return scores['compound']

In [None]:
def remove_stopword(text):
    """Filter a sequence of tokens and join the survivors with single spaces.

    ``text`` is iterated token by token (callers pass the result of
    ``str.split()``). A token is kept only when it is not in the notebook-level
    ``stopwords`` list and ``str.isalpha()`` is true for it. The stop-word
    comparison is case-sensitive.
    """
    kept = [token for token in text if token not in stopwords and token.isalpha()]
    return " ".join(kept)

def stemm(text):
    """Stem each whitespace-separated word of ``text`` with the shared ``stemmer``.

    Returns the stemmed words joined by single spaces.
    """
    return " ".join([stemmer.stem(token) for token in text.split()])

def contains_punctuation(text):
    """Return True when at least one element of ``text`` is a ``string.punctuation`` character."""
    # isdisjoint() walks `text` and stops at the first element found in the set.
    return not set(string.punctuation).isdisjoint(text)

def amount_of_punctuation(text):
    """Count how many characters of ``text`` belong to ``string.punctuation``."""
    marks = frozenset(string.punctuation)
    return sum(1 for ch in text if ch in marks)

In [None]:
# Training data: only the id, text and target columns are read.
tweets = pd.read_csv("train.csv", usecols=['id','text', 'target'])
# Test file, read with all of its columns.
test = pd.read_csv("test.csv")

In [None]:
# keep=False removes every row whose text occurs more than once (no copy of a repeated tweet survives).
# The rows that remain keep their original index labels, so the index is no longer contiguous.
tweets.drop_duplicates(subset = 'text', keep = False, inplace = True)
tweets.info()

In [None]:
# Work on an explicit copy: the column assignments below would otherwise target a
# slice of `tweets` and trigger pandas' SettingWithCopyWarning.
tweets_metrics = tweets[['id','text','target']].copy()

# Whitespace tokenisation, computed once and reused by several features.
tokens = tweets_metrics['text'].str.split()

# Column order matters: later cells select the engineered features by position.
tweets_metrics['text_without_stopwords'] = tokens.apply(remove_stopword)

# Character count of the raw tweet.
tweets_metrics['length'] = tweets_metrics['text'].apply(len)
# Mean token length (NaN for a tweet without tokens).
tweets_metrics['avg_word_length'] = tokens.apply(lambda words: np.mean([len(word) for word in words]))
tweets_metrics['amount_of_words'] = tokens.apply(len)
# A set gives the distinct-token count without building a pandas Series per row.
unique_words_by_tweet = tokens.apply(lambda words: len(set(words)))
tweets_metrics['amount_of_unique_words'] = unique_words_by_tweet
# VADER compound polarity of the raw text.
tweets_metrics['sentiment'] = tweets_metrics['text'].apply(return_sia_compound_values)
tweets_metrics['stopwords_count'] = tweets_metrics['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
tweets_metrics['punctuation_count'] = tweets_metrics['text'].apply(amount_of_punctuation)
# @mentions and #hashtags are located with the notebook's original patterns.
mentions = tweets_metrics['text'].str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()
tweets_metrics['mentions_count'] = mentions['text'].apply(len)
hashtags = tweets_metrics['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
tweets_metrics['hashtags_count'] = hashtags.apply(len)
# Longest remaining word after stop-word removal, ignoring tokens that start with 'http'; 0 when none remain.
tweets_metrics['longest_word_length_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(
    lambda x: max([len(word) for word in str(x).lower().split() if not word.startswith('http')], default=0)
)

tweets_metrics.head()

# Split de features

In [None]:
# Engineered numeric features used as model inputs.
data_cols = ['length','avg_word_length','amount_of_words','amount_of_unique_words','sentiment','stopwords_count','punctuation_count','mentions_count','hashtags_count','longest_word_length_without_stopwords']
# Same list without the mention and hashtag counts (not used in this cell).
basic_data_cols = ['length','avg_word_length','amount_of_words','amount_of_unique_words','sentiment','stopwords_count','punctuation_count','longest_word_length_without_stopwords']
# 75% / 25% hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = \
train_test_split(tweets_metrics[data_cols], tweets_metrics['target'], test_size = 0.25, random_state = 123)
# Scaling statistics are learned on the training part only and then applied to the test part.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# NOTE(review): the vectorizer is fitted on every tweet, before any train/test split,
# so the vocabulary and IDF weights also see the rows that later end up in the test part.
vectorizer = TfidfVectorizer()
tfidf_lgbm = vectorizer.fit_transform(tweets_metrics.loc[:, 'text'])
# Dense copy of the sparse TF-IDF matrix (one row per tweet, one column per vocabulary term).
array = tfidf_lgbm.todense()

In [None]:
# Row i of `array` is the TF-IDF vector of the i-th row of `tweets_metrics`, whose
# index has gaps after the duplicate removal. Reusing that index is required because
# both the label assignment and the merge below align on index labels; with a fresh
# 0..n-1 index each vector would be paired with a different tweet and rows would be lost.
df = pd.DataFrame(np.asarray(array), index=tweets_metrics.index)
df['output'] = tweets['target']
basic_features = tweets_metrics[['length','avg_word_length','amount_of_words','amount_of_unique_words','sentiment','stopwords_count','punctuation_count','mentions_count','hashtags_count','longest_word_length_without_stopwords']]
# One row per tweet: TF-IDF columns, the label, then the engineered features.
tfidf_features = df.merge(basic_features, left_index = True, right_index = True)
tfidf_features

In [None]:
# Every column except the label is a model input.
features = tfidf_features.columns.drop('output')
x = tfidf_features.loc[:, features].values
y = tfidf_features.loc[:, 'output'].values

from sklearn.model_selection import train_test_split
# 75% / 25% hold-out split with a fixed seed.
x_tfidf_train, x_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(x, y, test_size = 0.25, random_state = 123)

# Scaling statistics come from the training part only. NOTE: this rebinds the notebook-level `sc`.
sc = StandardScaler()
x_tfidf_train = sc.fit_transform(x_tfidf_train)
x_tfidf_test = sc.transform(x_tfidf_test)
print(x_tfidf_train.shape, x_tfidf_test.shape)

### Split de TF-IDF

In [None]:
# Hold-out split of the tweet text and its labels (75% train / 25% test, fixed seed).
x_text_train, x_text_test, y_text_train, y_text_test = \
train_test_split(tweets_metrics['text'], tweets_metrics['target'], test_size = 0.25, random_state = 123)

In [None]:
# The vectorizer is fitted on the training text only and then applied unchanged to the test text.
tfid_vectorizer = TfidfVectorizer()
train_vectors = tfid_vectorizer.fit_transform(x_text_train)
test_vectors = tfid_vectorizer.transform(x_text_test)
print(train_vectors.shape, test_vectors.shape)

### Split count vectorizer

In [None]:
# Word bigram counts only (ngram_range=(2, 2)); fitted on the training text, applied to the test text.
count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
count_train = count_vectorizer.fit_transform(x_text_train)
count_test = count_vectorizer.transform(x_text_test)
print(count_train.shape, count_test.shape, y_text_train.shape,y_text_test.shape )

### Split de Hash vectorizer

In [None]:
# HashingVectorizer is stateless (nothing is learned from the data), so the test text
# only needs transform(); this also matches how the other vectorizers treat the test split.
hash_vectorizer = HashingVectorizer()
hash_train = hash_vectorizer.fit_transform(x_text_train)
hash_test = hash_vectorizer.transform(x_text_test)
# with_mean=False: scale by the training variance without centering the sparse input.
# NOTE: this rebinds the notebook-level `sc`.
sc = StandardScaler(with_mean=False)
hash_train = sc.fit_transform(hash_train)
hash_test = sc.transform(hash_test)

# KNN : baseline
 *En principio usando un bootstrap del set de entrenamiento para medir accuracy. Se utilizó un 75% del set para entrenar y el 25% restante para la predicción. Se utilizó la representación TF-IDF para el entrenamiento ya que KNN funciona con valores numéricos.*
 
Methods

* fit(self, X, y) Fit the model using X as training data and y as target values

* get_params(self[, deep]) Get parameters for this estimator.

* kneighbors(self[, X, n_neighbors, …]) Finds the K-neighbors of a point.

* kneighbors_graph(self[, X, n_neighbors, mode]) Computes the (weighted) graph of k-Neighbors for points in X

* predict(self, X) Predict the class labels for the provided data.

* predict_proba(self, X) Return probability estimates for the test data X.

* score(self, X, y[, sample_weight]) Return the mean accuracy on the given test data and labels.

* set_params(self, \*\*params) Set the parameters of this estimator.

### Basic features

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Try k = 1..99 and remember the first k reaching the highest hold-out accuracy.
# The running best is kept in `best_score`: naming it `max` would shadow the builtin
# and break every later call to max() in this kernel.
# NOTE(review): k is chosen on the same split that is reported, so the printed score is optimistic.
optimo = 1
best_score = float('-inf')

for i in range(1, 100):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(x_train, y_train)
    actual = knn.score(x_test, y_test)
    # Strict comparison: ties keep the smaller k.
    if best_score < actual:
        optimo = i
        best_score = actual

print(optimo, best_score)

In [None]:
# Mean 40-fold cross-validated score on the training split for a fixed k.
# NOTE(review): k=43 is hard-coded — presumably the value printed by the search above; confirm.
knn = KNeighborsClassifier(n_neighbors=43)
all_accuracies = cross_val_score(estimator=knn, X=x_train, y=y_train, cv=40)
print(all_accuracies.mean())

### TF-IDF

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Try k = 1..99 on the TF-IDF vectors and remember the first k reaching the highest
# hold-out accuracy. The running best is kept in `best_score`: naming it `max` would
# shadow the builtin and break every later call to max() in this kernel.
optimo = 1
best_score = float('-inf')

for i in range(1, 100):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(train_vectors, y_text_train)
    actual = knn.score(test_vectors, y_text_test)
    # Strict comparison: ties keep the smaller k.
    if best_score < actual:
        optimo = i
        best_score = actual

print(optimo, best_score)  # earlier run: 0.7961269499731038 with k=49

### KNN con CountVectorizer

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Try k = 1..99 on the bigram counts and remember the first k reaching the highest
# hold-out accuracy. The running best is kept in `best_score`: naming it `max` would
# shadow the builtin and break every later call to max() in this kernel.
optimo = 1
best_score = float('-inf')

for i in range(1, 100):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(count_train, y_text_train)
    actual = knn.score(count_test, y_text_test)
    # Strict comparison: ties keep the smaller k.
    if best_score < actual:
        optimo = i
        best_score = actual

print(optimo, best_score)  # earlier run: 0.6778749159381304 with k=1 (garbage)

# XGBoost

### Basic features

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Earlier run: acc = 0.7305002689618074 with (objective ='binary:logistic', colsample_bytree = 0.6,
# learning_rate = 0.01, max_depth = 35, alpha = 0.5, n_estimators = 140)
xg_reg = xgb.XGBClassifier(objective ='binary:logistic',
                colsample_bytree = 0.6, learning_rate = 0.005,
                max_depth = 35,alpha = 0.5, n_estimators = 140)
xg_reg.fit(x_train,y_train)
# The classifier's predict() already returns class labels, so no 0.5 threshold pass is needed.
y_pred = xg_reg.predict(x_test)

# accuracy_score takes the true labels first, then the predictions.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

In [None]:
# 10-fold cross-validation of the classifier configuration on the training split.
kfold = KFold(n_splits=10)
results = cross_val_score(xg_reg, x_train, y_train, cv=kfold)
# NOTE: this bare expression is not the last statement of the cell, so the notebook does not display it.
results

import matplotlib.pyplot as plt

# Larger default figure size for the importance plot (applies to later figures too).
plt.rcParams['figure.figsize'] = [12, 7]
# Feature-importance plot for `xg_reg`.
xgb.plot_importance(xg_reg)

### TF-IDF

Búsqueda de parámetros

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
param_grid = dict(learning_rate = np.arange(0.001,0.1,0.005),
                              n_estimators = np.arange(15,300,15),
                              scale_pos_weight = np.arange(2,6,1),
                              max_depth = np.arange(15,40,2),min_child_weight= np.arange(1,10,1),
                              gamma = np.arange(0,1,0.1), alpha= np.arange(0.1,1,0.1),
                              # subsample is a row fraction in (0, 1]; the grid runs 0.1 .. 0.9 and
                              # no longer contains 0, which would leave no rows to grow a tree on.
                              subsample = np.arange(1,10)/10, colsample_bytree = np.arange(0.5,0.8,0.05),
                              colsample_bylevel = np.arange(0.6,0.91,0.05))
xgb_model = xgb.XGBClassifier()

# 20 random configurations, each scored with 4-fold cross-validation on the TF-IDF training vectors.
grid = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                              cv=4, verbose=2, n_iter=20, n_jobs=1)
grid_result = grid.fit(train_vectors,y_text_train)
params_xgb_tfidf = grid_result.best_params_
print("Best parameters: ", params_xgb_tfidf)

In [None]:
# Refit a classifier with the best parameters from the search and score it on the hold-out text split.
xg_reg = xgb.XGBClassifier(**params_xgb_tfidf)
xg_reg.fit(train_vectors,y_text_train)
# The classifier's predict() already returns class labels, so no 0.5 threshold pass is needed.
preds = xg_reg.predict(test_vectors)

from sklearn.metrics import accuracy_score
# accuracy_score takes the true labels first, then the predictions.
accuracy = accuracy_score(y_text_test, preds)
print(accuracy)

### CountVectorizer

Búsqueda de parámetros

In [None]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = dict(learning_rate = np.arange(0.01,0.5,0.02),
                              n_estimators = np.arange(15,300,15),
                              scale_pos_weight = np.arange(2,6,1),
                              max_depth = np.arange(15,40,2),min_child_weight= np.arange(1,10,1),
                              gamma = np.arange(0,0.5,0.1), alpha= np.arange(0.1,1,0.1),
                              subsample = np.arange(0.6,1,0.1), colsample_bytree = np.arange(0.5,0.8,0.05),
                              colsample_bylevel = np.arange(0.6,0.91,0.05))
classifier = xgb.XGBClassifier()

grid = RandomizedSearchCV(estimator=classifier, param_distributions=param_grid,
                              cv=4, verbose=2, n_iter=20, n_jobs=1)
grid_result = grid.fit(count_train,y_text_train)
params_xgb_count = grid_result.best_params_
print("Best parameters: ", params_xgb_count)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
# Best parameters found in an earlier run:
# (objective ='reg:logistic', colsample_bytree = 0.45, colsample_bylevel = 0.4, learning_rate = 0.05, max_depth = 25, min_child_weight = 1.1, alpha = 0.5, gamma =  0.4, n_estimators = 210)
# NOTE(review): the parameters come from a search over XGBClassifier but are applied to a regressor — confirm this is intended.
xg_reg = xgb.XGBRegressor(**params_xgb_count)
# count_train / count_test were built from the text split, so they must be paired with
# that split's labels rather than with whatever `y_train` / `y_test` currently hold.
xg_reg.fit(count_train, y_text_train)
# The regressor returns continuous scores; values >= 0.5 are labelled 1, the rest 0.
y_pred = (xg_reg.predict(count_test) >= 0.5).astype(int)

accuracy = accuracy_score(y_text_test, y_pred)
print(accuracy)

### Hashing vectorizer

In [None]:
import xgboost as xgb

xg_reg = xgb.XGBRegressor(objective ='reg:logistic',
                colsample_bytree = 0.45, colsample_bylevel = 0.4, learning_rate = 0.05,
                max_depth = 25, min_child_weight = 1.1, alpha = 0.5, gamma =  0.4, n_estimators = 210)
# hash_train / hash_test were built from the text split, so they must be paired with
# that split's labels rather than with whatever `y_train` / `y_test` currently hold.
xg_reg.fit(hash_train, y_text_train)
# The regressor returns continuous scores; values >= 0.5 are labelled 1, the rest 0.
y_pred = (xg_reg.predict(hash_test) >= 0.5).astype(int)

accuracy = accuracy_score(y_text_test, y_pred)
print(accuracy)

# LSTM

### CHEQUEAR ESTO

In [None]:
# NOTE: this repeats the stop-word / stemmer setup from the first cell of the notebook.
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Rebinds `stopwords` from the corpus reader to the plain list of English stop words.
stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')

def remove_stopword(text):
    """Filter a sequence of tokens and join the survivors with single spaces.

    A token is kept only when it is not in the notebook-level ``stopwords`` list
    and ``str.isalpha()`` is true for it. This redefines the helper of the same
    name declared in the feature-engineering section.
    """
    kept = [token for token in text if token not in stopwords and token.isalpha()]
    return " ".join(kept)

def stemm(text):
    """Stem each whitespace-separated word of ``text`` with the shared ``stemmer``.

    Returns the stemmed words joined by single spaces. This redefines the helper
    of the same name declared in the feature-engineering section.
    """
    return " ".join([stemmer.stem(token) for token in text.split()])

In [None]:
# Translation table mapping every punctuation character to a space, built once.
punct_to_space = {ord(ch): ' ' for ch in string.punctuation}

# Clean the training text: strip punctuation, lower-case, tokenise, drop stop words
# and non-alphabetic tokens, then stem.
# NOTE: this overwrites the raw 'text' column, so every later cell sees the cleaned text.
tweets_metrics['text'] = (
    tweets_metrics['text']
    .apply(lambda raw: raw.translate(punct_to_space))
    .apply(lambda spaced: spaced.lower())
    .str.split()
    .apply(remove_stopword)
    .apply(stemm)
)
tweets_metrics.head()

In [None]:
# `test_metrics` is not created by any earlier cell visible in this notebook; derive it
# from the raw test frame so the cell runs on a fresh kernel and can be re-run safely.
test_metrics = test.copy()

# Translation table mapping every punctuation character to a space, built once.
punct_to_space = {ord(ch): ' ' for ch in string.punctuation}

# Same cleaning as for the training text: strip punctuation, lower-case, tokenise,
# drop stop words and non-alphabetic tokens, then stem.
test_metrics['text'] = (
    test_metrics['text']
    .apply(lambda raw: raw.translate(punct_to_space).lower())
    .str.split()
    .apply(remove_stopword)
    .apply(stemm)
)
test_metrics.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Bidirectional, Concatenate, Flatten, Attention
from keras.models import Model,Sequential
from keras.callbacks import EarlyStopping

In [None]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the Embedding input size.
max_words = 10000
# Length every token sequence is padded to.
max_len = 140

In [None]:
def RNN():
    """Build the uncompiled text classifier used by the LSTM experiments.

    A single input of ``max_len`` token ids goes through a 50-dimensional
    embedding, two stacked bidirectional LSTMs (256 units returning sequences,
    then 4 units), a 64-unit ReLU layer with 20% dropout and a final
    one-unit sigmoid output.
    """
    token_ids = Input(name='inputs',shape=[max_len])
    hidden = Embedding(max_words,50,input_length=max_len)(token_ids)
    hidden = Bidirectional(LSTM(256,return_sequences=True))(hidden)
    hidden = Bidirectional(LSTM(4))(hidden)

    # Classification head, applied in order.
    head = (Dense(64), Activation('relu'), Dropout(0.2), Dense(1), Activation('sigmoid'))
    for block in head:
        hidden = block(hidden)

    return Model(inputs=token_ids,outputs=hidden)

model = RNN()
model.summary()

### Con Features - 75% del set


In [None]:
# Multiple inputs (token sequences + engineered features) - train on 75% of the set
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint


def RNN_with_features(n_features):
    """Build the two-input variant of the LSTM classifier.

    The text branch is the same stack as ``RNN()`` (embedding + two bidirectional
    LSTMs). Its output is concatenated with a vector of ``n_features`` scaled
    numeric features before the dense head. ``RNN()`` itself accepts a single
    input, so it cannot be fitted on ``[sequences, features]``.
    """
    text_in = Input(name='inputs', shape=[max_len])
    feat_in = Input(name='features', shape=[n_features])
    layer = Embedding(max_words, 50, input_length=max_len)(text_in)
    layer = Bidirectional(LSTM(256, return_sequences=True))(layer)
    layer = Bidirectional(LSTM(4))(layer)
    # Join the text representation with the engineered features.
    layer = Concatenate()([layer, feat_in])
    layer = Dense(64)(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(1)(layer)
    layer = Activation('sigmoid')(layer)
    return Model(inputs=[text_in, feat_in], outputs=layer)


# Columns from position 4 onward are the engineered features; copy them so that adding
# "text" below does not write into a slice of tweets_metrics.
X_train = tweets_metrics.iloc[:,4:].copy()
X_train["text"] = tweets_metrics["text"]

Y_train = tweets_metrics.target
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
Y_train = Y_train.reshape(-1,1)

X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.25)

# The tokenizer vocabulary is learned from the training text only.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train["text"])

sequences = tok.texts_to_sequences(X_train["text"])
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

test_sequences = tok.texts_to_sequences(X_test["text"])
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

# "text" is the last column, so [:, :-1] selects the numeric features only.
features = StandardScaler()
X_train_features = features.fit_transform(X_train.iloc[:,:-1])
X_test_features = features.transform(X_test.iloc[:,:-1])

# Keep the weights with the lowest validation loss
weight_path="Checkpoints/LSTM.hdf5"
# Create the checkpoint directory if it is missing so the weights can be written.
os.makedirs(os.path.dirname(weight_path), exist_ok=True)
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=1)
callbacks = [checkpoint, early_stopping]

# Run the fit
model = RNN_with_features(X_train_features.shape[1])
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])
model.fit([sequences_matrix,X_train_features],Y_train,batch_size=24,epochs=10,validation_split=0.2,callbacks=callbacks,verbose=1)

# Load the best weights and evaluate
model.load_weights(weight_path)
accr = model.evaluate([test_sequences_matrix,X_test_features],Y_test)

print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

### Sin Features - 75% del set


In [None]:
# Without engineered features
# Prepare the data for the text-only run
X_train = tweets_metrics.text
Y_train = tweets_metrics.target
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
# Column vector of labels.
Y_train = Y_train.reshape(-1,1)

# Comment out to generate a submission
X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.25)

# The tokenizer vocabulary is learned from the training text only.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

# Keep the weights with the lowest validation loss
weight_path="Checkpoints/LSTM_No_Features.hdf5"
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Stop as soon as the validation loss fails to improve for one epoch.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=1)
callbacks = [checkpoint, early_stopping]

# Run the fit
model = RNN()
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])
model.fit(sequences_matrix,Y_train,batch_size=71,epochs=10,validation_split=0.2,callbacks=callbacks,verbose=1)

# Load the best weights and evaluate
model.load_weights(weight_path)

# Comment out to generate a submission - without features
accr = model.evaluate(test_sequences_matrix,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

# Logistic Regression

### CountVectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Unseeded 75% / 25% split of the tweet text.
# NOTE: this rebinds the notebook-level x_train / x_test / y_train / y_test.
x_train, x_test, y_train, y_test = train_test_split(
    tweets_metrics.text, tweets_metrics.target, test_size=0.25
)

# Word counts: vocabulary learned on the training text, reused for the test text.
vectorizer = CountVectorizer(analyzer='word')
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

model = LogisticRegression()
model.fit(x_train, y_train)
# score() is the mean accuracy on the hold-out part.
score = model.score(x_test, y_test)

print("Presicion:", score)


### TF-IDF

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Unseeded 80% / 20% split of the tweet text.
# NOTE: this rebinds the notebook-level x_train / x_test / y_train / y_test.
x_train, x_test, y_train, y_test = train_test_split(
    tweets_metrics.text, tweets_metrics.target, test_size=0.20
)

# TF-IDF weights learned on the training text, reused for the test text.
vectorizer = TfidfVectorizer(analyzer="word", smooth_idf = False)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

model = LogisticRegression(solver="newton-cg")
model.fit(x_train, y_train)
# score() is the mean accuracy on the hold-out part.
score = model.score(x_test, y_test)

print("Presicion:", score)

### Hashing Vectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import HashingVectorizer

# Unseeded 80% / 20% split of the tweet text.
# NOTE: this rebinds the notebook-level x_train / x_test / y_train / y_test.
x_train, x_test, y_train, y_test = train_test_split(
    tweets_metrics.text, tweets_metrics.target, test_size=0.20
)

# Hashed word features with a fixed number of columns.
vectorizer = HashingVectorizer(analyzer="word",n_features=60000)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

model = LogisticRegression(solver="newton-cg")
model.fit(x_train, y_train)
# score() is the mean accuracy on the hold-out part.
score = model.score(x_test, y_test)

print("Presicion:", score)

# SVM

In [None]:
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Text and integer-encoded labels shared by the SVM cells below.
X_train = tweets_metrics.text
Y_train = tweets_metrics.target
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
# Column vector of labels.
Y_train = Y_train.reshape(-1,1)

# Comment out to generate a submission
X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.25)

### CountVectorizer

In [None]:
# Vectorise into new names: X_train / X_test must stay raw text so that this cell can
# be re-run and the other SVM cells can vectorise the same split themselves.
vectorizer = CountVectorizer(analyzer='word')
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

# Run the fit; the labels are stored as a column vector, so flatten them for the estimator.
model = svm.SVC(kernel='linear')
model.fit(X_train_count, Y_train.ravel())
Y_pred = model.predict(X_test_count)
print("Accuracy:",metrics.accuracy_score(Y_test.ravel(), Y_pred))

### TF-IDF

In [None]:
# Vectorise into new names: X_train / X_test must stay raw text so that this cell can
# be re-run and the other SVM cells can vectorise the same split themselves.
vectorizer = TfidfVectorizer(analyzer="word", smooth_idf = False)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Run the fit; the labels are stored as a column vector, so flatten them for the estimator.
model = svm.SVC(kernel='linear')
model.fit(X_train_tfidf, Y_train.ravel())
Y_pred = model.predict(X_test_tfidf)
print("Accuracy:",metrics.accuracy_score(Y_test.ravel(), Y_pred))

### Hashing Vectorizer

In [None]:
# Vectorise into new names: X_train / X_test must stay raw text so that this cell can
# be re-run and the other SVM cells can vectorise the same split themselves.
vectorizer = HashingVectorizer(analyzer="word",n_features=800000)
X_train_hash = vectorizer.fit_transform(X_train)
X_test_hash = vectorizer.transform(X_test)

# Run the fit; the labels are stored as a column vector, so flatten them for the estimator.
model = svm.SVC(kernel='linear')
model.fit(X_train_hash, Y_train.ravel())
Y_pred = model.predict(X_test_hash)
print("Accuracy:",metrics.accuracy_score(Y_test.ravel(), Y_pred))

# MAURO WIP

# Keras

### CNN

In [None]:
from keras.models import Sequential
from keras import layers
# Size of each learned word vector.
embedding_dim = 100

# NOTE(review): `vocab_size` and `maxlen` are not assigned by any earlier cell visible in this
# notebook; the "K fold CV" cell further down assigns both, so this cell appears to depend on
# that cell having been run first — confirm and reorder if so.
# Embedding -> 1D convolution (128 filters, width 5) -> global max pooling -> dense head with sigmoid output.
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# NOTE(review): the last earlier assignment of `x_train` visible above is a vectorised text
# matrix from the logistic-regression cells; this model presumably expects the padded token
# sequences built in the "K fold CV" cell further down — confirm the intended execution order.
# Two epochs on the full training data; the hold-out evaluation below is commented out.
history = model.fit(x_train, y_train,
                    epochs=2,
                    verbose=1,
                    #validation_data=(x_test, y_test),
                    batch_size=65)
#loss, accuracy = model.evaluate(x_train, y_train, verbose=False)
#print("Training Accuracy: {:.4f}".format(accuracy))
#loss, accuracy = model.evaluate(x_test, y_test, verbose=False)
#print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:

from keras.models import Sequential
from keras.layers import Dropout, Flatten
from keras import layers
#embedding_dim = 100

# NOTE(review): `embedding_matrix`, `vocab_size` and `maxlen` are not assigned by any earlier
# cell visible in this notebook; the "K fold CV" cell further down assigns them — confirm the
# intended execution order.
# Same idea as the previous CNN, but the embedding starts from `embedding_matrix` (still trainable)
# and a second convolution (32 filters) is stacked on the first.
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=True))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.Conv1D(32, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(12, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()


In [None]:
from keras.callbacks import EarlyStopping
# Stop as soon as the validation loss fails to improve for one epoch.
callback = EarlyStopping(monitor = 'val_loss', patience = 1)

# Up to 15 epochs; validation_split=0.1 holds part of the training data out for validation.
history = model.fit(x_train, y_train,
                    epochs=15,
                    verbose=1,
                    #validation_data=(x_test, y_test),
                    validation_split=0.1,
                    batch_size=88,
                    callbacks = [callback])

### K fold CV

In [None]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text file of word vectors.

    Each useful line of ``filepath`` is expected to hold a word followed by its
    vector components, separated by whitespace.

    Parameters
    ----------
    filepath : path of the UTF-8 text file with the pretrained vectors.
    word_index : mapping from word to its (1-based) row index.
    embedding_dim : number of leading components copied for each word.

    Returns
    -------
    numpy array of shape ``(len(word_index) + 1, embedding_dim)``. Row 0 and the
    rows of words that are missing from the file (or whose line is malformed)
    stay all-zero.
    """
    vocab_size = len(word_index) + 1  # Adding 1 because index 0 is reserved
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            # Skip blank lines and entries with fewer than embedding_dim components:
            # unpacking them would raise, or a short vector would be silently
            # broadcast across the whole row.
            if len(parts) <= embedding_dim:
                continue
            idx = word_index.get(parts[0])
            if idx is None:
                continue
            # Only the components that are kept get parsed.
            embedding_matrix[idx] = np.array(
                parts[1:embedding_dim + 1], dtype=np.float32)

    return embedding_matrix

In [None]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    """Build and compile the single-convolution text CNN used by the parameter search.

    The embedding layer is initialised from the notebook-level ``embedding_matrix``
    (not a parameter) and remains trainable. ``num_filters`` and ``kernel_size``
    configure the convolution; the remaining arguments size the embedding layer.
    """
    stack = [
        layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=True),
        layers.Conv1D(num_filters, kernel_size, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(10, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]
    network = Sequential()
    for block in stack:
        network.add(block)
    network.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
    return network

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dropout, Flatten
from keras.preprocessing.sequence import pad_sequences
from keras import layers
from keras.preprocessing.text import Tokenizer
# Stop a fit as soon as the validation loss fails to improve for one epoch.
callback = EarlyStopping(monitor = 'val_loss', patience = 1, verbose=1)

# The whole training set is used; no hold-out part is kept in this cell.
# NOTE: this rebinds the notebook-level x_train / y_train.
x_train = tweets_metrics['text'].values
y_train = tweets_metrics['target'].values

# Main settings
epochs = 15
embedding_dim = 100
maxlen = 140

# Train-test split
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1, random_state=1000)

# Tokenize words
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
#x_test = tokenizer.texts_to_sequences(x_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
#x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

# Pretrained vectors for the tokenizer vocabulary; create_model reads this notebook-level name.
embedding_matrix = create_embedding_matrix('glove.twitter.27B.100d.txt',tokenizer.word_index, embedding_dim)

# Parameter grid for grid search
# (single-value entries only pass fixed arguments through to create_model)
param_grid = dict(num_filters=[32, 128, 144],
                      kernel_size=[3, 5, 7],
                      vocab_size=[vocab_size],
                      embedding_dim=[embedding_dim],
                      maxlen=[maxlen],
                      batch_size = [45,65,76,88])

# scikit-learn compatible wrapper around create_model.
model = KerasClassifier(build_fn=create_model,
                            epochs=epochs, validation_split=0.1,
                            verbose=1)

# 5 random configurations, each scored with 4-fold cross-validation.
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                              cv=4, verbose=2, n_iter=5, n_jobs=1)

# The early-stopping callback is passed as a fit parameter.
grid_result = grid.fit(x_train, y_train, callbacks=[callback])

# Evaluate testing set
#test_accuracy = grid.score(x_test, y_test)

# Save and evaluate results
s = ('Best Accuracy : {:.4f}\n{}\n\n\n')
output_string = s.format(
            grid_result.best_score_,
            grid_result.best_params_)
            
print(output_string)