In [None]:
import numpy as np
import pandas as pd
import string
import xgboost as xgb
import io
import nltk
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the NLTK resources used below: the VADER lexicon (sentiment scores)
# and the stopword corpus.
nltk.download('vader_lexicon')
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus reader
# to the list of English stopwords; every later `in stopwords` test relies on that.
stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')

from textblob import TextBlob
# presumably required by TextBlob's tokenizer / POS tagger (`.tags`) - TODO confirm
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
# One shared VADER analyzer, reused for every tweet.
sia = SentimentIntensityAnalyzer()


def return_sia_compound_values(text):
    """Return the 'compound' entry of the VADER polarity scores for `text`."""
    scores = sia.polarity_scores(text)
    return scores['compound']

In [None]:
def remove_stopword(text, stop_words=None):
    """Drop stopwords and non-alphabetic tokens from a tokenised tweet.

    Args:
        text: iterable of string tokens (e.g. the result of ``str.split()``).
        stop_words: optional collection of words to discard. When omitted,
            the module-level ``stopwords`` collection is used.

    Returns:
        The surviving tokens joined by single spaces ("" when none survive).

    Note:
        The membership test is case-sensitive, so callers that want "The"
        removed must lowercase the tokens first.
    """
    if stop_words is None:
        stop_words = stopwords
    return " ".join(
        token for token in text
        if token not in stop_words and token.isalpha()
    )

def stemm(text):
    """Stem each whitespace-separated token of `text` with the module-level
    `stemmer` and return the stems joined by single spaces."""
    return " ".join([stemmer.stem(token) for token in text.split()])

def contains_punctuation(text):
    """Return True when at least one character of `text` is in `string.punctuation`."""
    marks = frozenset(string.punctuation)
    return any(char in marks for char in text)

def amount_of_punctuation(text):
    """Count the characters of `text` that are in `string.punctuation`."""
    marks = frozenset(string.punctuation)
    return sum(1 for char in text if char in marks)

def _count_pos_tags(text, tag_prefix):
    """Count the tokens of `text` whose TextBlob POS tag starts with `tag_prefix`."""
    return sum(1 for _, tag in TextBlob(text).tags if tag.startswith(tag_prefix))


def get_adjectives(text):
    """Return how many tokens of `text` carry a POS tag starting with "JJ"."""
    return _count_pos_tags(text, "JJ")


def get_nouns(text):
    """Return how many tokens of `text` carry a POS tag starting with "NN"."""
    return _count_pos_tags(text, "NN")


def get_verbs(text):
    """Return how many tokens of `text` carry a POS tag starting with "VB"."""
    return _count_pos_tags(text, "VB")


def get_adverbs(text):
    """Return how many tokens of `text` carry a POS tag starting with "RB"."""
    return _count_pos_tags(text, "RB")

In [None]:
# Load the training tweets, keeping only the three columns used below.
tweets = pd.read_csv("train.csv", usecols=['id','text', 'target'])
# Load the test set with all of its columns; later cells read 'id' and 'text'.
test = pd.read_csv("test.csv")

In [None]:
# Discard every tweet whose text occurs more than once
# (keep=False removes all copies, not only the repeats).
tweets = tweets.drop_duplicates(subset='text', keep=False)
tweets.info()

### Feature Engineering


In [None]:
# Work on an explicit copy: the column assignments below would otherwise write
# into a slice of `tweets` and trigger pandas' SettingWithCopyWarning.
tweets_metrics = tweets[['id','text','target']].copy()

# NOTE: the column order produced here matters; later cells select the numeric
# features positionally with iloc[:, 4:].
tweet_words = tweets_metrics['text'].str.split()
tweets_metrics['text_without_stopwords'] = tweet_words.apply(remove_stopword)

# Size / vocabulary features.
tweets_metrics['length'] = tweets_metrics['text'].apply(len)
tweets_metrics['avg_word_length'] = tweet_words.apply(lambda words: np.mean([len(word) for word in words]))
tweets_metrics['amount_of_words'] = tweet_words.apply(len)
unique_words_by_tweet = tweet_words.apply(lambda words: len(set(words)))
tweets_metrics['amount_of_unique_words'] = unique_words_by_tweet

# Sentiment, stopword and punctuation features.
tweets_metrics['sentiment'] = tweets_metrics['text'].apply(return_sia_compound_values)
tweets_metrics['stopwords_count'] = tweets_metrics['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
tweets_metrics['punctuation_count'] = tweets_metrics['text'].apply(amount_of_punctuation)

# Twitter-specific tokens.
mentions = tweets_metrics['text'].str.findall(r'@.\S*?(?=\s|[:]|$)')
tweets_metrics['mentions_count'] = mentions.apply(len)
hashtags = tweets_metrics['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
tweets_metrics['hashtags_count'] = hashtags.apply(len)

# Longest remaining word, ignoring tokens that start with 'http'; 0 when nothing is left.
tweets_metrics['longest_word_length_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(
    lambda x: max((len(word) for word in str(x).lower().split() if not word.startswith('http')), default=0)
)
tweets_metrics['stopword_word_ratio'] = tweets_metrics['stopwords_count'] / tweets_metrics['amount_of_words']

# Part-of-speech counts.
tweets_metrics['adjectives_count'] = tweets_metrics['text'].apply(get_adjectives)
tweets_metrics['nouns_count'] = tweets_metrics['text'].apply(get_nouns)
tweets_metrics['verbs_count'] = tweets_metrics['text'].apply(get_verbs)
tweets_metrics['adverbs_count'] = tweets_metrics['text'].apply(get_adverbs)

tweets_metrics.head()

In [None]:
# Work on an explicit copy: the column assignments below would otherwise write
# into a slice of `test` and trigger pandas' SettingWithCopyWarning.
test_metrics = test[['id','text']].copy()

# NOTE: the column order produced here matters; later cells select the numeric
# features positionally with iloc[:, 3:].
test_words = test_metrics['text'].str.split()
test_metrics['text_without_stopwords'] = test_words.apply(remove_stopword)

# Size / vocabulary features.
test_metrics['length'] = test_metrics['text'].apply(len)
test_metrics['avg_word_length'] = test_words.apply(lambda words: np.mean([len(word) for word in words]))
test_metrics['amount_of_words'] = test_words.apply(len)
unique_words_by_tweet = test_words.apply(lambda words: len(set(words)))
test_metrics['amount_of_unique_words'] = unique_words_by_tweet

# Sentiment, stopword and punctuation features.
test_metrics['sentiment'] = test_metrics['text'].apply(return_sia_compound_values)
test_metrics['stopwords_count'] = test_metrics['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
test_metrics['punctuation_count'] = test_metrics['text'].apply(amount_of_punctuation)

# Twitter-specific tokens.
mentions = test_metrics['text'].str.findall(r'@.\S*?(?=\s|[:]|$)')
test_metrics['mentions_count'] = mentions.apply(len)
hashtags = test_metrics['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
test_metrics['hashtags_count'] = hashtags.apply(len)

# Longest remaining word, ignoring tokens that start with 'http'; 0 when nothing is left.
test_metrics['longest_word_length_without_stopwords'] = test_metrics['text_without_stopwords'].apply(
    lambda x: max((len(word) for word in str(x).lower().split() if not word.startswith('http')), default=0)
)
test_metrics['stopword_word_ratio'] = test_metrics['stopwords_count'] / test_metrics['amount_of_words']

# Part-of-speech counts.
test_metrics['adjectives_count'] = test_metrics['text'].apply(get_adjectives)
test_metrics['nouns_count'] = test_metrics['text'].apply(get_nouns)
test_metrics['verbs_count'] = test_metrics['text'].apply(get_verbs)
test_metrics['adverbs_count'] = test_metrics['text'].apply(get_adverbs)

test_metrics.head()

## LSTM

In [None]:
# NOTE(review): this cell repeats the stopword/stemmer setup from the first cell.
# The re-import is what keeps it re-runnable: `stopwords` is rebound to a plain
# list below, so the corpus reader has to be imported again before `.words()`.
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Rebinds `stopwords` from the NLTK corpus reader to the English stopword list.
stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')

def remove_stopword(text, stop_words=None):
    """Drop stopwords and non-alphabetic tokens from a tokenised tweet.

    Args:
        text: iterable of string tokens (e.g. the result of ``str.split()``).
        stop_words: optional collection of words to discard. When omitted,
            the module-level ``stopwords`` collection is used.

    Returns:
        The surviving tokens joined by single spaces ("" when none survive).

    Note:
        The membership test is case-sensitive, so callers that want "The"
        removed must lowercase the tokens first.
    """
    if stop_words is None:
        stop_words = stopwords
    return " ".join(
        token for token in text
        if token not in stop_words and token.isalpha()
    )

def stemm(text):
    """Stem each whitespace-separated token of `text` with the module-level
    `stemmer` and return the stems joined by single spaces."""
    return " ".join([stemmer.stem(token) for token in text.split()])

In [None]:
# Normalise the training text in place: punctuation -> spaces, lowercase,
# tokenise, drop stopwords / non-alphabetic tokens, then stem.
punctuation_to_space = {ord(mark): ' ' for mark in string.punctuation}
tweets_metrics['text'] = (
    tweets_metrics['text']
    .apply(lambda raw: raw.translate(punctuation_to_space))
    .apply(lambda spaced: spaced.lower())
    .str.split()
    .apply(remove_stopword)
    .apply(stemm)
)
tweets_metrics.head()

In [None]:
# Normalise the test text in place: punctuation -> spaces, lowercase,
# tokenise, drop stopwords / non-alphabetic tokens, then stem.
punctuation_to_space = {ord(mark): ' ' for mark in string.punctuation}
test_metrics['text'] = (
    test_metrics['text']
    .apply(lambda raw: raw.translate(punctuation_to_space))
    .apply(lambda spaced: spaced.lower())
    .str.split()
    .apply(remove_stopword)
    .apply(stemm)
)
test_metrics.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Bidirectional, Concatenate, Flatten
from keras.models import Model,Sequential
from keras.callbacks import EarlyStopping

In [None]:
# Vocabulary size handed to the Tokenizer and the Embedding layer.
max_words = 10_000
# Length every token sequence is padded/truncated to.
max_len = 100

In [None]:
#layer = LSTM(256,return_sequences=True)(layer)
def RNN():
    """Build the uncompiled, text-only classifier.

    Layers, in order: an input of `max_len` token ids, a 50-dimensional
    Embedding over `max_words` entries, two stacked bidirectional LSTMs
    (4 units each), Dense(64) + relu, Dropout(0.2), Dense(1) + sigmoid.

    Returns:
        A Keras `Model` with the single input named 'inputs'.
    """
    token_ids = Input(name='inputs',shape=[max_len])

    hidden = Embedding(max_words,50,input_length=max_len)(token_ids)
    hidden = Bidirectional(LSTM(4,return_sequences=True))(hidden)
    hidden = Bidirectional(LSTM(4))(hidden)

    hidden = Dense(64)(hidden)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.2)(hidden)

    probability = Dense(1)(hidden)
    probability = Activation('sigmoid')(probability)

    return Model(inputs=token_ids,outputs=probability)
# Build the network once to inspect its layers and parameter counts.
model = RNN()
model.summary()

In [None]:
# Multiple inputs (token sequences + engineered features) - train on 75% of the set
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence


def RNN_with_features(n_features):
    """Build the uncompiled two-input classifier.

    RNN() only accepts the token sequence, so feeding it
    [sequences, features] fails. This variant adds a second input for the
    `n_features` scaled numeric features and concatenates it with the LSTM
    output before the dense head.

    Args:
        n_features: number of numeric feature columns per tweet.

    Returns:
        A Keras `Model` whose inputs are, in order, 'inputs' (token ids of
        length `max_len`) and 'features' (`n_features` floats).
    """
    text_inputs = Input(name='inputs',shape=[max_len])
    feature_inputs = Input(name='features',shape=[n_features])
    layer = Embedding(max_words,50,input_length=max_len)(text_inputs)
    layer = Bidirectional(LSTM(4,return_sequences=True))(layer)
    layer = Bidirectional(LSTM(4))(layer)
    layer = Concatenate()([layer,feature_inputs])
    layer = Dense(64)(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(1)(layer)
    layer = Activation('sigmoid')(layer)
    return Model(inputs=[text_inputs,feature_inputs],outputs=layer)


# Numeric features start at column 4 (after id, text, target,
# text_without_stopwords). Copy so that adding "text" does not write into a slice.
X_train = tweets_metrics.iloc[:,4:].copy()
X_train["text"] = tweets_metrics["text"]

Y_train = tweets_metrics.target
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
Y_train = Y_train.reshape(-1,1)

X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.25)

# Fit the tokenizer on the training part only, then encode both parts.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train["text"])

sequences = tok.texts_to_sequences(X_train["text"])
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

test_sequences = tok.texts_to_sequences(X_test["text"])
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

# Scale every column except the trailing "text" column; fit on train only.
features = StandardScaler()
X_train_features = features.fit_transform(X_train.iloc[:,:-1])
X_test_features = features.transform(X_test.iloc[:,:-1])

model = RNN_with_features(X_train_features.shape[1])
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])
model.fit([sequences_matrix,X_train_features],Y_train,batch_size=24,epochs=10,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss')],verbose=1)


accr = model.evaluate([test_sequences_matrix,X_test_features],Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
# Without features
# Prepare the data for the text-only experiment
X_train = tweets_metrics.text
Y_train = tweets_metrics.target
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
Y_train = Y_train.reshape(-1,1)

# Comment this split out to train on the whole set when generating a submission
X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.25)

tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

# Start from a freshly initialised network. Without this line the cell would
# recompile and keep training whatever `model` an earlier cell left behind,
# which has already seen rows that land in this cell's hold-out split.
model = RNN()
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])
model.fit(sequences_matrix,Y_train,batch_size=71,epochs=10,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss')])

# Comment the evaluation out as well when generating a submission (no hold-out then)
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
accr = model.evaluate(test_sequences_matrix,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

### LSTM - TEST.csv (CON  features)

In [None]:
# Multiple inputs (token sequences + engineered features) - train on the whole set
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence


def RNN_with_features(n_features):
    """Build the uncompiled two-input classifier.

    RNN() only accepts the token sequence, so feeding it
    [sequences, features] fails. This variant adds a second input for the
    `n_features` scaled numeric features and concatenates it with the LSTM
    output before the dense head.

    Args:
        n_features: number of numeric feature columns per tweet.

    Returns:
        A Keras `Model` whose inputs are, in order, 'inputs' (token ids of
        length `max_len`) and 'features' (`n_features` floats).
    """
    text_inputs = Input(name='inputs',shape=[max_len])
    feature_inputs = Input(name='features',shape=[n_features])
    layer = Embedding(max_words,50,input_length=max_len)(text_inputs)
    layer = Bidirectional(LSTM(4,return_sequences=True))(layer)
    layer = Bidirectional(LSTM(4))(layer)
    layer = Concatenate()([layer,feature_inputs])
    layer = Dense(64)(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(1)(layer)
    layer = Activation('sigmoid')(layer)
    return Model(inputs=[text_inputs,feature_inputs],outputs=layer)


# Numeric features start at column 4 of the training frame and column 3 of the
# test frame (which has no 'target'). Copy so that adding "text" does not write
# into a slice.
X_train = tweets_metrics.iloc[:,4:].copy()
X_train["text"] = tweets_metrics["text"]

Y_train = tweets_metrics.target
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
Y_train = Y_train.reshape(-1,1)

X_test = test_metrics.iloc[:,3:].copy()
X_test["text"] = test_metrics["text"]

tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train["text"])

sequences = tok.texts_to_sequences(X_train["text"])
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

test_sequences = tok.texts_to_sequences(X_test["text"])
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

# Scale every column except the trailing "text" column; fit on train only.
features = StandardScaler()
X_train_features = features.fit_transform(X_train.iloc[:,:-1])
X_test_features = features.transform(X_test.iloc[:,:-1])

model = RNN_with_features(X_train_features.shape[1])
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])
model.fit([sequences_matrix,X_train_features],Y_train,batch_size=24,epochs=1,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss')],verbose=1)

# The model has two inputs, so prediction needs both. predict() returns an
# (n, 1) array; flatten it before thresholding at 0.5.
probabilities = model.predict([test_sequences_matrix,X_test_features]).ravel()

submission = pd.DataFrame()
submission['id'] = test_metrics['id']
submission['target'] = (probabilities >= .5).astype(int)
submission.to_csv("submit_prueba_X.csv", index=False)
submission.head(10)

## LSTM - TEST.csv (no tiene features)

In [None]:
# Expects `tok` and `model` to come from the text-only ("Sin features") training
# cell; re-run that cell first if a later cell has reassigned them.
# The preprocessed test frame is `test_metrics`; `tweets_test` and
# `submission1` were never defined and raised NameError.
X_test = test_metrics.text
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

# predict() returns an (n, 1) array; flatten it before thresholding at 0.5.
probabilities = model.predict(test_sequences_matrix).ravel()

submission = pd.DataFrame()
submission['id'] = test_metrics['id']
submission['target'] = (probabilities >= .5).astype(int)
# NOTE(review): same file name as the with-features submission; rename one of
# them if both results are needed.
submission.to_csv("submit_prueba_X.csv", index=False)
submission.head(10)

## Logistic Regression

In [None]:
# Common starting point for every method below: a random 75/25 split of the
# preprocessed text and an untrained logistic regression.
from sklearn.linear_model import LogisticRegression

x_train, x_test, y_train, y_test = train_test_split(
    tweets_metrics.text, tweets_metrics.target, test_size=0.25
)
model = LogisticRegression()

### CountVectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Random 75/25 split of the preprocessed text.
x_train, x_test, y_train, y_test = train_test_split(
    tweets_metrics.text, tweets_metrics.target, test_size=0.25
)

# Bag-of-words counts; the vocabulary is learned from the training part only.
vectorizer = CountVectorizer(analyzer='word')
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

model = LogisticRegression()
model.fit(x_train, y_train)
score = model.score(x_test, y_test)

print("Presicion:", score)

### TF-IDF

In [None]:
def tf_idf():
    """Train and score a TF-IDF + logistic-regression baseline.

    Draws a new random 80/20 split of `tweets_metrics` on every call, fits
    the vectorizer and the classifier on the 80% part and scores them on
    the remaining 20%.

    Returns:
        The value of `LogisticRegression.score` on the held-out part.

    Note:
        The fitted vectorizer and classifier are local to this function and
        are discarded when it returns.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.feature_extraction.text import TfidfVectorizer

    texts_fit, texts_eval, labels_fit, labels_eval = train_test_split(
        tweets_metrics.text, tweets_metrics.target, test_size=0.20
    )

    tfidf = TfidfVectorizer(analyzer="word", smooth_idf = False)
    features_fit = tfidf.fit_transform(texts_fit)
    features_eval = tfidf.transform(texts_eval)

    classifier = LogisticRegression(solver="newton-cg")
    classifier.fit(features_fit, labels_fit)
    return classifier.score(features_eval, labels_eval)

In [None]:
# Re-draw the random split until the hold-out score reaches 0.82.
# The retry count is capped so that Run All cannot hang when the threshold is
# never reached.
# NOTE(review): keeping only the luckiest split overstates the expected score.
MAX_RETRIES = 100

score = tf_idf()
iteracion = 0
while score < 0.82 and iteracion < MAX_RETRIES:
    score = tf_idf()
    print (f"{iteracion}:{score}")
    iteracion += 1

In [None]:
# Fit the TF-IDF + logistic-regression pair here, on every training tweet.
# The objects created inside tf_idf() are local to that function, so the
# global `vectorizer` / `model` would be whatever an earlier cell left behind
# rather than the TF-IDF model this section is about.
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(analyzer="word", smooth_idf = False)
model = LogisticRegression(solver="newton-cg")
model.fit(vectorizer.fit_transform(tweets_metrics.text), tweets_metrics.target)

x_test = vectorizer.transform(test_metrics.text)

submission = pd.DataFrame()
submission['id'] = test_metrics['id']
# predict() already returns the class labels, so no thresholding is needed.
submission['target'] = model.predict(x_test)
submission.to_csv("submit_prueba_16.csv", index=False)
submission.head(10)

### Hashing Vectorizer

In [None]:
 def h_vec():   
    from sklearn.linear_model import LogisticRegression
    x_train = tweets_metrics.text
    y_train = tweets_metrics.target

    x_train,x_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.15)

    model = LogisticRegression(solver="liblinear")

    from sklearn.feature_extraction.text import HashingVectorizer

    vectorizer = HashingVectorizer(analyzer="word",n_features=60000)
    vectorizer.fit(x_train)
    x_train = vectorizer.transform(x_train)
    x_test  = vectorizer.transform(x_test)

    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)

    #print("Presicion:", score)
    return score

In [None]:
# Re-draw the random split until the hold-out score reaches 0.825.
# The retry count is capped so that Run All cannot hang when the threshold is
# never reached.
# NOTE(review): keeping only the luckiest split overstates the expected score.
MAX_RETRIES = 100

score = h_vec()
iteracion = 0
while score < 0.825 and iteracion < MAX_RETRIES:
    score = h_vec()
    iteracion += 1
    print (f"{iteracion}:{score}")

In [None]:
# Fit the hashing + logistic-regression pair here, on every training tweet.
# The objects created inside h_vec() are local to that function, so the
# global `vectorizer` / `model` would be whatever an earlier cell left behind
# rather than the hashing model this section is about.
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(analyzer="word",n_features=60000)
model = LogisticRegression(solver="liblinear")
model.fit(vectorizer.fit_transform(tweets_metrics.text), tweets_metrics.target)

x_test = vectorizer.transform(test_metrics.text)

submission = pd.DataFrame()
submission['id'] = test_metrics['id']
# predict() already returns the class labels, so no thresholding is needed.
submission['target'] = model.predict(x_test)
submission.to_csv("submit_prueba_17.csv", index=False)
submission.head(10)