In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
tweets = pd.read_csv("train.csv", usecols=['id','text', 'target'])
test = pd.read_csv("test.csv")

In [3]:
duplicates = tweets.duplicated(subset = 'text', keep = False)
duplicates.value_counts()

False    7434
True      179
dtype: int64

In [4]:
import string

tweets['text'] = tweets['text'].apply(lambda x: x.translate({ord(i): ' ' for i in string.punctuation}))
tweets.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this earthquake M...,1
1,4,Forest fire near La Ronge Sask Canada,1
2,5,All residents asked to shelter in place are ...,1
3,6,13 000 people receive wildfires evacuation or...,1
4,7,Just got sent this photo from Ruby Alaska as ...,1


In [5]:
tweets['text'] = tweets['text'].apply(lambda x: x.lower())

In [6]:
import io
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/agustin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def remove_stopword(text):
    new_text = []
    for e in text:
        if e not in stopwords and e.isalpha():
            new_text.append(e)
    text = new_text
    return " ".join(new_text)

In [8]:
tweets['text'] = tweets['text'].str.split()
tweets['text'] = tweets['text'].apply(remove_stopword)
tweets.head()

Unnamed: 0,id,text,target
0,1,deeds reason earthquake may allah forgive us,1
1,4,forest fire near la ronge sask canada,1
2,5,residents asked shelter place notified officer...,1
3,6,people receive wildfires evacuation orders cal...,1
4,7,got sent photo ruby alaska smoke wildfires pou...,1


In [9]:
stemmer = SnowballStemmer('english')

In [10]:
def stemm(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

In [11]:
tweets['text'] = tweets['text'].apply(stemm)
tweets.head()

Unnamed: 0,id,text,target
0,1,deed reason earthquak may allah forgiv us,1
1,4,forest fire near la rong sask canada,1
2,5,resid ask shelter place notifi offic evacu she...,1
3,6,peopl receiv wildfir evacu order california,1
4,7,got sent photo rubi alaska smoke wildfir pour ...,1


### TF-IDF

In [12]:
x_train, x_test, y_train, y_test = \
train_test_split(tweets['text'], tweets['target'], test_size = 0.25, random_state = 123)

In [13]:
tfid_vectorizer = TfidfVectorizer()
train_vectors = tfid_vectorizer.fit_transform(x_train)
test_vectors = tfid_vectorizer.transform(x_test)
print(train_vectors.shape, test_vectors.shape)

(5709, 11191) (1904, 11191)


#### Naive Bayes básico

En el caso de una predicción binaria es recomendable bernoulli

In [14]:
from sklearn.naive_bayes import BernoulliNB

bernoulli = BernoulliNB().fit(train_vectors, y_train)

In [15]:
predicted = bernoulli.predict(test_vectors)
print(accuracy_score(y_test, predicted))

NameError: name 'accuracy_score' is not defined

#### Tunning

In [16]:
tfid_matrix = tfid_vectorizer.transform(x_train)
array = tfid_matrix.todense()

In [17]:
df = pd.DataFrame(array)
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11181,11182,11183,11184,11185,11186,11187,11188,11189,11190
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
features = df.columns

In [19]:
alpha_b = np.linspace(0.1, 0.9, 20)
alpha_b = np.around(alpha_b, decimals=4)
alpha_b

array([0.1   , 0.1421, 0.1842, 0.2263, 0.2684, 0.3105, 0.3526, 0.3947,
       0.4368, 0.4789, 0.5211, 0.5632, 0.6053, 0.6474, 0.6895, 0.7316,
       0.7737, 0.8158, 0.8579, 0.9   ])

In [20]:
grid_m = [{"alpha":alpha_b}]

In [21]:
classifier = BernoulliNB()
gridsearch = GridSearchCV(classifier, grid_m, scoring = 'neg_log_loss', cv = 4)
gridsearch.fit(df[features], y_train)
print("Best parameter: ",gridsearch.best_params_)

NameError: name 'GridSearchCV' is not defined

In [22]:
bernoulli_t = BernoulliNB(alpha=0.9).fit(train_vectors, y_train)
predicted_t = bernoulli_t.predict(test_vectors)
print(accuracy_score(y_test, predicted_t))

NameError: name 'accuracy_score' is not defined

* Submit de prueba

In [27]:
test_vectors = tfid_vectorizer.transform(test['text'])
print(test_vectors.shape, train_vectors.shape)
predicted_t = bernoulli_t.predict(test_vectors)
predicted_t

(3263, 11191) (5709, 11191)


array([0, 0, 1, ..., 1, 1, 0])

In [24]:
test['target'] = predicted_t

In [25]:
submit_kaggle = test.loc[:, ['id', 'target']]
submit_kaggle.to_csv("submit_prueba_1.csv", index=False)

### LightGBM

* TF-IDF vectorization

In [None]:
vectorizer = TfidfVectorizer()
tfidf_lgbm = vectorizer.fit_transform(tweets.loc[:, 'text'])
array = tfidf_lgbm.todense()

In [None]:
df = pd.DataFrame(array)
df['output'] = tweets['target']
df.head(10)

In [None]:
features = df.columns.drop('output')

In [None]:
x = df.loc[:, features].values
y = df.loc[:, 'output'].values

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 123)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
import lightgbm as lgb
d_train = lgb.Dataset(x_train, label=y_train)
params = {}
params['learning_rate'] = 0.005
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 50
params['min_data'] = 100
params['max_depth'] = 50
params['max_bin'] = 50
params['num_iterations'] = 200
gbm = lgb.train(params, d_train,
                num_boost_round=100)

In [None]:
y_pred = clf.predict(x_test)

for i in range (0, len(y_pred)):
    if i <= 0.5:       
        y_pred[i] = 1 
    else:  
        y_pred[i]=0

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred,y_test)

In [None]:
print(accuracy)