In [1]:
!pip install --upgrade pip
!pip install pymongo
!pip install -U nltk
!pip install joblib

Collecting pip
  Downloading https://files.pythonhosted.org/packages/46/dc/7fd5df840efb3e56c8b4f768793a237ec4ee59891959d6a215d63f727023/pip-19.0.1-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 403kB/s 
[?25hInstalling collected packages: pip
  Found existing installation: pip 9.0.3
    Uninstalling pip-9.0.3:
      Successfully uninstalled pip-9.0.3
Successfully installed pip-19.0.1
Collecting pymongo
[?25l  Downloading https://files.pythonhosted.org/packages/b1/45/5440555b901a8416196fbf2499c4678ef74de8080c007104107a8cfdda20/pymongo-3.7.2-cp36-cp36m-manylinux1_x86_64.whl (408kB)
[K    100% |████████████████████████████████| 409kB 587kB/s 
[?25hInstalling collected packages: pymongo
Successfully installed pymongo-3.7.2
Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/6f/ed/9c755d357d33bc1931e157f537721efb5b88d2c583fe593cc09603076cc3/nltk-3.4.zip (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 1.6MB/s 
Buildi

In [68]:
from pymongo import MongoClient
import pandas as pd
import re
import nltk

from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from joblib import dump, load

#Download do corpus da nltk
#nltk.download()

In [69]:
def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]

In [70]:
def read_mongo(db, collection, query={}, host='ds249824.mlab.com', port='49824', username='app', password='nodeapp01', no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

In [71]:
acidente = read_mongo('labeling_zone','tweets',query={'label':'Sim'})
nao_acidente = read_mongo('labeling_zone','tweets',query={'label':'Não'})
full_data = acidente.append(nao_acidente[0:170])

full_data.head()

Unnamed: 0,data,id,label,labeled_by,texto,usuario
0,Sun Dec 16 11:27:33 +0000 2018,1074264765747335169,Sim,,"""""""Cidadão de bem"""""" confessou ter matado duas...",AIice_Costa
1,Sat Dec 29 10:22:21 +0000 2018,1078959397752266753,Sim,,"""#INFO #RDRJ temporealnews RT bandnewsfmrio: P...",TempoRealNews
2,Fri Dec 28 10:22:18 +0000 2018,1078596997848928257,Sim,,"""#RDRJ RT OperacoesRio: AV. BRASIL | BONSUCESS...",TempoRealNews
3,Thu Dec 27 10:03:12 +0000 2018,1078229806150496256,Sim,,"""#RDRJ RT OperacoesRio: CAMINHO PARA RODOVIÁRI...",TempoRealNews
4,Sat Dec 29 09:52:11 +0000 2018,1078951805688705024,Sim,,"""#RDRJ RT OperacoesRio: LINHA VERMELHA - Senti...",TempoRealNews


In [72]:
def remove_urls(text):
    return re.sub(r'https?:\/\/.*[\r\n]*', '', text)

def to_lower(text):
    return text.lower()

In [73]:
full_data['texto_formatado'] = full_data['texto'].apply(lambda t: remove_urls(str(t)))
full_data['texto_formatado'] = full_data['texto_formatado'].apply(lambda t: to_lower(t))

In [74]:
tokenizer = TreebankWordTokenizer()
full_data['tokens'] = full_data['texto_formatado'].apply(lambda t: tokenizer.tokenize(t))
full_data[['texto_formatado','tokens']].head()

Unnamed: 0,texto_formatado,tokens
0,"""""""cidadão de bem"""""" confessou ter matado duas...","[``, ``, '', cidadão, de, bem, '', '', '', con..."
1,"""#info #rdrj temporealnews rt bandnewsfmrio: p...","[``, #, info, #, rdrj, temporealnews, rt, band..."
2,"""#rdrj rt operacoesrio: av. brasil | bonsucess...","[``, #, rdrj, rt, operacoesrio, :, av., brasil..."
3,"""#rdrj rt operacoesrio: caminho para rodoviári...","[``, #, rdrj, rt, operacoesrio, :, caminho, pa..."
4,"""#rdrj rt operacoesrio: linha vermelha - senti...","[``, #, rdrj, rt, operacoesrio, :, linha, verm..."


In [75]:
def create_stopword_list():
    portuguese_stops = set(stopwords.words('portuguese'))
    portuguese_stops.add('rt')

    with open('punctuation.txt','r+') as punct_file:
        puncts = punct_file.readlines()

    for item in puncts:    
        portuguese_stops.add(item.strip())
        
    return portuguese_stops

stop_w = create_stopword_list()

#aplica a remocao de stop-words
full_data['words'] = full_data['tokens'].apply(lambda w: [word for word in w if word not in stop_w]) 
#exibe resultado intermediario
full_data[['tokens','words']].head(10)

Unnamed: 0,tokens,words
0,"[``, ``, '', cidadão, de, bem, '', '', '', con...","[cidadão, bem, confessou, ter, matado, duas, t..."
1,"[``, #, info, #, rdrj, temporealnews, rt, band...","[info, rdrj, temporealnews, bandnewsfmrio, cau..."
2,"[``, #, rdrj, rt, operacoesrio, :, av., brasil...","[rdrj, operacoesrio, av., brasil, bonsucesso, ..."
3,"[``, #, rdrj, rt, operacoesrio, :, caminho, pa...","[rdrj, operacoesrio, caminho, rodoviária, trân..."
4,"[``, #, rdrj, rt, operacoesrio, :, linha, verm...","[rdrj, operacoesrio, linha, vermelha, sentido,..."
5,"[``, #, rdrj, rt, operacoesrio, :, tanque, |, ...","[rdrj, operacoesrio, tanque, rua, cândido, ben..."
6,"[``, erro, humano, '', na, origem, do, acident...","[erro, humano, origem, acidente, elétrico, 25...."
7,"[``, gado, na, pista, provoca, acidente, '', k...","[gado, pista, provoca, acidente, kkkkkkkkkkk]"
8,"[``, jovem, perde, a, vida, em, grave, acident...","[jovem, perde, vida, grave, acidente, avenida,..."
9,"[``, manu, ,, n, surta, ,, sofri, um, acidente...","[manu, n, surta, sofri, acidente, moto, quase,..."


In [76]:
stemer= SnowballStemmer(language='portuguese')
full_data['stem_words'] = full_data['words'].apply(lambda t: [stemer.stem(word) for word in t])
full_data[['words','stem_words']].head()

Unnamed: 0,words,stem_words
0,"[cidadão, bem, confessou, ter, matado, duas, t...","[cidadã, bem, confess, ter, mat, duas, técnic,..."
1,"[info, rdrj, temporealnews, bandnewsfmrio, cau...","[info, rdrj, temporealnews, bandnewsfmri, caus..."
2,"[rdrj, operacoesrio, av., brasil, bonsucesso, ...","[rdrj, operacoesri, av., brasil, bonsucess, ac..."
3,"[rdrj, operacoesrio, caminho, rodoviária, trân...","[rdrj, operacoesri, caminh, rodoviár, trânsit,..."
4,"[rdrj, operacoesrio, linha, vermelha, sentido,...","[rdrj, operacoesri, linh, vermelh, sent, centr..."


In [89]:
def encode_label(text):
    if text=='Sim':
        return 1
    elif text=='Não':
        return 0
    else:
        return -1

full_data['target'] = full_data['label'].apply(lambda label: encode_label(label))
full_data['target'].describe()

count    340.000000
mean       0.500000
std        0.500737
min        0.000000
25%        0.000000
50%        0.500000
75%        1.000000
max        1.000000
Name: target, dtype: float64

In [78]:
full_data['clean_text'] = full_data['stem_words'].apply(lambda t: str(' '.join(t)))

count_vect = CountVectorizer()
X_TF = count_vect.fit_transform(full_data['clean_text'])
print(X_TF.shape)

(340, 2050)


In [79]:
print(X_TF.toarray()) 

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [80]:
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_TF)
X.shape

(340, 2050)

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, full_data['target'], test_size=0.33, random_state=42)

In [82]:
nb = MultinomialNB().fit(X_train, y_train)
svm = svm.SVC().fit(X_train,y_train)
tree = tree.DecisionTreeClassifier().fit(X_train,y_train)
logr = LogisticRegression().fit(X_train,y_train)

In [95]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
def print_metrics(y_true,y_predicted):
    acc = accuracy_score(y_test.values,y_predicted)
    print('Acurácia: ' + str(acc))
    #accuracy_score(y_true, y_predicted)
    print('Matriz de Confusão:' )
    print(confusion_matrix(y_true, y_predicted, labels=[0, 1]))
    print('Recall')
    print(recall_score(y_true, y_predicted, average='macro')  )
    print('Precision')
    print(precision_score(y_true, y_predicted, average='macro')  )

In [96]:
print('Naive Bayes')
print_metrics(y_test,nb.predict(X_test))
print('\n')
print('Decision Tree')
print_metrics(y_test,tree.predict(X_test))
print('\n')
print('Suport Vector Machines')
print_metrics(y_test,svm.predict(X_test))
print('\n')
print('Logistic Regressor')
print_metrics(y_test,logr.predict(X_test))

Naive Bayes
Acurácia: 0.867256637168
Matriz de Confusão:
[[41 12]
 [ 3 57]]
Recall
0.86179245283
Precision
0.87895256917


Decision Tree
Acurácia: 0.823008849558
Matriz de Confusão:
[[45  8]
 [12 48]]
Recall
0.824528301887
Precision
0.823308270677


Suport Vector Machines
Acurácia: 0.469026548673
Matriz de Confusão:
[[53  0]
 [60  0]]
Recall
0.5
Precision
0.234513274336


Logistic Regressor
Acurácia: 0.814159292035
Matriz de Confusão:
[[50  3]
 [18 42]]
Recall
0.821698113208
Precision
0.83431372549


  'precision', 'predicted', average, warn_for)
