In [100]:
import string
import pandas as pd
import numpy as np
from nltk import word_tokenize
from collections import Counter
import operator
from sklearn.preprocessing import MultiLabelBinarizer

### Read Data

In [101]:
# Load the tab-separated corpus. Later cells read its 'id', 'text' and
# 'sentiment' columns.
df = pd.read_csv('db.csv', sep='\t')

### Getting sample

In [102]:
# Draw the same number of rows (5% of the size of df) from each sentiment
# class. A fixed random_state makes the draw identical after a kernel restart,
# so every downstream result can be reproduced.
sample_size = int(0.05 * df.shape[0])
df_0 = df[df['sentiment'] == 0].sample(n=sample_size, random_state=42)
df_1 = df[df['sentiment'] == 1].sample(n=sample_size, random_state=42)

In [103]:
# Stack the two per-class samples into the working corpus.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# gives the same result and keeps each sample's original row index.
df = pd.concat([df_0, df_1])

### Generate Tokens

In [104]:
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    punctuation_chars = frozenset(string.punctuation)
    kept = [char for char in text if char not in punctuation_chars]
    return ''.join(kept)

In [105]:
# Per-tweet bag of words: lowercase the text, strip punctuation, tokenize with
# NLTK and count how many times each token occurs in the tweet.
df['tokens'] = df['text'].map(
    lambda text: Counter(word_tokenize(remove_punctuation(text.lower())))
)

### Inverted Index

In [106]:
# Inverted index: token -> list of ids of the tweets that contain it.
# A tweet adds its id at most once per token, because iteration runs over the
# (unique) keys of the tweet's Counter.
indice_invertido = {}

for doc_id, doc_tokens in zip(df['id'], df['tokens']):
    for token in doc_tokens:
        indice_invertido.setdefault(token, []).append(doc_id)

In [107]:
def idf(termo):
    '''
        Return the inverse document frequency (idf) of a term, based on how
        many documents of the corpus contain it.
        params:
            termo (str): target term whose idf is computed. It is lowercased
                and stripped of surrounding whitespace before the lookup.
        return:
            (float): the idf of the term, log(corpus size / number of
                documents containing the term).
        raises:
            KeyError: if the normalized term is not in the inverted index.
    '''
    corpus_size = df.shape[0]
    lookup_key = termo.lower().strip()
    docs_with_term = len(indice_invertido[lookup_key])
    return np.log(corpus_size / docs_with_term)

In [108]:
def tf(termo):
    """Return the length of the posting list stored for ``termo``.

    The inverted index holds one tweet id per tweet that contains the token,
    so this is the number of tweets containing ``termo`` (a corpus-wide
    count), not the number of occurrences inside one tweet. The term is
    looked up exactly as given; an unknown term raises KeyError.
    """
    posting_list = indice_invertido[termo]
    return len(posting_list)

### Calculating TF-IDF for each term in tweets

In [109]:
def get_tfidfs(tokens_list):
    """Compute the TF-IDF weight of every distinct token of one tweet.

    params:
        tokens_list: the tokens of a single tweet, either a dict/Counter
            mapping token -> number of occurrences in the tweet (as stored in
            df['tokens']) or a plain iterable of tokens, which is counted here.
    return:
        (dict): token -> (occurrences of the token in the tweet) * idf(token).
    """
    # The term frequency has to come from the tweet itself. tf(token) is the
    # length of the token's posting list, i.e. a corpus-wide document count
    # that is identical for every tweet, so it is not a valid TF factor.
    if isinstance(tokens_list, dict):
        counts = tokens_list
    else:
        counts = Counter(tokens_list)
    return {token: count * idf(token) for token, count in counts.items()}

In [110]:
def get_topics(tokens_list, n=35):
    """Return up to ``n`` tokens of a tweet, highest TF-IDF weight first.

    params:
        tokens_list: tokens of one tweet, passed unchanged to get_tfidfs.
        n (int): maximum number of tokens to return (default 35).
    return:
        (list): tokens ordered by decreasing weight; tokens with equal weight
            keep the order in which get_tfidfs produced them.
    """
    weights = get_tfidfs(tokens_list)
    # sorted() is stable, also with reverse=True, so ties stay in dict order.
    ranked_tokens = sorted(weights, key=weights.get, reverse=True)
    return ranked_tokens[:n]
    

#### Getting topics for each tweet

In [111]:
# For every tweet, keep its highest-weighted tokens as the tweet's topics.
df['topics'] = df['tokens'].map(get_topics)

In [112]:
# Columns used for training: tweet id, extracted topics and sentiment label.
df_to_train = df.loc[:, ['id', 'topics', 'sentiment']]

In [None]:
# One-hot encode the topics: join each tweet's topic list with '|' and let
# str.get_dummies split it back into one 0/1 column per distinct token.
# - The frame is built from df instead of redefining df_to_train from itself,
#   so re-running this cell gives the same result instead of failing.
# - This also avoids drop('topics', 1): the positional axis argument of
#   DataFrame.drop was deprecated in pandas 1.3 and removed in 2.0.
# - rsuffix renames only a token column that collides with 'id' or 'sentiment'
#   (e.g. the token "id"); without a suffix the join raises on overlapping names.
topic_dummies = df['topics'].str.join('|').str.get_dummies()
df_to_train = df[['id', 'sentiment']].join(topic_dummies, rsuffix='_topic')

### Classifier

In [None]:
# Model inputs: every column except the tweet identifier and the label.
features = df_to_train.loc[
    :, [name for name in df_to_train.columns if name not in ('id', 'sentiment')]
]

In [None]:
# Label kept as a one-column DataFrame (not a Series).
target = df_to_train.loc[:, ['sentiment']]

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier, constructed with no arguments (library defaults).
gnb = GaussianNB()

In [None]:
# Fit on the one-hot topic features. The label is passed as the 1-D 'sentiment'
# column: giving scikit-learn the one-column DataFrame `target` triggers a
# DataConversionWarning (a column-vector y where a 1-D array is expected).
model = gnb.fit(features, target['sentiment'])

In [None]:
# Display the object returned by gnb.fit.
model

### Prediction

In [None]:
# Number of tweets per true sentiment label.
Counter(target['sentiment'])

In [None]:
# Number of tweets per predicted label. The prediction runs on the same
# `features` the model was fitted on, so this describes the training data,
# not held-out performance.
Counter(model.predict(features))