In [None]:
from collections import Counter

import pymongo

import nltk
from nltk.tokenize import TweetTokenizer

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Connection settings for the local MongoDB instance.
# PORT is deliberately a string: it is concatenated with HOST to form the
# "host:port" address handed to the client.
HOST, PORT = 'localhost', '27017'
DB_NAME = 'trending'

### Connecting to database

In [None]:
# Open the MongoDB connection and select the working database.
#
# MongoClient connects lazily: constructing it does not contact the server,
# so on its own it never raises ConnectionFailure. A cheap `ping` command
# forces a real round trip, which makes the except branch below reachable
# and keeps the success message honest.
db = None
try:
    db_client = pymongo.MongoClient(HOST + ':' + PORT)
    db_client.admin.command('ping')
    db = db_client[DB_NAME]
    print('Conecction established successfully!')
except pymongo.errors.ConnectionFailure:
    # `db` is left as None when the server cannot be reached.
    print('Connection cannot be established')

In [None]:
# Handle to the 'history' collection, used by the queries that follow.
# NOTE: `db` is None when the connection attempt failed, in which case this
# subscript raises TypeError.
HISTORY_COL = db['history']

### Analyzing trends (hashtags)

In [None]:
# Running tally of hashtag -> number of occurrences.
# NOTE: the counting loop only adds to this object, so re-run this cell
# before re-running the loop to avoid double counting.
counter = Counter()

In [None]:
# Cursor over every document in the 'history' collection (no filter).
# NOTE: a PyMongo cursor can only be iterated once; re-run this cell before
# looping over `tweets` again.
tweets = HISTORY_COL.find()

In [None]:
# Total number of documents in the collection.
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0, so the
# count is requested from the collection instead; the empty filter matches
# the unfiltered find() used to build `tweets`.
HISTORY_COL.count_documents({})

In [None]:
# Tweet-aware tokenizer configured to lower-case tokens (preserve_case=False)
# and to shorten exaggerated repeated-character runs (reduce_len=True).
tokenizer = TweetTokenizer(
    preserve_case=False,
    reduce_len=True,
)

In [None]:
# Tally every hashtag found in the fetched tweets into `counter`.
for tweet in tweets:
    try:
        text = tweet['full_text']
    except KeyError:
        # Documents without a 'full_text' field are skipped.
        continue
    # Keep tokens that start with '#' and carry at least one more character,
    # which discards a bare '#'. The loop variable and the filter must use
    # the same name; the previous `t` / `term` mismatch raised NameError.
    terms_hash = [
        term
        for term in tokenizer.tokenize(text)
        if term.startswith('#') and len(term) > 1
    ]
    counter.update(terms_hash)

In [None]:
# Table of the five most frequent hashtags, most frequent first.
# The frame is built from (hashtag, count) pairs with explicit column names,
# so an empty `counter` yields an empty two-column frame instead of failing
# on a column-count mismatch.
trends_df = (
    pd.DataFrame(list(counter.items()), columns=['Hashtag', 'Frecuencia'])
    .sort_values(by='Frecuencia', ascending=False)
    .head(5)
)

In [None]:
# Display the top-hashtags table as the cell output.
trends_df

In [None]:
# Horizontal bar chart of the top hashtags.
# Binding x/y to the columns lets pandas label each bar from its own row,
# instead of overwriting the tick labels positionally afterwards.
ax = trends_df.plot(kind='barh', x='Hashtag', y='Frecuencia', figsize=(20, 8))
# Trailing semicolon keeps the cell from echoing the return value of set().
ax.set(xlabel='Frecuencia', ylabel='Hashtag');

### Creating and using text index

In [None]:
# Create a text index on 'full_text' with Spanish as the index's default
# language; the `$text` query issued afterwards depends on it.
HISTORY_COL.create_index(
    [('full_text', pymongo.TEXT)],
    default_language='spanish',
)

In [None]:
# Cursor over the tweets matched by a `$text` search for the term 'fraude'.
tweets = HISTORY_COL.find(
    {'$text': {'$search': 'fraude'}}
)

In [None]:
# Number of tweets matched by the text search for 'fraude'.
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0, so the
# count is requested from the collection with the same filter instead.
HISTORY_COL.count_documents({'$text': {'$search': 'fraude'}})

In [None]:
# Print each matched tweet on a single line by removing embedded newlines.
for tweet in tweets:
    one_line = tweet['full_text'].replace('\n', '')
    print(one_line)

### Analyzing time

In [None]:
# Cursor over every document (empty filter), with a projection that asks
# only for the 'created_at' field.
tweets = HISTORY_COL.find({}, {'created_at': True})

In [None]:
# Collect the 'created_at' value of every fetched tweet.
# Documents lacking the field are skipped explicitly; the previous bare
# `except:` also swallowed unrelated errors such as KeyboardInterrupt.
creation_dates = [
    tweet['created_at']
    for tweet in tweets
    if 'created_at' in tweet
]

In [None]:
# Histogram of the collected 'created_at' values.
plt.figure(figsize=(20, 8))
plt.hist(creation_dates)
# Label the figure so it can be read on its own.
plt.xlabel('created_at')
plt.ylabel('Number of tweets')
# Trailing semicolon keeps the cell from echoing the return value.
plt.title('Tweets by creation time');