# Analyzing tweets

In [None]:
from collections import Counter

import pymongo

import nltk
from nltk.tokenize import TweetTokenizer

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# MongoDB connection data
# HOST and PORT are joined as '<host>:<port>' when the client is created,
# which is why PORT is kept as a string rather than an int.
HOST = '146.148.63.14'
PORT = '27017'
# Name of the database selected on the client.
DB_NAME = 'trending'

### Connecting to database

In [None]:
# Open the client and select the working database.
# MongoClient connects lazily: building it succeeds even when the server is
# unreachable, so a cheap 'ping' command is issued to force a round trip and
# surface the failure here instead of on the first query in a later cell.
# `db` stays None when the server cannot be reached.
db = None
try:
    db_client = pymongo.MongoClient(HOST + ':' + PORT)
    db_client.admin.command('ping')
except pymongo.errors.ConnectionFailure:
    print('Connection cannot be established')
else:
    db = db_client[DB_NAME]
    print('Connection established successfully!')

In [None]:
# Handle to the 'history' collection that every later query runs against.
# NOTE(review): `db` is still None if the connection cell reported a failure,
# in which case this subscript raises TypeError.
HISTORY_COL = db['history']

### Visualizing trends (hashtags and mentions)

In [None]:
# Running token frequencies, filled by the tweet loop further down:
# one counter for '#'-prefixed tokens and one for '@'-prefixed tokens.
hashtags_counter = Counter()
mentions_counter = Counter()

In [None]:
# Cursor over every document in the collection, projecting only 'full_text'
# (the '_id' field is explicitly excluded).
tweets = HISTORY_COL.find({}, {'_id': 0, 'full_text': 1})

In [None]:
# Number of documents matched by the unfiltered query above.
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# Collection.count_documents() with the same (empty) filter replaces it.
HISTORY_COL.count_documents({})

In [None]:
# Tweet-aware tokenizer configured to not preserve case and to reduce
# lengthened words; see the NLTK source for the exact rules:
# https://www.nltk.org/_modules/nltk/tokenize/casual.html
tokenizer = TweetTokenizer(preserve_case = False, reduce_len = True)

In [None]:
# Tally hashtags and mentions across all tweets.
# Each text is tokenized once and the token list is reused for both counters.
for tweet in tweets:
    text = tweet.get('full_text')
    if text is None:
        # Documents without a text body contribute nothing to either counter.
        continue

    # A lone '#' or '@' is not a hashtag/mention, hence the length filter.
    tokens = [t for t in tokenizer.tokenize(text) if len(t) > 1]
    hashtags_counter.update(t for t in tokens if t.startswith('#'))
    mentions_counter.update(t for t in tokens if t.startswith('@'))

In [None]:
# Ten most frequent hashtags as a two-column table, highest count first.
# Counter.most_common(10) picks the top entries directly, so the full
# vocabulary is never sorted, and an empty counter simply yields an empty
# frame that still has both columns.
hashtags_df = pd.DataFrame(
    hashtags_counter.most_common(10),
    columns=['Hashtag', 'Frecuency'],
)

In [None]:
# Display the table of the ten most frequent hashtags.
hashtags_df

In [None]:
# Ten most frequent mentions as a two-column table, highest count first.
# Counter.most_common(10) picks the top entries directly, so the full
# vocabulary is never sorted, and an empty counter simply yields an empty
# frame that still has both columns.
mentions_df = pd.DataFrame(
    mentions_counter.most_common(10),
    columns=['Mention', 'Frecuency'],
)

In [None]:
# Display the table of the ten most frequent mentions.
mentions_df

In [None]:
# Horizontal bar chart of the hashtag counts.
ax = hashtags_df.plot(kind='barh', figsize=(20, 8))
# The default tick labels come from the DataFrame index; replace them with
# the hashtag text so each bar is identifiable.
ax.set_yticklabels(hashtags_df['Hashtag'])
plt.title('Top hashtags by frequency')
plt.show()

In [None]:
# Horizontal bar chart of the mention counts.
ax = mentions_df.plot(kind='barh', figsize=(20, 8))
# The default tick labels come from the DataFrame index; replace them with
# the mention text so each bar is identifiable.
ax.set_yticklabels(mentions_df['Mention'])
plt.title('Top mentions by frequency')
plt.show()

### Creating and using text index

In [None]:
# Build a text index on 'full_text' with Spanish as the default language;
# the `$text` search in the next cell relies on this index.
HISTORY_COL.create_index([('full_text', pymongo.TEXT)], default_language = 'spanish')

In [None]:
# Full-text search for the given terms, best matches first.
# The same metadata expression is used both to expose the relevance score
# as a 'score' field and to order the results by it.
text_score = {'$meta': 'textScore'}
search_filter = {'$text': {'$search': 'petro atentado'}}
tweets = HISTORY_COL.find(search_filter, {'score': text_score})
tweets = tweets.sort([('score', text_score)])

In [None]:
# Number of documents matching the text search.
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0, so the
# same `$text` filter is passed to Collection.count_documents() instead.
HISTORY_COL.count_documents({'$text': {'$search': 'petro atentado'}})

In [None]:
# Print the text and relevance score of the first five search results,
# leaving blank lines between entries.
top_matches = tweets[:5]
for match in top_matches:
    print('Text:', match['full_text'])
    print('Score:', match['score'])
    print('\n')

### Analyzing time

In [None]:
# Rebind `tweets` to a cursor over every document, projecting 'created_at'.
# '_id' is not excluded in this projection.
tweets = HISTORY_COL.find({}, {'created_at': True})

In [None]:
# Collect the creation timestamp of every tweet that has one.
# Documents lacking 'created_at' are skipped with an explicit membership
# test rather than a bare `except`, which would also hide unrelated errors
# (including KeyboardInterrupt).
creation_dates = [
    tweet['created_at'] for tweet in tweets if 'created_at' in tweet
]

In [None]:
# Histogram of the collected creation timestamps, split into 100 bins.
# The figure is created first so that the histogram is drawn on it.
plt.figure(figsize = (20, 8))
plt.hist(creation_dates, bins = 100)
plt.show()