In [8]:
import numpy as np
import pandas as pd
from datetime import datetime
import csv
from pymongo import MongoClient
from configparser import ConfigParser

In [9]:
# Create a client with PyMongo's default connection settings and select the
# "tweets" database; the cells below read one collection per query term from it.
mongoClient = MongoClient()
db = mongoClient["tweets"]

In [10]:
# Load the comma-separated filter terms from the shared config file. Each term
# is used below both as a MongoDB collection name and as an output file stem.
parser = ConfigParser()
parser.read('../config.ini')
# Strip surrounding whitespace and skip empty entries so that a value written
# as "google, tesla," does not produce terms such as ' tesla' or ''.
query_terms = [
    term.strip()
    for term in parser.get('FILTER', 'filter_terms').split(',')
    if term.strip()
]

### Create Sample

#### Sampling method

- Take a random 1% sample of the tweets stored for each company
- Write the cleaned fields of each sampled tweet to a CSV file

In [11]:
from bs4 import BeautifulSoup

In [12]:
def get_full_text(status):
    """Return the untruncated text of a tweet.

    When ``status`` contains a ``retweeted_status`` key, the text is read from
    that embedded tweet; otherwise it is read from ``status`` itself. In both
    cases ``extended_tweet["full_text"]`` is preferred and the plain ``text``
    field is the fallback.
    """
    # Choose which dict holds the text: the embedded original for a retweet,
    # the status itself otherwise.
    payload = status["retweeted_status"] if "retweeted_status" in status else status
    try:
        return payload["extended_tweet"]["full_text"]
    except KeyError:
        # No extended payload on this tweet: use the regular text field.
        return payload["text"]

def get_tweet_source(status):
    """Return the ``id_str`` of the tweet the content comes from.

    For a status that contains ``retweeted_status`` this is the id of that
    embedded tweet; for any other status it is the status's own id.
    """
    origin = status["retweeted_status"] if "retweeted_status" in status else status
    return origin['id_str']

def extract_source_device(html_data):
    """Strip the markup from an HTML snippet and return its text content.

    ``html_data`` is parsed with BeautifulSoup's built-in ``html.parser``;
    only the text of the parsed document is returned.
    """
    return BeautifulSoup(html_data, 'html.parser').get_text()

def extract_tweet_data(status):
    """Flatten one tweet status into a list suitable for a CSV row.

    The returned list holds, in this order:
      0. full tweet text (see ``get_full_text``)
      1. ``timestamp_ms`` converted to seconds (a float)
      2. the author's screen name
      3. id of the originating tweet (see ``get_tweet_source``)
      4. id of this status
      5. the author's profile location
      6. the ``source`` field with its HTML markup removed
    """
    return [
        get_full_text(status),
        int(status['timestamp_ms']) / 1000,
        status['user']['screen_name'],
        get_tweet_source(status),
        status['id_str'],
        status['user']['location'],  # noted by the author as not useful
        extract_source_device(status['source']),
    ]

In [36]:
# For every query term, draw a random 1% sample of its collection and write the
# flattened tweets to ../data/<term>_sample.csv (one row per tweet).
for term in query_terms:
    count = db[term].estimated_document_count()
    tweets = db[term].find()

    # replace=False: np.random.choice samples WITH replacement by default,
    # which would let the same tweet appear in the sample more than once.
    sample_positions = {
        int(index)
        for index in np.random.choice(count, count // 100, replace=False)
    }

    # Walk the cursor once and keep the documents at the chosen positions.
    # Indexing the cursor (tweets[i]) issues a separate skip query per sample,
    # which is far slower on large collections. Rows therefore come out in
    # cursor order rather than in random-draw order.
    sample_tweets = []
    for position, tweet in enumerate(tweets):
        if position in sample_positions:
            sample_tweets.append(tweet)
            if len(sample_tweets) == len(sample_positions):
                break  # every chosen position has been collected
    print(term, 'samples:', len(sample_tweets))

    # newline='' is what the csv module expects for its file objects; without
    # it, newlines embedded in tweet text and platform line endings can
    # corrupt the row structure.
    with open('../data/' + term + '_sample.csv', 'w', encoding='utf-8', newline='') as file:
        csvwriter = csv.writer(file)
        for tweet in sample_tweets:
            csvwriter.writerow(extract_tweet_data(tweet))
    print(term, 'sample file created')

google samples: 4480
google sample file created
tesla samples: 777
tesla sample file created
apple samples: 4516
apple sample file created
spacex samples: 94
spacex sample file created
amazon samples: 3241
amazon sample file created
microsoft samples: 442
microsoft sample file created
facebook samples: 3395
facebook sample file created


### Create Full Data csv

In [None]:
# Export every stored tweet of each query term to ../data/<term>.csv,
# one flattened row per tweet.
for term in query_terms:
    # newline='' is what the csv module expects for its file objects; without
    # it, newlines embedded in tweet text and platform line endings can
    # corrupt the row structure.
    with open('../data/' + term + '.csv', 'w', encoding='utf-8', newline='') as file:
        csvwriter = csv.writer(file)
        # Stream straight from the cursor so the whole collection is never
        # held in memory at once.
        for tweet in db[term].find():
            csvwriter.writerow(extract_tweet_data(tweet))
    print(term, 'file created')