In [1]:
import numpy as np
import pandas as pd
import csv
from pymongo import MongoClient
from configparser import ConfigParser

In [2]:
# Connect with pymongo's defaults (no arguments are passed to MongoClient)
# and select the "tweets" database; later cells read one collection per
# query term from it via db[term].
mongoClient = MongoClient()
db = mongoClient["tweets"]

In [3]:
# Load the project configuration and read the comma-separated filter terms
# from the [FILTER] section; each term is later used as a collection name
# and as a CSV file name.
# NOTE(review): terms are not stripped, so "a, b" would yield " b" -- confirm
# the config file contains no spaces around the commas.
parser = ConfigParser()
parser.read('../config.ini')
raw_terms = parser.get('FILTER', 'filter_terms')
query_terms = raw_terms.split(',')

In [4]:
# Header row for every CSV written below; the order must match the order in
# which extract_tweet_data() builds a row.
cols = [
    'text',
    'timestamp',
    'user',
    'tweet_source',
    'tweet_id',
    'user_location',
    'source_device',
]

## Create Data

In [5]:
from bs4 import BeautifulSoup

In [6]:
def get_full_text(status):
    """Return the untruncated text of a tweet.

    For a retweet (a status carrying a "retweeted_status" entry) the text is
    taken from the embedded original tweet. In both cases the
    "extended_tweet" -> "full_text" value is preferred and the plain "text"
    field is used when that path is missing.
    """
    # Decide once which tweet object carries the text we want.
    tweet = status["retweeted_status"] if "retweeted_status" in status else status
    try:
        return tweet["extended_tweet"]["full_text"]
    except KeyError:
        # No extended payload on this tweet: fall back to the short text.
        return tweet["text"]

def get_tweet_source(status):
    """Return the id string of the tweet the content originates from.

    For a retweet this is the id of the embedded "retweeted_status";
    otherwise it is the status's own "id_str".
    """
    origin = status["retweeted_status"] if "retweeted_status" in status else status
    return origin['id_str']

def extract_source_device(html_data):
    """Return the text content of an HTML snippet with all markup removed.

    Used on a status's 'source' field to keep only its visible text.
    """
    return BeautifulSoup(html_data, 'html.parser').get_text()

def extract_tweet_data(status):
    """Flatten one tweet status into a CSV row.

    The returned list follows the column order of `cols`:
    text, timestamp, user, tweet_source, tweet_id, user_location,
    source_device. Every field is a string except the timestamp, which is
    'timestamp_ms' divided by 1000 and truncated to an int.
    """
    return [
        str(get_full_text(status)),
        int(int(status['timestamp_ms']) / 1000),  # milliseconds -> whole seconds
        str(status['user']['screen_name']),
        str(get_tweet_source(status)),
        str(status['id_str']),
        # Free-text profile location; only used later to guess a country.
        str(status['user']['location']),
        str(extract_source_device(status['source'])),
    ]

### Create Sample Data CSV

**Sampling method**
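- Take a random 1% sample of the tweets collected for each company
- Write the extracted fields of the sampled tweets to a CSV file (text cleaning happens later, in the Cleaning section)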

In [None]:
# Write a ~1% random sample of every term's collection to
# '../data/<term>_sample.csv'.
for term in query_terms:
    collection = db[term]
    # Estimated size of the collection; used both as the population size and
    # to derive the 1% sample size.
    count = collection.estimated_document_count()
    sample_size = count // 100

    # replace=False draws distinct positions, so no tweet can appear twice in
    # the sample (the default draws with replacement).
    if sample_size:
        chosen = set(np.random.choice(count, sample_size, replace=False).tolist())
    else:
        chosen = set()

    # Walk the cursor once and keep the chosen positions, instead of issuing
    # one skip query per sampled index. Positions past the real end of the
    # collection (the count is an estimate) simply never match.
    # NOTE(review): unlike the full export, the sample is not restricted to
    # lang == 'en' -- confirm this is intended.
    sample_tweets = [
        tweet
        for position, tweet in enumerate(collection.find())
        if position in chosen
    ]
    print(term, 'samples:', len(sample_tweets))

    # newline='' is required by the csv module so that the writer controls
    # line endings and newlines embedded in tweet text stay correctly quoted.
    with open('../data/' + term + '_sample.csv', 'w', encoding='utf-8', newline='') as file:
        csvwriter = csv.writer(file)
        csvwriter.writerow(cols)
        for tweet in sample_tweets:
            csvwriter.writerow(extract_tweet_data(tweet))
        print(term, 'sample file created')

google samples: 4480
google sample file created
tesla samples: 777
tesla sample file created
apple samples: 4516
apple sample file created
spacex samples: 94
spacex sample file created


## Create Full Data csv

In [None]:
# Export every English tweet of each term's collection to '../data/<term>.csv'.
for term in query_terms:
    # newline='' is required by the csv module so that the writer controls
    # line endings and newlines embedded in tweet text stay correctly quoted.
    with open('../data/' + term + '.csv', 'w', encoding='utf-8', newline='') as file:
        csvwriter = csv.writer(file)
        csvwriter.writerow(cols)
        # Filter on the server: documents whose 'lang' is not 'en' -- or that
        # have no 'lang' field at all -- are never returned, so a document
        # without the field can no longer raise KeyError here.
        for tweet in db[term].find({'lang': 'en'}):
            csvwriter.writerow(extract_tweet_data(tweet))
        print(term, 'file created')

## Cleaning
- Remove
    - All URLs
    - The '#' in front of hashtags
    - Mentions
    - Emojis
    - Non-ASCII characters
- Extract the country from 'user_location'

In [None]:
import re
import preprocessor as p
from datetime import datetime
from geotext import GeoText

# Options handed to tweet-preprocessor for p.clean(). HASHTAG is deliberately
# not listed, so hashtag words are kept (only the '#' sign is removed later by
# PUNCTUATIONS_PATTERN).
p.set_options(
    p.OPT.URL,
    p.OPT.EMOJI,
    p.OPT.NUMBER,
    p.OPT.SMILEY,
    p.OPT.RESERVED,
    p.OPT.MENTION,
)

# Any character outside the Basic Latin range U+0000-U+007F.
NOT_BASIC_LATIN_PATTERN = re.compile(u'[^\u0000-\u007F]')
# Symbol characters to blank out. Note that . , ! ? ' and " are not in the set.
PUNCTUATIONS_PATTERN = re.compile(r'[\#\$\%\&\(\)\*\+\-\/\:\;\<\=\>\@\[\\\]\^\_\`\{\|\}\~]')
# A run of one or more space characters.
MULTIPLE_SPACES_PATTERN = re.compile(r' +')

def clean_text(text):
    """Normalise one tweet text for analysis.

    Steps, in order:
      1. tweet-preprocessor removes whatever was enabled via p.set_options
         (hashtags are not among the enabled options).
      2. The text is lower-cased.
      3. Characters outside Basic Latin are replaced by a space.
      4. The HTML entity '&amp;' becomes the word 'and'.
      5. The symbols in PUNCTUATIONS_PATTERN are replaced by a space.
      6. Runs of spaces are collapsed and the result is stripped.

    Parameters
    ----------
    text : any
        The raw tweet text; it is converted with str() first, so non-string
        values (for example NaN from pandas) are cleaned as their string form.

    Returns
    -------
    str
        The cleaned, lower-case text.
    """
    # Run tweet-preprocessor on the original casing and lower-case afterwards.
    # NOTE(review): the library's reserved-word rule appears to match the
    # upper-case tokens RT/FAV only, so lower-casing first would keep it from
    # ever matching -- confirm against the installed tweet-preprocessor version.
    text = p.clean(str(text))
    text = text.lower()
    text = NOT_BASIC_LATIN_PATTERN.sub(' ', text)  # Keep basic latin only
    # Pad with spaces so neighbouring words are not fused together
    # ("r&amp;d" -> "r and d" rather than "randd"); extra spaces are
    # collapsed below.
    text = text.replace('&amp;', ' and ')
    text = PUNCTUATIONS_PATTERN.sub(' ', text)  # Blank out the listed symbols
    text = MULTIPLE_SPACES_PATTERN.sub(' ', text)  # Collapse consecutive spaces
    return text.strip()

def get_country(data):
    """Return the first country key GeoText reports for `data`.

    Returns an empty string when GeoText finds no country mention.
    """
    mentions = GeoText(data).country_mentions
    # First key of the mapping, or '' when the mapping is empty.
    return next(iter(mentions.keys()), '')

### Sample Data Cleaning

In [None]:
%%time
# Clean every '<term>_sample.csv' in place: add 'clean_text' and 'country',
# drop 'user_location'.
for term in query_terms:
    file_addr = '../data/' + term + '_sample' + '.csv'
    tweets = pd.read_csv(file_addr)
    # The file is overwritten in place below. Without this guard a second run
    # of the cell would fail with a KeyError, because 'user_location' has
    # already been dropped from the stored file.
    if 'clean_text' in tweets.columns:
        print(term, 'file already cleaned')
        continue
    # Clean the tweet text
    tweets['clean_text'] = tweets['text'].apply(clean_text)
    # Extract 'country' from 'user_location' (missing locations become '')
    tweets['country'] = tweets['user_location'].fillna('').apply(get_country)
    tweets = tweets.drop(['user_location'], axis=1)
    # NOTE(review): the DataFrame index is written as an extra unnamed first
    # column; kept as-is because later readers of this file may rely on the
    # layout -- consider index=False once that is confirmed.
    tweets.to_csv(file_addr)
    print(term, 'file cleaned')

### Complete Data Cleaning

In [None]:
%%time
# Clean every '<term>.csv' in place: add 'clean_text' and 'country',
# drop 'user_location'.
for term in query_terms:
    file_addr = '../data/' + term + '.csv'
    tweets = pd.read_csv(file_addr)
    # The file is overwritten in place below. Without this guard a second run
    # of the cell would fail with a KeyError, because 'user_location' has
    # already been dropped from the stored file.
    if 'clean_text' in tweets.columns:
        print(term, 'file already cleaned')
        continue
    # Clean the tweet text
    tweets['clean_text'] = tweets['text'].apply(clean_text)
    # Extract 'country' from 'user_location' (missing locations become '')
    tweets['country'] = tweets['user_location'].fillna('').apply(get_country)
    tweets = tweets.drop(['user_location'], axis=1)
    # NOTE(review): the DataFrame index is written as an extra unnamed first
    # column; kept as-is because later readers of this file may rely on the
    # layout -- consider index=False once that is confirmed.
    tweets.to_csv(file_addr)
    print(term, 'file cleaned')