In [1]:
import json
import os
import re
import sys
import time
from collections import Counter

import MySQLdb
import nltk
import numpy as np
import pandas as pd

In [2]:
# Connection settings are read from the environment so credentials need not live in
# the notebook; the fallbacks reproduce the original local development setup.
#   TWEETS_DB_HOST, TWEETS_DB_PORT, TWEETS_DB_USER, TWEETS_DB_PASSWORD, TWEETS_DB_NAME
db = MySQLdb.connect(
    host=os.environ.get("TWEETS_DB_HOST", "127.0.0.1"),
    port=int(os.environ.get("TWEETS_DB_PORT", "3309")),
    user=os.environ.get("TWEETS_DB_USER", "root"),
    passwd=os.environ.get("TWEETS_DB_PASSWORD", "root"),
    db=os.environ.get("TWEETS_DB_NAME", "tweets"),
)
# Single shared cursor used by get_user_tweets_text().
cursor = db.cursor()

In [3]:
def get_user_tweets_text(user_id, data_type, num=200):
    """Return the texts of up to ``num`` stored tweets of one user.

    The user's ``timeline`` column is read from table ``tweets_<data_type>`` and
    decoded as JSON; each decoded item must provide a ``'text'`` key.

    Parameters
    ----------
    user_id : value bound to the ``user_id`` column in the WHERE clause.
    data_type : table-name suffix (e.g. ``'train'``). It is interpolated into the
        SQL text because table names cannot be bound as parameters, so it must
        come from trusted code, never from user input.
    num : maximum number of texts returned (applied after filtering).

    Returns
    -------
    list of str
        Tweet texts that do not contain the substring ``'RT'``. An empty list is
        returned when the user has no row or the stored timeline is NULL, so a
        single missing user does not abort a long batch run.
    """
    cmd = """
    SELECT timeline
    FROM {}
    WHERE user_id = %s;
    """.format('tweets_%s' % (data_type))
    cursor.execute(cmd, (user_id,))

    row = cursor.fetchone()
    if row is None or row[0] is None:
        # No timeline stored for this user.
        return []

    timeline = json.loads(row[0])
    # Case-sensitive substring test: drops retweets, but also any tweet that merely
    # contains the letters "RT" (e.g. inside an upper-case word).
    return [msg['text'] for msg in timeline if 'RT' not in msg['text']][:num]

In [5]:
def get_words(text):
    """Split a tweet into words, dropping links and purely numeric words.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Non-empty runs of word characters and ``#`` (so hashtags keep their
        leading ``#``), in order of appearance, excluding numeric-only words.
    """
    # A link ends at ANY whitespace; stopping only at a space would also swallow
    # the words that follow a link terminated by a newline or tab.
    text_nolinks = re.sub(r'http\S+', '', text)
    words = re.split(r'[^\w#]', text_nolinks)
    return [word for word in words if len(word) > 0 and not word.isnumeric()]

In [12]:
# Shared NLTK helpers used by get_tokens(); created once at module level.
# NOTE(review): these presumably require the NLTK 'wordnet' and 'stopwords'
# corpora to be downloaded beforehand — confirm on a fresh environment.
wnl = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.SnowballStemmer('english')
# English stop-word list; get_tokens() only tests membership against it.
stopwords = nltk.corpus.stopwords.words('english')

In [13]:
def get_tokens(words):
    """Normalise words into tokens and drop stop words.

    Each word is lowercased, stemmed with the module-level ``stemmer`` and then
    passed through the module-level lemmatizer ``wnl``.

    Parameters
    ----------
    words : iterable of str

    Returns
    -------
    list of str
        Normalised tokens in input order, stop words removed.
    """
    tokens = []
    for word in words:
        lowered = word.lower()
        # The stop-word list holds surface forms, so test BEFORE stemming:
        # stems such as "becaus" (because) or "veri" (very) are not in the list
        # and would otherwise leak into the output.
        if lowered in stopwords:
            continue
        tmp = stemmer.stem(lowered)
        tmp = wnl.lemmatize(tmp)
        # Keep the original post-normalisation check as well, for forms that
        # only become a stop word after stemming/lemmatizing.
        if tmp not in stopwords:
            tokens.append(tmp)

    return tokens

In [14]:
def get_tweet_tokens(tweet):
    """Tokenise one tweet: split its text into words, then normalise them to tokens."""
    words = get_words(tweet)
    return get_tokens(words)

In [15]:
def collect_users_tokens(df_users, data_type, tweets_num=200):
    """Collect per-user token frequencies.

    Parameters
    ----------
    df_users : DataFrame with a ``'twitter_id'`` column.
    data_type : table-name suffix forwarded to ``get_user_tweets_text``.
    tweets_num : maximum number of tweets read per user.

    Returns
    -------
    (users, user_dicts)
        ``users`` is the array of ids taken from ``df_users['twitter_id']``;
        ``user_dicts`` is a list aligned with it, one ``{token: count}`` dict per
        user (empty when the user has no tokens). Keys are in sorted order.
    """
    users = df_users['twitter_id'].values
    total = users.size
    user_dicts = []
    for i, user in enumerate(users):
        # In-place progress line: percentage done and the id being processed.
        sys.stdout.write("\rProgress: %.2f%% %d          " % (100.0 * (i + 1) / total, user))
        sys.stdout.flush()

        # Count with the stdlib Counter instead of round-tripping every token
        # through a fixed-width NumPy string array; keys and values stay plain
        # Python str/int.
        counts = Counter()
        for tweet in get_user_tweets_text(user, data_type, num=tweets_num):
            counts.update(get_tweet_tokens(tweet))

        # Sorted to keep the same key order the previous np.unique-based code produced.
        user_dicts.append(dict(sorted(counts.items())))

    return users, user_dicts

In [None]:
# CSV files listing the users of each split (paths relative to the working directory).
TRAINING_SET_URL = "../twitter_train.csv"
TESTING_SET_URL = "../twitter_test.csv"
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (TRAINING_SET_URL, TESTING_SET_URL)
)

# Collect token counts for both splits and report the elapsed wall-clock seconds.
_time = time.time()
users_tr, users_tr_tokens = collect_users_tokens(
    df_users=df_train, data_type='train', tweets_num=300
)
users_te, users_te_tokens = collect_users_tokens(
    df_users=df_test, data_type='test', tweets_num=300
)
print(time.time() - _time)

Progress: 0.83% 357859404          

In [None]:
# Close the MySQL connection before writing the results to disk.
db.close()
# Persist the user ids and the per-user token-count dicts for each split.
# NOTE(review): np.savez appends ".npz" when the file name lacks it, so these
# presumably end up as "train.dat.npz" / "test.dat.npz"; users_tokens is a list
# of dicts, so it is likely stored as a pickled object array (loading would then
# need allow_pickle=True) — confirm when reading the files back.
np.savez("train.dat", users=users_tr, users_tokens=users_tr_tokens)
np.savez("test.dat", users=users_te, users_tokens=users_te_tokens)

In [None]:
# from sklearn.feature_extraction import DictVectorizer
# v = DictVectorizer()
# vs = v.fit_transform(users_tokens)