In [1]:
# Load data

import sys
import datetime
import psycopg2 as ppg
import pandas as pd
import numpy as np

con = ppg.connect("dbname=tweets user=patrick")
print con

usa = (-125.6791025,25.4180700649,-66.885417,
                           49.3284551525)
la = ( -119.578941, 32.799580, -114.130814, 35.809120)
    
def in_place(pts, box, long_field='tweet_long', lat_field='tweet_lat'):
    return pd.Series((pts[long_field] > box[0]) & 
                     (pts[lat_field] > box[1]) &
                     (pts[long_field] < box[2]) &
                     (pts[lat_field] < box[3]))

with con.cursor() as cur:
    cur.execute("SELECT tw.tweet_id, tw.user_id, tw.lat as tweet_lat, tw.long as tweet_long, "
                "tw.created_at, user_locs.followers_count,user_locs.friends_count, "
                "user_locs.place as user_location, plcs.user_lat, "
                "plcs.user_long "
                "FROM tweets as tw, (SELECT user_id, place, friends_count, followers_count "
                "FROM users) as user_locs, "
                "(SELECT place, coded_lat as user_lat, coded_long as user_long FROM "
                "places) as plcs "
                "WHERE tw.user_id = user_locs.user_id and user_locs.place != 'None' "
                "and plcs.place = user_locs.place AND plcs.user_lat != -500 AND "
                "plcs.user_long != 500 AND (user_locs.place ~* '[A-Z\s]+,\s*[A-Z]+' OR "
                "user_locs.place ~* '(New York City|NYC|LA|Los Angeles|Chicago|DC|SF|San Francisco|"
                "Detroit|Houston|Dallas|Atlanta|Philly|Philadelphia|Phoenix|San Antonio| "
                "San Jose|Austin|Jacksonville|Indianapolis|Denver)') AND "
                "tw.text !~* '#insurancejobs|#jobs?|#hiring|#tweetmyjobs';")
    
    geotags = pd.DataFrame(cur.fetchall(), 
                       columns=[c[0] for c in cur.description])
    
geotags['in_us'] = in_place(geotags[['tweet_long','tweet_lat']], usa)
geotags['in_la'] = in_place(geotags[['tweet_long','tweet_lat']], la)

geotags= geotags.merge(geotags.groupby('user_id').apply(lambda x: pd.Series(x.shape[0], index=['num_tweets_user'])),
                    left_on='user_id', right_index=True, how='left')

# drop certain users
geotags = geotags[(geotags.followers_count < 2000) & 
                  (geotags.friends_count < 1800) &
                  (geotags.num_tweets_user < 200) &
                  (geotags.num_tweets_user >= 3) &
                  (geotags.friends_count / geotags.followers_count >= 0.25)]

# look at distance between user location and tweet location

geotags['tweet_distance'] = np.sqrt((geotags.user_lat - geotags.tweet_lat) ** 2 + 
                                    (geotags.user_long - geotags.tweet_long) ** 2)
geotags['on_road'] = geotags['tweet_distance'] > 0.5
geotags['time_of_day'] =  geotags.created_at.map(lambda x: x.hour)
geotags['day_of_week'] =  geotags.created_at.map(lambda x: x.weekday())


# get number of users w/ any on-road tweets, other per-user stats
users_on_road=geotags.groupby('user_id').apply(lambda x: pd.Series([x.on_road.any(), x.on_road.sum()],
         index=['any_on_road','num_on_road']))
geotags = geotags.merge(users_on_road, left_on='user_id', right_index=True)

# get locations for tweets in LA
import re
# get number of distinct users
print "Number of users: ", geotags[geotags.in_la].user_id.drop_duplicates().shape[0]
# their locations
#print geotags[geotags.in_la][['user_id','user_location']].drop_duplicates().user_location
def normalizer(txt):
    txt = txt.strip()
    txt = txt.lower()
    
    txt = re.sub(ur"[,.]", u" ", txt,  re.UNICODE)
    txt = re.sub(ur"\s+", u" ", txt, re.UNICODE)
    txt = re.sub(ur"cali[^\s]+", u"ca", txt, re.UNICODE | re.IGNORECASE)
    return txt
#print geotags[geotags.in_la][['user_id','user_location']].drop_duplicates().user_location.map(normalizer).value_counts().index.tolist()

# get places
with con.cursor() as cur:
    cur.execute("SELECT user_id, place, coded_lat, coded_long FROM users "
                "JOIN (SELECT place, coded_lat, coded_long FROM places) AS places USING (place)")
    places_df = pd.DataFrame(cur.fetchall(), columns = [c[0] for c in cur.description])
places_df['places_norm'] = places_df.place.map(lambda x: x.decode('utf-8')).map(normalizer)
places_df = places_df.merge(pd.DataFrame(geotags[geotags.in_la][['user_id','user_location']]. \
                                         drop_duplicates().user_location.map(lambda x: x.decode('utf-8')).
                                         map(normalizer)),
               left_on='places_norm',right_on='user_location')

places_df['in_la'] = places_df['coded_long'].map(lambda x: (x > la[0]) & (x < la[2])) & \
                     places_df['coded_lat'].map(lambda x: (x > la[1]) & (x < la[3])) | \
                    (places_df.places_norm == "ca") | (places_df.places_norm == "la") | \
                    (places_df.places_norm == 'san diego ca')
        
# limit to ppl who tweet in LA
geotags_la = geotags[geotags.in_la].merge(places_df, on='user_id', suffixes = ('_tweet','_user')).drop_duplicates()

with con.cursor() as cur:
    cur.execute("SELECT tweet_id, text, user_id, tokens FROM tweets")
    tweets_df = pd.DataFrame(cur.fetchall(), columns=[c[0] for c in cur.description])
la_tourist_tweets = geotags_la[geotags_la.in_la_user==False].merge(tweets_df, on='tweet_id')
la_native_tweets = geotags_la[geotags_la.in_la_user==True].merge(tweets_df, on='tweet_id')

# add tweet text
geotags_la = geotags_la.merge(tweets_df, on='tweet_id', how='left')


<connection object at 0x7f4da9ecdd70; dsn: 'dbname=tweets user=patrick', closed: 0>
Number of users:  1486


In [60]:
# language modeling

# TODO: think about URLs, 3-grams, apostrophes

from nltk.util import ngrams
from collections import Counter
import ark_twokenize_py as ark

def normalizer(txt):
    txt = txt.strip()
    txt = txt.lower()
    
    txt = re.sub(ur"[,.]", u" ", txt,  re.UNICODE)
    txt = re.sub(ur"\s+", u" ", txt, re.UNICODE)
    txt = re.sub(ur"cali[^\s]+", u"ca", txt, re.UNICODE | re.IGNORECASE)
    return txt

def add_boundaries(tkns):
    return ["#"] + tkns + ["#"]

def get_ngram_model(stuff, n=2):
    my_ngrams = ngrams([tkn for tkns in map(normalizer, stuff) for tkn in add_boundaries(tkns.split())], n)
    return dict(Counter(my_ngrams))

def ngram_inference(ngram_dict, new_string, n=2):
    """Don't try to scale this above n=2 yet"""
    if n==2:
        ngram_dict[('#','#')] = 0
    num_tokens = np.sum(list(ngram_dict.itervalues()))
    
    
    # tokenize new string
    new_tokens = ark.tokenize(normalizer(new_string))
    if n==2:
        new_tokens = add_boundaries(new_tokens)
    
    # get bigrams
    new_ngrams = ngrams(new_tokens, n)
    # get counts
    new_counts = { k : ngram_dict.get(k, 1) for k in new_ngrams }
    #print new_counts
    #print num_tokens
    # counts to frequencies
    new_freqs = [ float(v)/float(num_tokens) for v in new_counts.itervalues() ]
    
    # string probability
    return np.prod(new_freqs)

In [62]:
stuff = [u"hello there okay"]

ngram_dict = get_ngram_model(la_native_tweets.tokens.dropna().map(
            lambda x: x.decode('utf-8')), n=1)
ngram_dict_tour = get_ngram_model(la_tourist_tweets.tokens.dropna().map(
            lambda x: x.decode('utf-8')), n=1)
print ngram_inference(ngram_dict, "etchings of unknown arbiters")
print ngram_inference(ngram_dict_tour, "etchings of unknown arbiters")
print ngram_inference(ngram_dict, "im at hollywood blvd")
print ngram_inference(ngram_dict_tour, "im at hollywood blvd")
print ngram_inference(ngram_dict, "you are so awful")
print ngram_inference(ngram_dict_tour, "you are so awful")

9.88776211808e-26
9.17278857416e-22
9.88776211808e-26
9.17278857416e-22
9.88776211808e-26
9.17278857416e-22


In [68]:
# try some inference
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

training, testing = train_test_split(geotags_la[['in_la_user', 'tokens', 'time_of_day', 'day_of_week']].dropna(), 
                                     random_state=555, train_size=0.8)

training[:,1] = map(lambda x: x.decode('utf-8'), training[:,1])
testing[:,1] = map(lambda x: x.decode('utf-8'), testing[:,1])
#print training[0,0]
# train lg model features
n=1
training_model_tourist = get_ngram_model(training[training[:,0]==False,n])
training_model_native = get_ngram_model(training[training[:,0]==True,n])

# add probabilities from lg models
training_probs_tourist = np.array([ngram_inference(training_model_tourist,x,n) for x in training[:,1]], ndmin=2).T
training_probs_native = np.array([ngram_inference(training_model_native,x,n) for x in training[:,1]], ndmin=2).T
testing_probs_tourist = np.array([ngram_inference(training_model_tourist,x,n) for x in testing[:,1]], ndmin=2).T
testing_probs_native = np.array([ngram_inference(training_model_native,x,n) for x in testing[:,1]], ndmin=2).T

In [69]:
training_more = np.concatenate([training, training_probs_tourist - training_probs_native], axis=1)
testing_more = np.concatenate([testing, testing_probs_tourist - testing_probs_native], axis=1)

# extract and scale predictors
print training_more[:10,2:]
training_scaler = StandardScaler()
training_predictors = training_scaler.fit_transform(training_more[:,2:])
testing_predictors = training_scaler.transform(testing_more[:,2:])


[[3 3 3.3149618698275607e-91]
 [6 3 3.426322408368742e-54]
 [20 0 4.297185071857467e-87]
 [0 2 2.7316502014968842e-21]
 [20 2 9.36053392145069e-75]
 [18 1 1.2134060122376188e-70]
 [7 3 2.557248993155566e-95]
 [4 3 5.570441008648835e-83]
 [23 3 7.22096267951148e-79]
 [18 0 7.463538622167737e-42]]


In [70]:
# some kinda machine learning

# Naive Bayes, Random Forest
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import sklearn.metrics


def try_model(training_predictors, training_response, testing_predictors, testing_response, the_model):
    the_model.fit(training_predictors, training_response)
    classfn = the_model.predict(testing_predictors)
    print "Number correct: ", (classfn==testing_response).sum()
    print "Baseline (training): ",  np.sum(training_response==True) / float(len(training_response))
    print "Percent correct (training): ", (the_model.predict(training_predictors)==
                                           training_response).sum() / float(len(training_response))
    print "Percent correct (testing): ", (classfn==testing_response).sum() / float(len(classfn))
    
    print sklearn.metrics.confusion_matrix(testing_response.astype(int), classfn.astype(int))


# try naid baxes)")
try_model(training_predictors, training[:,0], testing_predictors, testing[:,0], SVC())

Number correct:  1168
Baseline (training):  0.859409594096
Percent correct (training):  0.859409594096
Percent correct (testing):  0.861992619926
[[   0  187]
 [   0 1168]]


In [44]:
testing_predictors[:10,:]

array([[ 0.80503915, -1.66761238],
       [ 0.42626845,  0.54318072],
       [ 0.93129605,  0.54318072],
       [ 1.05755295, -1.66761238],
       [ 0.30001155, -1.66761238],
       [-1.46758504,  0.54318072],
       [ 1.05755295, -1.66761238],
       [-1.21507124,  1.28011175],
       [ 1.05755295, -0.19375032],
       [ 0.67878225,  1.28011175]])